From 3e1fe8a810b0b36aa39a1ba0b897852ad4c56dbd Mon Sep 17 00:00:00 2001 From: igor <igor.markin@vereign.com> Date: Fri, 25 Dec 2020 11:13:09 +0300 Subject: [PATCH] Refactor pruning --- src/HTMLNormalizer/HTMLNormalizer.ts | 63 +++++++----------------- src/HTMLNormalizer/strategies/common.ts | 16 ------ src/HTMLNormalizer/strategies/gmail.ts | 10 +--- src/HTMLNormalizer/strategies/outlook.ts | 63 ++---------------------- 4 files changed, 22 insertions(+), 130 deletions(-) diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts index c40abe5..a137404 100644 --- a/src/HTMLNormalizer/HTMLNormalizer.ts +++ b/src/HTMLNormalizer/HTMLNormalizer.ts @@ -8,28 +8,21 @@ import { import { amendOutlookNodes, cleanupOutlookElementAttributes, - pruneOutlookElement, } from "./strategies/outlook"; import { EMAIL_VENDORS } from "../constants"; import { normalizeTextSpacings } from "../utils"; import { amendGmailNodes, cleanupGMailElementAttributes, - pruneGmailElement, } from "./strategies/gmail"; import { PlainNormalizer } from "../index"; -import { amendNodes } from "./strategies/common"; +import { amendNodes, pruneElement } from "./strategies/common"; const nodesAmendingFunctions = { [EMAIL_VENDORS.GMAIL]: amendGmailNodes, [EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes, }; -const nodesPruningFunctions = { - [EMAIL_VENDORS.GMAIL]: pruneGmailElement, - [EMAIL_VENDORS.OUTLOOK]: pruneOutlookElement, -}; - const attributesCleanupFunctions = { [EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes, [EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes, @@ -39,65 +32,49 @@ export const normalizeVendorHtml = ( document: HTMLDocument, vendor: string ): string => { - const mimeBody = document.body; + /** + * Remove unnecessary nodes + */ + pruneHtmlNode(document.body); amendNodes(document); - const amendNodesFunction = nodesAmendingFunctions[vendor]; if (amendNodesFunction) { amendNodesFunction(document); } - /** - * Remove unnecessary nodes - */ - const elementPruningFunction = nodesPruningFunctions[vendor]; - - if (!elementPruningFunction) { - throw new Error( - `Vendor "${vendor}" is not supported. Please, develop a pruning function for it.` - ); - } - - pruneHtmlNode(document, elementPruningFunction); - /** * Cleanup unnecessary attributes of nodes */ const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor]; if (elementAttributesCleanupFunction) { - cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction); + cleanupHtmlNodeAttributes(document.body, elementAttributesCleanupFunction); } - return printHtmlChildren(mimeBody, 0); + return printHtmlChildren(document.body, 0); }; -export const extractPseudoPlainPart = ( - document: HTMLDocument - /*vendor: string*/ -): string => { +export const extractPseudoPlainPart = (document: HTMLDocument): string => { let normalizedTextContent = PlainNormalizer.normalizePlain( document.body.textContent ); const anchors = document.getElementsByTagName("a"); const images = document.getElementsByTagName("img"); - let meaningfulAttributes = []; + let urls = []; Array.from(anchors).forEach((a) => { - meaningfulAttributes.push(a.getAttribute("href")); + urls.push(a.getAttribute("href")); }); Array.from(images).forEach((img) => { - meaningfulAttributes.push(img.getAttribute("src")); - meaningfulAttributes.push(img.getAttribute("alt")); + urls.push(img.getAttribute("src")); }); - meaningfulAttributes = meaningfulAttributes.filter((attr) => !!attr).sort(); + urls = urls.filter((attr) => !!attr).sort(); - normalizedTextContent += meaningfulAttributes.join(","); + normalizedTextContent += urls.sort((a, b) => a.localeCompare(b)).join(","); - // console.log(meaningfulAttributes); return normalizedTextContent; }; @@ -183,10 +160,7 @@ export const cleanupHtmlNodeAttributes = ( } }; -export const pruneHtmlNode = ( - node: Node, - pruneElement: (element: HTMLElement) => boolean -): boolean => { +export const pruneHtmlNode = (node: Node): boolean => { let toBeRemoved = false; switch (node.nodeType) { @@ -195,16 +169,14 @@ export const pruneHtmlNode = ( toBeRemoved = true; break; case TEXT_NODE: { - const trimmedText = normalizeTextSpacings(node.textContent); - if (trimmedText === "") { + if (node.textContent === "") { toBeRemoved = true; - } else { - node.textContent = trimmedText; } break; } case ELEMENT_NODE: toBeRemoved = pruneElement(node as HTMLElement); + break; } if (toBeRemoved) { @@ -213,9 +185,8 @@ export const pruneHtmlNode = ( const childrenToRemove = []; let child = node.firstChild; - while (child) { - pruneHtmlNode(child, pruneElement) && childrenToRemove.push(child); + pruneHtmlNode(child) && childrenToRemove.push(child); child = child.nextSibling; } diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts index 3bef1b5..e41656a 100644 --- a/src/HTMLNormalizer/strategies/common.ts +++ b/src/HTMLNormalizer/strategies/common.ts @@ -1,8 +1,6 @@ // this is a Node module. require is a must to work across different envs const URL = require("url-parse"); -const DUMMY_QR_CODE_ID = "dummyQrCode"; - export const ELEMENT_TYPES_TO_REMOVE = { br: true, hr: true, @@ -54,25 +52,11 @@ export const amendNodes = (document: HTMLDocument): void => { } }; -/** - * Removes dummy QR code from HTML - * @param element - */ -const isDummyQrCode = (element: HTMLElement): boolean => { - if (element.id === DUMMY_QR_CODE_ID) { - return true; - } -}; - /** * Decides whether node should be removed * @param element */ export const pruneElement = (element: HTMLElement): boolean => { - if (isDummyQrCode(element)) { - return true; - } - return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName.toLowerCase()]; }; diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts index 55f4f9f..efd990e 100644 --- a/src/HTMLNormalizer/strategies/gmail.ts +++ b/src/HTMLNormalizer/strategies/gmail.ts @@ -1,14 +1,6 @@ -import { - ATTRIBUTES_TO_KEEP, - cloneAnchorFromPane, - pruneElement, -} from "./common"; +import { ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common"; import { ELEMENT_NODE } from "../../constants"; -export const pruneGmailElement = (element: HTMLElement): boolean => { - return pruneElement(element); -}; - const qrCodeContainerIds = { vereignWrapperLink: 1 }; const removeQrCodeNodes = (document: HTMLDocument) => { const remove = (node: Element) => { diff --git a/src/HTMLNormalizer/strategies/outlook.ts b/src/HTMLNormalizer/strategies/outlook.ts index 25b8049..4584d7f 100644 --- a/src/HTMLNormalizer/strategies/outlook.ts +++ b/src/HTMLNormalizer/strategies/outlook.ts @@ -1,25 +1,7 @@ import { ELEMENT_NODE, TEXT_NODE } from "../../constants"; -import { - ATTRIBUTES_TO_KEEP, - cloneAnchorFromPane, - pruneElement, -} from "./common"; +import { ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common"; import { unwindTags } from "./nodesAmendingFunctions"; -/** - * Returns true if element should be completely removed - * @param element - */ -export const pruneOutlookElement = (element: HTMLElement): boolean => { - if (pruneElement(element)) { - return true; - } - - // Remove Outlook generic <o:*> tags - // return !!element.nodeName.toLowerCase().startsWith("o:"); - return false; -}; - const qrCodeContainerIds = { "test-for-us": 1, }; @@ -65,7 +47,7 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { removeQrCodeNodes(document); /** - * Remove Word o:p paragraphs + * Unwind Word o:p paragraphs */ const ops = document.getElementsByTagName("o:p"); unwindTags(Array.from(ops)); @@ -98,7 +80,7 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { unwindTags(msoNormalParents); /** - * Unwind WordSection tags + * Unwind WordSection1 tags */ const wordSectionWrappers = document.getElementsByClassName("WordSection1"); unwindTags(Array.from(wordSectionWrappers)); @@ -134,45 +116,8 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { /** * Unwind spans, because sometimes Outlook wraps everything into span after sending */ - const spans = document.getElementsByTagName("span"); - - //Sort spans by depth to start unwinding the deepest ones, which does not contain nested spans - const spansDepths: { depth?: Array<Node> } = {}; - Array.from(spans).forEach((span: Node) => { - let descendant = span; - let parent = descendant.parentNode; - - let depth = 0; - while (parent && descendant !== parent) { - descendant = parent; - parent = descendant.parentNode; - depth++; - } - - if (!spansDepths[depth]) { - spansDepths[depth] = []; - } - - spansDepths[depth].push(span); - }); - - Object.keys(spansDepths) - .sort((a, b) => parseInt(b) - parseInt(a)) - .forEach((depth) => { - spansDepths[depth].forEach((span) => { - let child = span.firstChild; - const parent = span.parentNode; - - while (child) { - parent.insertBefore(child.cloneNode(true), span); - - child = child.nextSibling; - } - - span.parentNode.removeChild(span); - }); - }); + unwindTags(Array.from(spans)); }; export const cleanupOutlookElementAttributes = (element: HTMLElement): void => { -- GitLab