From ebfb5bc14090ff370411eb2194cbb0a4430b7ad0 Mon Sep 17 00:00:00 2001 From: igor <igor.markin@vereign.com> Date: Fri, 25 Dec 2020 12:04:26 +0300 Subject: [PATCH] Optimize attributes normalisation --- .../outlook-gmail/chrome-chrome/README.md | 2 +- src/HTMLNormalizer/HTMLNormalizer.ts | 194 ++++++------------ src/HTMLNormalizer/strategies/common.ts | 16 +- src/HTMLNormalizer/strategies/gmail.ts | 27 +-- src/HTMLNormalizer/strategies/outlook.ts | 28 +-- 5 files changed, 80 insertions(+), 187 deletions(-) diff --git a/__tests__/files/outlook-gmail/chrome-chrome/README.md b/__tests__/files/outlook-gmail/chrome-chrome/README.md index a34356d..04bd6e4 100644 --- a/__tests__/files/outlook-gmail/chrome-chrome/README.md +++ b/__tests__/files/outlook-gmail/chrome-chrome/README.md @@ -15,5 +15,5 @@ | 13 | 4 replies with test case 05 | ok | ok | ok | | 14 | 4 replies with test case 06 | ok | ok | ok | | 15 | 4 replies with test case 07 (with both OneDrive and Google Drive attachments) | ok | ok | ok | -| 16 | 4 replies with test case 08 (with both OneDrive and Google Drive attachments) | fail | fail | fail | +| 16 | 4 replies with test case 08 (with both OneDrive and Google Drive attachments) | ok | ok | ok | | 17 | Multiple forwarded emails using cases 01-08:<br><br>E.g. 
<br>- User A sends "case 01" to User B<br>- User B forwards "case 02" to User A<br>- User A forwards "case 03" to User B<br>...<br>- User B forwards "case 08" to User A<br>- User A completes circle by forwarding "case 01" to User B | ok | ok | ok | diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts index a137404..b58b574 100644 --- a/src/HTMLNormalizer/HTMLNormalizer.ts +++ b/src/HTMLNormalizer/HTMLNormalizer.ts @@ -5,27 +5,20 @@ import { ELEMENT_NODE, TEXT_NODE, } from "../constants"; -import { - amendOutlookNodes, - cleanupOutlookElementAttributes, -} from "./strategies/outlook"; +import { normalizeOutlookDocument } from "./strategies/outlook"; import { EMAIL_VENDORS } from "../constants"; import { normalizeTextSpacings } from "../utils"; -import { - amendGmailNodes, - cleanupGMailElementAttributes, -} from "./strategies/gmail"; +import { normalizeGmailDocument } from "./strategies/gmail"; import { PlainNormalizer } from "../index"; -import { amendNodes, pruneElement } from "./strategies/common"; - -const nodesAmendingFunctions = { - [EMAIL_VENDORS.GMAIL]: amendGmailNodes, - [EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes, -}; - -const attributesCleanupFunctions = { - [EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes, - [EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes, +import { + ATTRIBUTES_TO_KEEP, + normalizeDocumentCommon, + pruneElement, +} from "./strategies/common"; + +const documentNormalizationFunctions = { + [EMAIL_VENDORS.GMAIL]: normalizeGmailDocument, + [EMAIL_VENDORS.OUTLOOK]: normalizeOutlookDocument, }; export const normalizeVendorHtml = ( @@ -37,21 +30,18 @@ export const normalizeVendorHtml = ( */ pruneHtmlNode(document.body); - amendNodes(document); - const amendNodesFunction = nodesAmendingFunctions[vendor]; - if (amendNodesFunction) { - amendNodesFunction(document); - } - /** - * Cleanup unnecessary attributes of nodes + * Apply document normalisations */ - const elementAttributesCleanupFunction = 
attributesCleanupFunctions[vendor]; - - if (elementAttributesCleanupFunction) { - cleanupHtmlNodeAttributes(document.body, elementAttributesCleanupFunction); + normalizeDocumentCommon(document.body); + const normalizeDocument = documentNormalizationFunctions[vendor]; + if (normalizeDocument) { + normalizeDocument(document); } + /** + * Final printout + */ return printHtmlChildren(document.body, 0); }; @@ -73,11 +63,55 @@ export const extractPseudoPlainPart = (document: HTMLDocument): string => { urls = urls.filter((attr) => !!attr).sort(); - normalizedTextContent += urls.sort((a, b) => a.localeCompare(b)).join(","); + normalizedTextContent += urls.join(","); return normalizedTextContent; }; +export const pruneHtmlNode = (node: Node): boolean => { + let toBeRemoved = false; + + switch (node.nodeType) { + case COMMENT_NODE: + case DOCUMENT_TYPE_NODE: + toBeRemoved = true; + break; + case ELEMENT_NODE: + toBeRemoved = pruneElement(node as HTMLElement); + break; + } + + if (toBeRemoved) { + return true; + } + + const childrenToRemove = []; + let child = node.firstChild; + while (child) { + pruneHtmlNode(child) && childrenToRemove.push(child); + child = child.nextSibling; + } + + childrenToRemove.forEach((child) => node.removeChild(child)); + + return false; +}; + +export const normalizeBodyAttributes = ( + node: Node, + cleanupElementAttributes: (element: HTMLElement) => void +): void => { + if (node.nodeType === ELEMENT_NODE) { + cleanupElementAttributes(node as HTMLElement); + } + + let child = node.firstChild; + while (child) { + normalizeBodyAttributes(child as HTMLElement, cleanupElementAttributes); + child = child.nextSibling; + } +}; + export const printHtmlChildren = (node: Node, depth: number): string => { let child = node.firstChild; if (!child) { @@ -117,11 +151,12 @@ export const printHtmlNode = (node: Node, depth: number): string => { case ELEMENT_NODE: result += "<" + node.nodeName; Array.from((node as HTMLElement).attributes) + .filter((a) => 
ATTRIBUTES_TO_KEEP[a.name]) .sort((a, b) => a.name.localeCompare(b.name)) .forEach((attribute) => { result += ` ${attribute.name}`; if (attribute.value) { - result += `="${escapeHtmlString(attribute.value)}"`; + result += `="${attribute.value}"`; } }); @@ -144,100 +179,3 @@ export const printHtmlNode = (node: Node, depth: number): string => { return result; }; - -export const cleanupHtmlNodeAttributes = ( - node: Node, - cleanupElementAttributes: (element: HTMLElement) => void -): void => { - if (node.nodeType === ELEMENT_NODE) { - cleanupElementAttributes(node as HTMLElement); - } - - let child = node.firstChild; - while (child) { - cleanupHtmlNodeAttributes(child as HTMLElement, cleanupElementAttributes); - child = child.nextSibling; - } -}; - -export const pruneHtmlNode = (node: Node): boolean => { - let toBeRemoved = false; - - switch (node.nodeType) { - case COMMENT_NODE: - case DOCUMENT_TYPE_NODE: - toBeRemoved = true; - break; - case TEXT_NODE: { - if (node.textContent === "") { - toBeRemoved = true; - } - break; - } - case ELEMENT_NODE: - toBeRemoved = pruneElement(node as HTMLElement); - break; - } - - if (toBeRemoved) { - return true; - } - - const childrenToRemove = []; - let child = node.firstChild; - while (child) { - pruneHtmlNode(child) && childrenToRemove.push(child); - child = child.nextSibling; - } - - childrenToRemove.forEach((child) => node.removeChild(child)); - - return false; -}; - -export const escapeHtmlString = (string: string): string => { - const matchHtmlRegExp = /["'&<>]/; - - const str = "" + string; - const match = matchHtmlRegExp.exec(str); - - if (!match) { - return str; - } - - let escape; - let html = ""; - let index; - let lastIndex = 0; - - for (index = match.index; index < str.length; index++) { - switch (str.charCodeAt(index)) { - case 34: // " - escape = "&quot;"; - break; - case 38: // & - escape = "&amp;"; - break; - case 39: // ' - escape = "&#39;"; - break; - case 60: // < - escape = "&lt;"; - break; - case 62: // > - escape = "&gt;"; - 
break; - default: - continue; - } - - if (lastIndex !== index) { - html += str.substring(lastIndex, index); - } - - lastIndex = index + 1; - html += escape; - } - - return lastIndex !== index ? html + str.substring(lastIndex, index) : html; -}; diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts index e41656a..3c3a613 100644 --- a/src/HTMLNormalizer/strategies/common.ts +++ b/src/HTMLNormalizer/strategies/common.ts @@ -2,10 +2,10 @@ const URL = require("url-parse"); export const ELEMENT_TYPES_TO_REMOVE = { - br: true, - hr: true, - use: true, - svg: true, + BR: true, + HR: true, + USE: true, + SVG: true, }; export const ATTRIBUTES_TO_KEEP = { @@ -18,11 +18,11 @@ export const ATTRIBUTES_TO_KEEP = { value: true, }; -export const amendNodes = (document: HTMLDocument): void => { +export const normalizeDocumentCommon = (body: HTMLElement): void => { /** * Unwind Outlook safelink wrappers */ - const anchors = document.getElementsByTagName("a"); + const anchors = body.getElementsByTagName("a"); for (const anchor of anchors) { const url = new URL(anchor.getAttribute("href"), true); @@ -34,7 +34,7 @@ export const amendNodes = (document: HTMLDocument): void => { /** * Unwind Gmail "googleusercontent" wrappers */ - const images = document.getElementsByTagName("img"); + const images = body.getElementsByTagName("img"); for (const image of images) { let url; try { @@ -57,7 +57,7 @@ export const amendNodes = (document: HTMLDocument): void => { * @param element */ export const pruneElement = (element: HTMLElement): boolean => { - return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName.toLowerCase()]; + return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName]; }; export const cloneAnchorFromPane = ( diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts index efd990e..058b35d 100644 --- a/src/HTMLNormalizer/strategies/gmail.ts +++ b/src/HTMLNormalizer/strategies/gmail.ts @@ -1,4 +1,4 @@ -import { 
ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common"; +import { cloneAnchorFromPane } from "./common"; import { ELEMENT_NODE } from "../../constants"; const qrCodeContainerIds = { vereignWrapperLink: 1 }; @@ -36,15 +36,12 @@ const removeQrCodeNodes = (document: HTMLDocument) => { ); }; -export const amendGmailNodes = (document: HTMLDocument): void => { - // unwindTags(document, "span"); - +export const normalizeGmailDocument = (document: HTMLDocument): void => { removeQrCodeNodes(document); /** - * Look for attachments panes and remove everything but links + * Look for attachments panes and extract <a> tags from them */ - const attachmentsPanes = Array.from( document.getElementsByClassName("gmail_chip") ); @@ -60,21 +57,3 @@ export const amendGmailNodes = (document: HTMLDocument): void => { pane.parentNode.removeChild(pane); }); }; - -export const cleanupGMailElementAttributes = (element: HTMLElement): void => { - if (element.attributes.length > 0) { - for (const attribute of element.attributes) { - if (attribute.name === "data-surl") { - element.setAttribute("src", attribute.value); - } - } - - for (let i = 0; i < element.attributes.length; i++) { - const attribute = element.attributes[i]; - if (!ATTRIBUTES_TO_KEEP[attribute.name]) { - element.removeAttribute(attribute.name); - i--; - } - } - } -}; diff --git a/src/HTMLNormalizer/strategies/outlook.ts b/src/HTMLNormalizer/strategies/outlook.ts index 4584d7f..e286494 100644 --- a/src/HTMLNormalizer/strategies/outlook.ts +++ b/src/HTMLNormalizer/strategies/outlook.ts @@ -1,5 +1,5 @@ import { ELEMENT_NODE, TEXT_NODE } from "../../constants"; -import { ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common"; +import { cloneAnchorFromPane } from "./common"; import { unwindTags } from "./nodesAmendingFunctions"; const qrCodeContainerIds = { @@ -39,11 +39,10 @@ const removeQrCodeNodes = (document: HTMLDocument) => { ); }; -export const amendOutlookNodes = (document: HTMLDocument): void => { +export const 
normalizeOutlookDocument = (document: HTMLDocument): void => { /** * Remove QR code entries */ - removeQrCodeNodes(document); /** @@ -76,7 +75,6 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { const msoNormalParents = Array.from(msoNormalWrappers) .map((node) => node.parentNode) .filter((node, index, self) => self.indexOf(node) === index); - unwindTags(msoNormalParents); /** @@ -119,25 +117,3 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { const spans = document.getElementsByTagName("span"); unwindTags(Array.from(spans)); }; - -export const cleanupOutlookElementAttributes = (element: HTMLElement): void => { - if (element.attributes.length > 0) { - for (const attribute of element.attributes) { - let valueSplit = attribute.value.split(" "); - - valueSplit = valueSplit.map((value) => - value.startsWith("x_") ? value.replace("x_", "") : value - ); - - element.setAttribute(attribute.name, valueSplit.join(" ")); - } - - for (let i = 0; i < element.attributes.length; i++) { - const attribute = element.attributes[i]; - if (!ATTRIBUTES_TO_KEEP[attribute.name]) { - element.removeAttribute(attribute.name); - i--; - } - } - } -}; -- GitLab