diff --git a/__tests__/files/outlook-gmail/chrome-chrome/README.md b/__tests__/files/outlook-gmail/chrome-chrome/README.md index a34356d59c7979e10df3c33e8b62d039a909dd7d..04bd6e44dec1495ef53c04eb0d78aa4cef356ed2 100644 --- a/__tests__/files/outlook-gmail/chrome-chrome/README.md +++ b/__tests__/files/outlook-gmail/chrome-chrome/README.md @@ -15,5 +15,5 @@ | 13 | 4 replies with test case 05 | ok | ok | ok | | 14 | 4 replies with test case 06 | ok | ok | ok | | 15 | 4 replies with test case 07 (with both OneDrive and Google Drive attachments) | ok | ok | ok | -| 16 | 4 replies with test case 08 (with both OneDrive and Google Drive attachments) | fail | fail | fail | +| 16 | 4 replies with test case 08 (with both OneDrive and Google Drive attachments) | ok | ok | ok | | 17 | Multiple forwarded emails using cases 01-08:<br><br>E.g. <br>- User A sends "case 01" to User B<br>- User B forwards "case 02" to User A<br>- User A forwards "case 03" to User B<br>...<br>- User B forwards "case 08" to User A<br>- User A completes circle by forwarding "case 01" to User B | ok | ok | ok | diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts index 601e7ee4ca938f9034d2fe923ed96efe312c075a..b58b574af7b95ece641c341d749d1fc80afb7fae 100644 --- a/src/HTMLNormalizer/HTMLNormalizer.ts +++ b/src/HTMLNormalizer/HTMLNormalizer.ts @@ -5,128 +5,125 @@ import { ELEMENT_NODE, TEXT_NODE, } from "../constants"; -import { - amendOutlookNodes, - cleanupOutlookElementAttributes, - printOutlookElement, - pruneOutlookElement, -} from "./strategies/outlook"; +import { normalizeOutlookDocument } from "./strategies/outlook"; import { EMAIL_VENDORS } from "../constants"; -import { normalizeTextSpacings, removeSpacesAndLinebreaks } from "../utils"; -import { - amendGmailNodes, - cleanupGMailElementAttributes, - pruneGmailElement, -} from "./strategies/gmail"; +import { normalizeTextSpacings } from "../utils"; +import { normalizeGmailDocument } from "./strategies/gmail"; 
import { PlainNormalizer } from "../index"; -import { amendNodes } from "./strategies/common"; - -const nodesAmendingFunctions = { - [EMAIL_VENDORS.GMAIL]: amendGmailNodes, - [EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes, -}; - -const nodesPruningFunctions = { - [EMAIL_VENDORS.GMAIL]: pruneGmailElement, - [EMAIL_VENDORS.OUTLOOK]: pruneOutlookElement, -}; - -const attributesCleanupFunctions = { - [EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes, - [EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes, -}; - -const vendorPrintingFunctions = { - [EMAIL_VENDORS.OUTLOOK]: printOutlookElement, +import { + ATTRIBUTES_TO_KEEP, + normalizeDocumentCommon, + pruneElement, +} from "./strategies/common"; + +const documentNormalizationFunctions = { + [EMAIL_VENDORS.GMAIL]: normalizeGmailDocument, + [EMAIL_VENDORS.OUTLOOK]: normalizeOutlookDocument, }; export const normalizeVendorHtml = ( document: HTMLDocument, vendor: string ): string => { - const mimeBody = document.body; - - amendNodes(document); - - const amendNodesFunction = nodesAmendingFunctions[vendor]; - if (amendNodesFunction) { - amendNodesFunction(document); - } - /** * Remove unnecessary nodes */ - const elementPruningFunction = nodesPruningFunctions[vendor]; - - if (!elementPruningFunction) { - throw new Error( - `Vendor "${vendor}" is not supported. 
Please, develop a pruning function for it.` - ); - } - - pruneHtmlNode(document, elementPruningFunction); + pruneHtmlNode(document.body); /** - * Cleanup unnecessary attributes of nodes + * Apply document normalisations */ - const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor]; - - if (elementAttributesCleanupFunction) { - cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction); + normalizeDocumentCommon(document.body); + const normalizeDocument = documentNormalizationFunctions[vendor]; + if (normalizeDocument) { + normalizeDocument(document); } /** - * Print nodes + * Final printout */ - const vendorPrintFunction = vendorPrintingFunctions[vendor]; - - return printHtmlChildren(mimeBody, vendorPrintFunction, 0); + return printHtmlChildren(document.body, 0); }; -export const extractPseudoPlainPart = ( - document: HTMLDocument - /*vendor: string*/ -): string => { +export const extractPseudoPlainPart = (document: HTMLDocument): string => { let normalizedTextContent = PlainNormalizer.normalizePlain( document.body.textContent ); const anchors = document.getElementsByTagName("a"); const images = document.getElementsByTagName("img"); - let meaningfulAttributes = []; + let urls = []; Array.from(anchors).forEach((a) => { - meaningfulAttributes.push(a.getAttribute("href")); + urls.push(a.getAttribute("href")); }); Array.from(images).forEach((img) => { - meaningfulAttributes.push(img.getAttribute("src")); - meaningfulAttributes.push(img.getAttribute("alt")); + urls.push(img.getAttribute("src")); }); - meaningfulAttributes = meaningfulAttributes.filter((attr) => !!attr).sort(); + urls = urls.filter((attr) => !!attr).sort(); - normalizedTextContent += meaningfulAttributes.join(","); + normalizedTextContent += urls.join(","); - // console.log(meaningfulAttributes); return normalizedTextContent; }; -export const printHtmlChildren = ( +export const pruneHtmlNode = (node: Node): boolean => { + let toBeRemoved = false; + + switch (node.nodeType) { 
+ case COMMENT_NODE: + case DOCUMENT_TYPE_NODE: + toBeRemoved = true; + break; + case ELEMENT_NODE: + toBeRemoved = pruneElement(node as HTMLElement); + break; + } + + if (toBeRemoved) { + return true; + } + + const childrenToRemove = []; + let child = node.firstChild; + while (child) { + pruneHtmlNode(child) && childrenToRemove.push(child); + child = child.nextSibling; + } + + childrenToRemove.forEach((child) => node.removeChild(child)); + + return false; +}; + +export const normalizeBodyAttributes = ( node: Node, - printFunction: (node: Node) => string, - depth: number -): string => { + cleanupElementAttributes: (element: HTMLElement) => void +): void => { + if (node.nodeType === ELEMENT_NODE) { + cleanupElementAttributes(node as HTMLElement); + } + + let child = node.firstChild; + while (child) { + normalizeBodyAttributes(child as HTMLElement, cleanupElementAttributes); + child = child.nextSibling; + } +}; + +export const printHtmlChildren = (node: Node, depth: number): string => { let child = node.firstChild; if (!child) { return ""; } if (child == node.lastChild && child.nodeType == TEXT_NODE) { - return printHtmlNode(child, printFunction, depth); + return printHtmlNode(child, depth); } else { let result = ""; while (child) { - result = result.concat(printHtmlNode(child, printFunction, depth)); + result = result.concat(printHtmlNode(child, depth)); child = child.nextSibling; } @@ -134,23 +131,12 @@ export const printHtmlChildren = ( } }; -export const printHtmlNode = ( - node: Node, - printFunction: (node: Node) => string, - depth: number -): string => { +export const printHtmlNode = (node: Node, depth: number): string => { let result = ""; - if (printFunction) { - const customPrintout = printFunction(node); - if (customPrintout) { - return customPrintout; - } - } - switch (node.nodeType) { case TEXT_NODE: { - const text = removeSpacesAndLinebreaks(node.textContent); + const text = normalizeTextSpacings(node.textContent).trim(); if (text.length) { result += 
"<TEXT>"; result += text; @@ -160,21 +146,22 @@ export const printHtmlNode = ( break; } case DOCUMENT_NODE: - result += printHtmlChildren(node, printFunction, depth); + result += printHtmlChildren(node, depth); break; case ELEMENT_NODE: result += "<" + node.nodeName; Array.from((node as HTMLElement).attributes) + .filter((a) => ATTRIBUTES_TO_KEEP[a.name]) .sort((a, b) => a.name.localeCompare(b.name)) .forEach((attribute) => { result += ` ${attribute.name}`; if (attribute.value) { - result += `="${escapeHtmlString(attribute.value)}"`; + result += `="${attribute.value}"`; } }); if (node.firstChild) { - const printout = printHtmlChildren(node, printFunction, depth + 1); + const printout = printHtmlChildren(node, depth + 1); if (printout.trim().length === 0) { result += "/>"; } else { @@ -192,106 +179,3 @@ export const printHtmlNode = ( return result; }; - -export const cleanupHtmlNodeAttributes = ( - node: Node, - cleanupElementAttributes: (element: HTMLElement) => void -): void => { - if (node.nodeType === ELEMENT_NODE) { - cleanupElementAttributes(node as HTMLElement); - } - - let child = node.firstChild; - while (child) { - cleanupHtmlNodeAttributes(child as HTMLElement, cleanupElementAttributes); - child = child.nextSibling; - } -}; - -export const pruneHtmlNode = ( - node: Node, - pruneElement: (element: HTMLElement) => boolean -): boolean => { - let toBeRemoved = false; - - switch (node.nodeType) { - case COMMENT_NODE: - case DOCUMENT_TYPE_NODE: - toBeRemoved = true; - break; - case TEXT_NODE: { - const trimmedText = normalizeTextSpacings(node.textContent); - if (trimmedText === "") { - toBeRemoved = true; - } else { - node.textContent = trimmedText; - } - break; - } - case ELEMENT_NODE: - toBeRemoved = pruneElement(node as HTMLElement); - } - - if (toBeRemoved) { - return true; - } - - const childrenToRemove = []; - let child = node.firstChild; - - while (child) { - pruneHtmlNode(child, pruneElement) && childrenToRemove.push(child); - child = child.nextSibling; 
- } - - childrenToRemove.forEach((child) => node.removeChild(child)); - - return false; -}; - -export const escapeHtmlString = (string: string): string => { - const matchHtmlRegExp = /["'&<>]/; - - const str = "" + string; - const match = matchHtmlRegExp.exec(str); - - if (!match) { - return str; - } - - let escape; - let html = ""; - let index = 0; - let lastIndex = 0; - - for (index = match.index; index < str.length; index++) { - switch (str.charCodeAt(index)) { - case 34: // " - escape = "&quot;"; - break; - case 38: // & - escape = "&amp;"; - break; - case 39: // ' - escape = "&#39;"; - break; - case 60: // < - escape = "&lt;"; - break; - case 62: // > - escape = "&gt;"; - break; - default: - continue; - } - - if (lastIndex !== index) { - html += str.substring(lastIndex, index); - } - - lastIndex = index + 1; - html += escape; - } - - return lastIndex !== index ? html + str.substring(lastIndex, index) : html; -}; diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts index 3bef1b581e2afebd2736556eaa9fb3b373d426f2..3c3a6131cf82a39d0b8325d597546b10f488cafe 100644 --- a/src/HTMLNormalizer/strategies/common.ts +++ b/src/HTMLNormalizer/strategies/common.ts @@ -1,13 +1,11 @@ // this is a Node module.
require is a must to work across different envs const URL = require("url-parse"); -const DUMMY_QR_CODE_ID = "dummyQrCode"; - export const ELEMENT_TYPES_TO_REMOVE = { - br: true, - hr: true, - use: true, - svg: true, + BR: true, + HR: true, + USE: true, + SVG: true, }; export const ATTRIBUTES_TO_KEEP = { @@ -20,11 +18,11 @@ export const ATTRIBUTES_TO_KEEP = { value: true, }; -export const amendNodes = (document: HTMLDocument): void => { +export const normalizeDocumentCommon = (body: HTMLElement): void => { /** * Unwind Outlook safelink wrappers */ - const anchors = document.getElementsByTagName("a"); + const anchors = body.getElementsByTagName("a"); for (const anchor of anchors) { const url = new URL(anchor.getAttribute("href"), true); @@ -36,7 +34,7 @@ export const amendNodes = (document: HTMLDocument): void => { /** * Unwind Gmail "googleusercontent" wrappers */ - const images = document.getElementsByTagName("img"); + const images = body.getElementsByTagName("img"); for (const image of images) { let url; try { @@ -54,26 +52,12 @@ export const amendNodes = (document: HTMLDocument): void => { } }; -/** - * Removes dummy QR code from HTML - * @param element - */ -const isDummyQrCode = (element: HTMLElement): boolean => { - if (element.id === DUMMY_QR_CODE_ID) { - return true; - } -}; - /** * Decides whether node should be removed * @param element */ export const pruneElement = (element: HTMLElement): boolean => { - if (isDummyQrCode(element)) { - return true; - } - - return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName.toLowerCase()]; + return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName]; }; export const cloneAnchorFromPane = ( diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts index 55f4f9f1ed63dbee1e398f988911f0a8a6f2ae3a..058b35dde75f92968a16e4dfe6b103cde871aca2 100644 --- a/src/HTMLNormalizer/strategies/gmail.ts +++ b/src/HTMLNormalizer/strategies/gmail.ts @@ -1,14 +1,6 @@ -import { - ATTRIBUTES_TO_KEEP, - 
cloneAnchorFromPane, - pruneElement, -} from "./common"; +import { cloneAnchorFromPane } from "./common"; import { ELEMENT_NODE } from "../../constants"; -export const pruneGmailElement = (element: HTMLElement): boolean => { - return pruneElement(element); -}; - const qrCodeContainerIds = { vereignWrapperLink: 1 }; const removeQrCodeNodes = (document: HTMLDocument) => { const remove = (node: Element) => { @@ -44,15 +36,12 @@ const removeQrCodeNodes = (document: HTMLDocument) => { ); }; -export const amendGmailNodes = (document: HTMLDocument): void => { - // unwindTags(document, "span"); - +export const normalizeGmailDocument = (document: HTMLDocument): void => { removeQrCodeNodes(document); /** - * Look for attachments panes and remove everything but links + * Look for attachments panes and extract <a> tags from them */ - const attachmentsPanes = Array.from( document.getElementsByClassName("gmail_chip") ); @@ -68,21 +57,3 @@ export const amendGmailNodes = (document: HTMLDocument): void => { pane.parentNode.removeChild(pane); }); }; - -export const cleanupGMailElementAttributes = (element: HTMLElement): void => { - if (element.attributes.length > 0) { - for (const attribute of element.attributes) { - if (attribute.name === "data-surl") { - element.setAttribute("src", attribute.value); - } - } - - for (let i = 0; i < element.attributes.length; i++) { - const attribute = element.attributes[i]; - if (!ATTRIBUTES_TO_KEEP[attribute.name]) { - element.removeAttribute(attribute.name); - i--; - } - } - } -}; diff --git a/src/HTMLNormalizer/strategies/outlook.ts b/src/HTMLNormalizer/strategies/outlook.ts index 835481239abceaa49fca82d056acc396ad48fd02..e28649415ef2ad73984b756efd264db986f60d81 100644 --- a/src/HTMLNormalizer/strategies/outlook.ts +++ b/src/HTMLNormalizer/strategies/outlook.ts @@ -1,35 +1,7 @@ -// TODO: Move this logic to amendOutlookNodes -import { printHtmlChildren } from "../HTMLNormalizer"; import { ELEMENT_NODE, TEXT_NODE } from "../../constants"; -import 
{ - ATTRIBUTES_TO_KEEP, - cloneAnchorFromPane, - pruneElement, -} from "./common"; +import { cloneAnchorFromPane } from "./common"; import { unwindTags } from "./nodesAmendingFunctions"; -export const printOutlookElement = (node: Node): string => { - if (node.nodeType === ELEMENT_NODE) { - if ((node as HTMLElement).classList.contains("WordSection1")) { - return printHtmlChildren(node, null, 0); - } - } -}; - -/** - * Returns true if element should be completely removed - * @param element - */ -export const pruneOutlookElement = (element: HTMLElement): boolean => { - if (pruneElement(element)) { - return true; - } - - // Remove Outlook generic <o:*> tags - // return !!element.nodeName.toLowerCase().startsWith("o:"); - return false; -}; - const qrCodeContainerIds = { "test-for-us": 1, }; @@ -67,15 +39,14 @@ const removeQrCodeNodes = (document: HTMLDocument) => { ); }; -export const amendOutlookNodes = (document: HTMLDocument): void => { +export const normalizeOutlookDocument = (document: HTMLDocument): void => { /** * Remove QR code entries */ - removeQrCodeNodes(document); /** - * Remove Word o:p paragraphs + * Unwind Word o:p paragraphs */ const ops = document.getElementsByTagName("o:p"); unwindTags(Array.from(ops)); @@ -104,11 +75,10 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { const msoNormalParents = Array.from(msoNormalWrappers) .map((node) => node.parentNode) .filter((node, index, self) => self.indexOf(node) === index); - unwindTags(msoNormalParents); /** - * Unwind WordSection tags + * Unwind WordSection1 tags */ const wordSectionWrappers = document.getElementsByClassName("WordSection1"); unwindTags(Array.from(wordSectionWrappers)); @@ -144,65 +114,6 @@ export const amendOutlookNodes = (document: HTMLDocument): void => { /** * Unwind spans, because sometimes Outlook wraps everything into span after sending */ - const spans = document.getElementsByTagName("span"); - - //Sort spans by depth to start unwinding the deepest ones, which 
does not contain nested spans - const spansDepths: { depth?: Array<Node> } = {}; - Array.from(spans).forEach((span: Node) => { - let descendant = span; - let parent = descendant.parentNode; - - let depth = 0; - while (parent && descendant !== parent) { - descendant = parent; - parent = descendant.parentNode; - depth++; - } - - if (!spansDepths[depth]) { - spansDepths[depth] = []; - } - - spansDepths[depth].push(span); - }); - - Object.keys(spansDepths) - .sort((a, b) => parseInt(b) - parseInt(a)) - .forEach((depth) => { - spansDepths[depth].forEach((span) => { - let child = span.firstChild; - const parent = span.parentNode; - - while (child) { - parent.insertBefore(child.cloneNode(true), span); - - child = child.nextSibling; - } - - span.parentNode.removeChild(span); - }); - }); -}; - -export const cleanupOutlookElementAttributes = (element: HTMLElement): void => { - if (element.attributes.length > 0) { - for (const attribute of element.attributes) { - let valueSplit = attribute.value.split(" "); - - valueSplit = valueSplit.map((value) => - value.startsWith("x_") ? value.replace("x_", "") : value - ); - - element.setAttribute(attribute.name, valueSplit.join(" ")); - } - - for (let i = 0; i < element.attributes.length; i++) { - const attribute = element.attributes[i]; - if (!ATTRIBUTES_TO_KEEP[attribute.name]) { - element.removeAttribute(attribute.name); - i--; - } - } - } + unwindTags(Array.from(spans)); }; diff --git a/src/utils.ts b/src/utils.ts index 3723902a1ebaa16ca517bffe227b0f90eafd1f78..6da5b6a8189e393345727710d897ce49454e03c1 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,9 +1,3 @@ -export const removeSpacesAndLinebreaks = (s: string): string => { - const removeSymbols = new RegExp(/[\r\n\v\s\u200B]+/g); - - return s.replace(removeSymbols, "").trim(); -}; - export const normalizeTextSpacings = (s: string): string => { return s.replace(/[\r\n\v\s\u00A0]+/g, " "); };