From 52421c70e8bbc04ad61db6f0c6c91e08137874b5 Mon Sep 17 00:00:00 2001
From: igor <igor.markin@vereign.com>
Date: Wed, 25 Nov 2020 15:24:59 +0300
Subject: [PATCH] Implement initial version of HTML normalizer

---
 README.md                                |   2 +-
 package.json                             |   4 +-
 src/HTMLNormalizer/HTMLNormalizer.ts     | 303 +++++++++++++++++++++++
 src/HTMLNormalizer/index.ts              |   3 +
 src/HTMLNormalizer/strategies/common.ts  |  67 +++++
 src/HTMLNormalizer/strategies/gmail.ts   |  56 ++++
 src/HTMLNormalizer/strategies/outlook.ts | 113 +++++++++
 src/constants.ts                         |  10 +
 src/index.ts                             |   4 +-
 src/utils.ts                             |  10 +
 10 files changed, 566 insertions(+), 6 deletions(-)
 create mode 100644 src/HTMLNormalizer/HTMLNormalizer.ts
 create mode 100644 src/HTMLNormalizer/index.ts
 create mode 100644 src/HTMLNormalizer/strategies/common.ts
 create mode 100644 src/HTMLNormalizer/strategies/gmail.ts
 create mode 100644 src/HTMLNormalizer/strategies/outlook.ts
 create mode 100644 src/constants.ts
 create mode 100644 src/utils.ts

diff --git a/README.md b/README.md
index b03eaf5..9cb29fa 100644
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-# MIME Verifier
+# MIME Normalizer
diff --git a/package.json b/package.json
index f99ad3c..1fcd574 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
-  "name": "@vereign/mime-verifier",
+  "name": "@vereign/mime-normalizer",
   "author": "Igor Markin <igor.markin@vereign.com>",
-  "description": "A library that handles verification of MIME messages",
+  "description": "A library that handles normalization of MIME plain and html parts",
   "version": "1.0.0",
   "license": "MIT",
   "main": "./dist/index.js",
diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts
new file mode 100644
index 0000000..754d67b
--- /dev/null
+++ b/src/HTMLNormalizer/HTMLNormalizer.ts
@@ -0,0 +1,303 @@
+import { DOCUMENT_NODE, ELEMENT_NODE, EMAIL_VENDORS, TEXT_NODE } from "../constants";
+import { removeSpacesAndLinebreaks } from "../utils";
+import {
+  amendGmailNodes,
+  cleanupGMailElementAttributes,
+  pruneGmailElement,
+} from "./strategies/gmail";
+import {
+  amendOutlookNodes,
+  cleanupOutlookElementAttributes,
+  printOutlookElement,
+  pruneOutlookElement,
+} from "./strategies/outlook";
+
+/** Vendor printer; a falsy result falls back to the generic printout. */
+type PrintFunction = (node: Node) => string;
+
+/** Per-vendor hooks that restructure the document before pruning. */
+const nodesAmendingFunctions = {
+  [EMAIL_VENDORS.GMAIL]: amendGmailNodes,
+  [EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes,
+};
+
+/** Per-vendor predicates deciding which elements are removed. */
+const nodesPruningFunctions = {
+  [EMAIL_VENDORS.GMAIL]: pruneGmailElement,
+  [EMAIL_VENDORS.OUTLOOK]: pruneOutlookElement,
+};
+
+/** Per-vendor hooks that clean up the attributes of one element. */
+const attributesCleanupFunctions = {
+  [EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes,
+  [EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes,
+};
+
+/** Per-vendor custom printers; vendors without one use the generic printout. */
+const vendorPrintingFunctions = {
+  [EMAIL_VENDORS.OUTLOOK]: printOutlookElement,
+};
+
+/**
+ * Normalizes a vendor-produced HTML document into a canonical string: the
+ * document is amended, pruned and stripped of attributes in place, then the
+ * children of its body are printed.
+ *
+ * @param document parsed HTML part; it is mutated by this call
+ * @param vendor one of the EMAIL_VENDORS values
+ * @returns normalized printout of the document body
+ * @throws Error when no pruning function is registered for the vendor
+ */
+export const normalizeVendorHtml = (document: HTMLDocument, vendor: string): string => {
+  const mimeBody = document.body;
+
+  const amendNodesFunction = nodesAmendingFunctions[vendor];
+  if (amendNodesFunction) {
+    amendNodesFunction(document);
+  }
+
+  /**
+   * Remove unnecessary nodes
+   */
+  const elementPruningFunction = nodesPruningFunctions[vendor];
+
+  if (!elementPruningFunction) {
+    throw new Error(
+      `Vendor "${vendor}" is not supported. Please, develop a pruning function for it.`
+    );
+  }
+
+  pruneHtmlNode(document, elementPruningFunction);
+
+  /**
+   * Cleanup unnecessary attributes of nodes
+   */
+  const elementAttributesCleanupFunction =
+    attributesCleanupFunctions[vendor];
+
+  if (elementAttributesCleanupFunction) {
+    cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction);
+  }
+
+  /**
+   * Print nodes
+   */
+  const vendorPrintFunction = vendorPrintingFunctions[vendor];
+
+  return printHtmlChildren(mimeBody, vendorPrintFunction, 0);
+};
+
+/**
+ * Prints all children of a node and concatenates their printouts.
+ *
+ * @param node parent whose children are printed
+ * @param printFunction optional vendor printer tried before the generic one
+ * @param depth nesting level, incremented for every element descended into
+ * @returns concatenated printout, or "" when the node has no children
+ */
+export const printHtmlChildren = (
+  node: Node,
+  printFunction: PrintFunction | null | undefined,
+  depth: number
+): string => {
+  let result = "";
+
+  for (let child = node.firstChild; child; child = child.nextSibling) {
+    result += printHtmlNode(child, printFunction, depth);
+  }
+
+  return result;
+};
+
+/**
+ * Prints a single node. Text becomes <TEXT>...</TEXT> with all whitespace
+ * removed, elements are printed with their attributes sorted by name, and
+ * any other node type yields "".
+ *
+ * @param node node to print
+ * @param printFunction optional vendor printer; its non-empty result wins
+ * @param depth nesting level forwarded to the printout of children
+ * @returns printout of the node including its descendants
+ */
+export const printHtmlNode = (
+  node: Node,
+  printFunction: PrintFunction | null | undefined,
+  depth: number
+): string => {
+  let result = "";
+
+  if (printFunction) {
+    const customPrintout = printFunction(node);
+    if (customPrintout) {
+      return customPrintout;
+    }
+  }
+
+  switch (node.nodeType) {
+    case TEXT_NODE: {
+      const text = removeSpacesAndLinebreaks(node.textContent || "");
+      if (text.length) {
+        result += "<TEXT>";
+        result += text;
+        result += "</TEXT>";
+        result += "\n";
+      }
+      break;
+    }
+    case DOCUMENT_NODE:
+      result += printHtmlChildren(node, printFunction, depth);
+      break;
+    case ELEMENT_NODE:
+      result += "<" + node.nodeName;
+      Array.from((node as HTMLElement).attributes)
+        .sort((a, b) => a.name.localeCompare(b.name))
+        .forEach((attribute) => {
+          result += ` ${attribute.name}`;
+          if (attribute.value) {
+            result += `="${escapeHtmlString(attribute.value)}"`;
+          }
+        });
+
+      if (node.firstChild) {
+        result += ">";
+        result += "\n";
+        result += printHtmlChildren(node, printFunction, depth + 1);
+        result += "</" + node.nodeName + ">";
+      } else {
+        result += "/>";
+      }
+      result += "\n";
+      break;
+  }
+
+  return result;
+};
+
+/**
+ * Applies an attributes cleanup hook to every element of a subtree.
+ *
+ * @param node subtree root; non-element nodes are only traversed
+ * @param cleanupElementAttributes vendor hook that mutates one element
+ */
+export const cleanupHtmlNodeAttributes = (
+  node: Node,
+  cleanupElementAttributes: (element: HTMLElement) => void
+): void => {
+  if (node.nodeType === node.ELEMENT_NODE) {
+    cleanupElementAttributes(node as HTMLElement);
+  }
+
+  let child = node.firstChild;
+  while (child) {
+    cleanupHtmlNodeAttributes(child, cleanupElementAttributes);
+    child = child.nextSibling;
+  }
+};
+
+/**
+ * Removes comments, doctypes, whitespace-only text nodes and every element
+ * rejected by the vendor predicate from a subtree; remaining text is trimmed.
+ *
+ * @param node subtree root; its descendants are mutated in place
+ * @param pruneElement returns true for elements that must be removed
+ * @returns true when the node itself should be removed by the caller
+ */
+export const pruneHtmlNode = (
+  node: Node,
+  pruneElement: (element: HTMLElement) => boolean
+): boolean => {
+  let toBeRemoved = false;
+
+  switch (node.nodeType) {
+    case node.COMMENT_NODE:
+    case node.DOCUMENT_TYPE_NODE:
+      toBeRemoved = true;
+      break;
+    case node.TEXT_NODE: {
+      const trimmedText = (node.textContent || "").trim();
+      if (trimmedText === "") {
+        toBeRemoved = true;
+      } else {
+        node.textContent = trimmedText;
+      }
+      break;
+    }
+    case node.ELEMENT_NODE:
+      toBeRemoved = pruneElement(node as HTMLElement);
+      break;
+  }
+
+  if (toBeRemoved) {
+    return true;
+  }
+
+  // Collect first, remove afterwards, so the sibling walk is not disturbed
+  const childrenToRemove: Node[] = [];
+  let child = node.firstChild;
+
+  while (child) {
+    if (pruneHtmlNode(child, pruneElement)) {
+      childrenToRemove.push(child);
+    }
+    child = child.nextSibling;
+  }
+
+  childrenToRemove.forEach((removed) => node.removeChild(removed));
+
+  return false;
+};
+
+/**
+ * Escapes the characters ", ', &, < and > so that a value can be embedded in
+ * a double-quoted attribute of the printout.
+ *
+ * @param string raw attribute value
+ * @returns the value with those five characters replaced by HTML entities
+ */
+export const escapeHtmlString = (string: string): string => {
+  const matchHtmlRegExp = /["'&<>]/;
+
+  const str = "" + string;
+  const match = matchHtmlRegExp.exec(str);
+
+  if (!match) {
+    return str;
+  }
+
+  let escape: string;
+  let html = "";
+  // Declared outside of the loop: the tail copy below needs its final value
+  let index: number;
+  let lastIndex = 0;
+
+  for (index = match.index; index < str.length; index++) {
+    switch (str.charCodeAt(index)) {
+      case 34: // "
+        escape = "&quot;";
+        break;
+      case 38: // &
+        escape = "&amp;";
+        break;
+      case 39: // '
+        escape = "&#39;";
+        break;
+      case 60: // <
+        escape = "&lt;";
+        break;
+      case 62: // >
+        escape = "&gt;";
+        break;
+      default:
+        continue;
+    }
+
+    if (lastIndex !== index) {
+      html += str.substring(lastIndex, index);
+    }
+
+    lastIndex = index + 1;
+    html += escape;
+  }
+
+  return lastIndex !== index ? html + str.substring(lastIndex, index) : html;
+};
diff --git a/src/HTMLNormalizer/index.ts b/src/HTMLNormalizer/index.ts
new file mode 100644
index 0000000..eddde0e
--- /dev/null
+++ b/src/HTMLNormalizer/index.ts
@@ -0,0 +1,3 @@
+import {normalizeVendorHtml} from "./HTMLNormalizer";
+
+export default normalizeVendorHtml;
diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts
new file mode 100644
index 0000000..b1601a2
--- /dev/null
+++ b/src/HTMLNormalizer/strategies/common.ts
@@ -0,0 +1,67 @@
+const DUMMY_QR_CODE_ID = "dummyQrCode";
+
+export const ELEMENT_TYPES_TO_REMOVE = { br: true, hr: true };
+
+export const ATTRIBUTES_TO_KEEP = {
+  alt: true,
+  src: true,
+  cite: true,
+  data: true,
+  datetime: true,
+  href: true,
+  value: true,
+};
+
+/**
+ * Looks a key up among the own properties of a table, so that names inherited
+ * from Object.prototype (e.g. "constructor") are never treated as listed
+ * @param table
+ * @param key
+ */
+const isListed = (table: Record<string, boolean>, key: string): boolean =>
+  Object.prototype.hasOwnProperty.call(table, key) && table[key];
+
+/**
+ * Tells whether an attribute is listed in ATTRIBUTES_TO_KEEP
+ * @param name
+ */
+export const isAttributeToKeep = (name: string): boolean =>
+  isListed(ATTRIBUTES_TO_KEEP, name);
+
+/**
+ * Detects the dummy QR code in HTML
+ * @param element
+ */
+const isDummyQrCode = (element: HTMLElement): boolean => {
+  return element.id === DUMMY_QR_CODE_ID;
+};
+
+/**
+ * Decides whether node should be removed
+ * @param element
+ */
+export const pruneElement = (element: HTMLElement): boolean => {
+  if (isDummyQrCode(element)) {
+    return true;
+  }
+
+  return isListed(ELEMENT_TYPES_TO_REMOVE, element.nodeName.toLowerCase());
+};
+
+/**
+ * Inserts a shallow copy of an anchor right before a pane, provided that the
+ * href of the anchor parses as a URL that has both a host and a protocol
+ * @param a
+ * @param pane
+ */
+export const cloneAnchorFromPane = (a: HTMLAnchorElement, pane: HTMLElement): void => {
+  try {
+    const url = new URL(a.href);
+    // If this is external url
+    if (url.host && url.protocol && pane.parentNode) {
+      pane.parentNode.insertBefore(a.cloneNode(false), pane);
+    }
+  } catch {
+    return;
+  }
+};
diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts
new file mode 100644
index 0000000..48540e3
--- /dev/null
+++ b/src/HTMLNormalizer/strategies/gmail.ts
@@ -0,0 +1,56 @@
+import {cloneAnchorFromPane, isAttributeToKeep, pruneElement} from "./common";
+
+/**
+ * Returns true if element should be completely removed
+ * @param element
+ */
+export const pruneGmailElement = (element: HTMLElement): boolean => {
+  return pruneElement(element);
+};
+
+/**
+ * Replaces every element with the "gmail_chip" class by shallow copies of the
+ * external links found inside it
+ * @param document document to amend in place
+ */
+export const amendGmailNodes = (document: HTMLDocument): void => {
+  /**
+   * Look for attachments panes and remove everything but links
+   */
+
+  const attachmentsPanes = Array.from(
+    document.getElementsByClassName("gmail_chip")
+  );
+
+  attachmentsPanes.forEach((pane) => {
+    const as = pane.querySelectorAll("a");
+    as.forEach((a) => {
+      cloneAnchorFromPane(a, pane as HTMLElement);
+    });
+  });
+
+  attachmentsPanes.forEach((pane) => {
+    if (pane.parentNode) {
+      pane.parentNode.removeChild(pane);
+    }
+  });
+};
+
+/**
+ * Copies the value of "data-surl" into "src", then removes every attribute
+ * that is not listed in ATTRIBUTES_TO_KEEP
+ * @param element element to clean up in place
+ */
+export const cleanupGMailElementAttributes = (element: HTMLElement): void => {
+  const surl = element.getAttribute("data-surl");
+  if (surl !== null) {
+    element.setAttribute("src", surl);
+  }
+
+  // Iterate over a snapshot: element.attributes is live and shrinks on removal
+  Array.from(element.attributes).forEach((attribute) => {
+    if (!isAttributeToKeep(attribute.name)) {
+      element.removeAttribute(attribute.name);
+    }
+  });
+};
diff --git a/src/HTMLNormalizer/strategies/outlook.ts b/src/HTMLNormalizer/strategies/outlook.ts
new file mode 100644
index 0000000..85f35f8
--- /dev/null
+++ b/src/HTMLNormalizer/strategies/outlook.ts
@@ -0,0 +1,113 @@
+// TODO: Move this logic to amendOutlookNodes
+import {printHtmlChildren} from "../HTMLNormalizer";
+import {ELEMENT_NODE} from "../../constants";
+import {cloneAnchorFromPane, isAttributeToKeep, pruneElement} from "./common";
+
+/**
+ * Prints an element with the "WordSection1" class as the plain printout of
+ * its children, i.e. without the element itself
+ * @param node node being printed
+ * @returns custom printout, or "" to request the generic printout
+ */
+export const printOutlookElement = (node: Node): string => {
+  if (
+    node.nodeType === ELEMENT_NODE &&
+    (node as HTMLElement).classList.contains("WordSection1")
+  ) {
+    return printHtmlChildren(node, null, 0);
+  }
+
+  return "";
+};
+
+/**
+ * Returns true if element should be completely removed
+ * @param element
+ */
+export const pruneOutlookElement = (element: HTMLElement): boolean => {
+  if (pruneElement(element)) {
+    return true;
+  }
+
+  // Remove Outlook generic <o:*> tags
+  return element.nodeName.toLowerCase().startsWith("o:");
+};
+
+/**
+ * Replaces the attachments container with shallow copies of its external
+ * links, removes the container end marker and unwraps every span
+ * @param document document to amend in place
+ */
+export const amendOutlookNodes = (document: HTMLDocument): void => {
+  /**
+   * Get rid of attachments panes
+   */
+  const attachmentsPanesContainer = document.getElementById(
+    "OwaReferenceAttachments"
+  );
+  const attachmentsPanesContainerEnd = document.getElementById(
+    "OwaReferenceAttachmentsEnd"
+  );
+
+  if (attachmentsPanesContainer) {
+    const as = attachmentsPanesContainer.getElementsByTagName("a");
+    Array.from(as).forEach((a) => {
+      cloneAnchorFromPane(a, attachmentsPanesContainer);
+    });
+
+    if (attachmentsPanesContainer.parentNode) {
+      attachmentsPanesContainer.parentNode.removeChild(
+        attachmentsPanesContainer
+      );
+    }
+  }
+
+  if (attachmentsPanesContainerEnd && attachmentsPanesContainerEnd.parentNode) {
+    attachmentsPanesContainerEnd.parentNode.removeChild(
+      attachmentsPanesContainerEnd
+    );
+  }
+
+  /**
+   * Unwind spans, because sometimes Outlook wraps everything into span after sending
+   */
+
+  // Snapshot: getElementsByTagName returns a live collection
+  const spans = Array.from(document.getElementsByTagName("span"));
+
+  spans.forEach((span) => {
+    const parent = span.parentNode;
+    if (!parent) {
+      return;
+    }
+
+    // Children are moved, not cloned, so nested spans stay attached to the
+    // document and are unwound correctly whatever the processing order is
+    while (span.firstChild) {
+      parent.insertBefore(span.firstChild, span);
+    }
+
+    parent.removeChild(span);
+  });
+};
+
+/**
+ * Removes every attribute that is not listed in ATTRIBUTES_TO_KEEP and strips
+ * the leading "x_" from each space-separated part of the remaining values
+ * @param element element to clean up in place
+ */
+export const cleanupOutlookElementAttributes = (element: HTMLElement): void => {
+  // Iterate over a snapshot: element.attributes is live and shrinks on removal
+  Array.from(element.attributes).forEach((attribute) => {
+    if (!isAttributeToKeep(attribute.name)) {
+      element.removeAttribute(attribute.name);
+      return;
+    }
+
+    const parts = attribute.value
+      .split(" ")
+      .map((part) => (part.startsWith("x_") ? part.replace("x_", "") : part));
+
+    element.setAttribute(attribute.name, parts.join(" "));
+  });
+};
diff --git a/src/constants.ts b/src/constants.ts
new file mode 100644
index 0000000..a639830
--- /dev/null
+++ b/src/constants.ts
@@ -0,0 +1,10 @@
+export const ELEMENT_NODE = 1;
+export const TEXT_NODE = 3;
+export const DOCUMENT_NODE = 9;
+
+export const EMAIL_VENDORS = {
+  GMAIL: "GMAIL",
+  OUTLOOK: "OUTLOOK",
+  ROUNDCUBE: "ROUNDCUBE",
+  GENERIC_MIME: "GENERIC_MIME",
+};
diff --git a/src/index.ts b/src/index.ts
index 562c9b7..791915c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,3 +1 @@
-export const add = (a: number, b: number) => {
-  return a + b;
-}
+export { default as HTMLNormalizer } from "./HTMLNormalizer";
diff --git a/src/utils.ts b/src/utils.ts
new file mode 100644
index 0000000..c25376f
--- /dev/null
+++ b/src/utils.ts
@@ -0,0 +1,10 @@
+// Matches a run of whitespace (line breaks included) or a zero-width space
+const SPACES_AND_LINEBREAKS = /\s+|\u200B/g;
+
+/**
+ * Removes all whitespace, line breaks and zero-width spaces from a string
+ * @param s
+ */
+export const removeSpacesAndLinebreaks = (s: string): string => {
+  return s.replace(SPACES_AND_LINEBREAKS, "");
+};
-- 
GitLab