diff --git a/dist/HTMLNormalizer/HTMLNormalizer.d.ts b/dist/HTMLNormalizer/HTMLNormalizer.d.ts index 62ff3a55b69e5319892d8de8f7712d8bc19a7176..c49c6e8766cbb8b7af47c79e165ed918d2f2a7ad 100644 --- a/dist/HTMLNormalizer/HTMLNormalizer.d.ts +++ b/dist/HTMLNormalizer/HTMLNormalizer.d.ts @@ -1,7 +1,6 @@ export declare const normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string; export declare const extractPseudoPlainPart: (document: HTMLDocument) => string; -export declare const printHtmlChildren: (node: Node, printFunction: (node: Node) => string, depth: number) => string; -export declare const printHtmlNode: (node: Node, printFunction: (node: Node) => string, depth: number) => string; -export declare const cleanupHtmlNodeAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void; -export declare const pruneHtmlNode: (node: Node, pruneElement: (element: HTMLElement) => boolean) => boolean; -export declare const escapeHtmlString: (string: string) => string; +export declare const pruneHtmlNode: (node: Node) => boolean; +export declare const normalizeBodyAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void; +export declare const printHtmlChildren: (node: Node, depth: number) => string; +export declare const printHtmlNode: (node: Node, depth: number) => string; diff --git a/dist/HTMLNormalizer/HTMLNormalizer.js b/dist/HTMLNormalizer/HTMLNormalizer.js index bd37044fe90955725bc951a73dd8962d8d6fd5d2..21d0c291ab1a1d55167d7b4b99a22b62d79e3bca 100644 --- a/dist/HTMLNormalizer/HTMLNormalizer.js +++ b/dist/HTMLNormalizer/HTMLNormalizer.js @@ -1,6 +1,6 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.escapeHtmlString = exports.pruneHtmlNode = exports.cleanupHtmlNodeAttributes = exports.printHtmlNode = exports.printHtmlChildren = exports.extractPseudoPlainPart = exports.normalizeVendorHtml = void 0; +exports.printHtmlNode = exports.printHtmlChildren = exports.normalizeBodyAttributes = exports.pruneHtmlNode = exports.extractPseudoPlainPart = exports.normalizeVendorHtml = void 0; const constants_1 = require("../constants"); const outlook_1 = require("./strategies/outlook"); const constants_2 = require("../constants"); @@ -8,100 +8,103 @@ const utils_1 = require("../utils"); const gmail_1 = require("./strategies/gmail"); const index_1 = require("../index"); const common_1 = require("./strategies/common"); -const nodesAmendingFunctions = { - [constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.amendGmailNodes, - [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.amendOutlookNodes, -}; -const nodesPruningFunctions = { - [constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.pruneGmailElement, - [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.pruneOutlookElement, -}; -const attributesCleanupFunctions = { - [constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.cleanupGMailElementAttributes, - [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.cleanupOutlookElementAttributes, -}; -const vendorPrintingFunctions = { - [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.printOutlookElement, +const documentNormalizationFunctions = { + [constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.normalizeGmailDocument, + [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.normalizeOutlookDocument, }; const normalizeVendorHtml = (document, vendor) => { - const mimeBody = document.body; - common_1.amendNodes(document); - const amendNodesFunction = nodesAmendingFunctions[vendor]; - if (amendNodesFunction) { - amendNodesFunction(document); - } /** * Remove unnecessary nodes */ - const elementPruningFunction = nodesPruningFunctions[vendor]; - if (!elementPruningFunction) { - throw new Error(`Vendor "${vendor}" is not supported. Please, develop a pruning function for it.`); - } - exports.pruneHtmlNode(document, elementPruningFunction); + exports.pruneHtmlNode(document.body); /** - * Cleanup unnecessary attributes of nodes + * Apply document normalisations */ - const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor]; - if (elementAttributesCleanupFunction) { - exports.cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction); + common_1.normalizeDocumentCommon(document.body); + const normalizeDocument = documentNormalizationFunctions[vendor]; + if (normalizeDocument) { + normalizeDocument(document); } /** - * Print nodes + * Final printout */ - const vendorPrintFunction = vendorPrintingFunctions[vendor]; - return exports.printHtmlChildren(mimeBody, vendorPrintFunction, 0); + return exports.printHtmlChildren(document.body, 0); }; exports.normalizeVendorHtml = normalizeVendorHtml; -const extractPseudoPlainPart = (document -/*vendor: string*/ -) => { - const textContent = index_1.PlainNormalizer.normalizePlain(document.body.textContent); - // const anchors = document.getElementsByTagName("a"); - // const images = document.getElementsByTagName("img"); - // let meaningfulAttributes = []; - // - // Array.from(anchors).forEach((a) => { - // meaningfulAttributes.push(a.getAttribute("href")); - // }); - // Array.from(images).forEach((img) => { - // meaningfulAttributes.push(img.getAttribute("src")); - // meaningfulAttributes.push(img.getAttribute("alt")); - // }); - // - // meaningfulAttributes = meaningfulAttributes.filter((attr) => !!attr).sort(); - // console.log(meaningfulAttributes); - return textContent; +const extractPseudoPlainPart = (document) => { + let normalizedTextContent = index_1.PlainNormalizer.normalizePlain(document.body.textContent); + const anchors = document.getElementsByTagName("a"); + const images = document.getElementsByTagName("img"); + let urls = []; + Array.from(anchors).forEach((a) => { + urls.push(a.getAttribute("href")); + }); + Array.from(images).forEach((img) => { + urls.push(img.getAttribute("src")); + }); + urls = urls.filter((attr) => !!attr).sort(); + normalizedTextContent += urls.join(","); + return normalizedTextContent; }; exports.extractPseudoPlainPart = extractPseudoPlainPart; -const printHtmlChildren = (node, printFunction, depth) => { +const pruneHtmlNode = (node) => { + let toBeRemoved = false; + switch (node.nodeType) { + case constants_1.COMMENT_NODE: + case constants_1.DOCUMENT_TYPE_NODE: + toBeRemoved = true; + break; + case constants_1.ELEMENT_NODE: + toBeRemoved = common_1.pruneElement(node); + break; + } + if (toBeRemoved) { + return true; + } + const childrenToRemove = []; + let child = node.firstChild; + while (child) { + exports.pruneHtmlNode(child) && childrenToRemove.push(child); + child = child.nextSibling; + } + childrenToRemove.forEach((child) => node.removeChild(child)); + return false; +}; +exports.pruneHtmlNode = pruneHtmlNode; +const normalizeBodyAttributes = (node, cleanupElementAttributes) => { + if (node.nodeType === constants_1.ELEMENT_NODE) { + cleanupElementAttributes(node); + } + let child = node.firstChild; + while (child) { + exports.normalizeBodyAttributes(child, cleanupElementAttributes); + child = child.nextSibling; + } +}; +exports.normalizeBodyAttributes = normalizeBodyAttributes; +const printHtmlChildren = (node, depth) => { let child = node.firstChild; if (!child) { return ""; } if (child == node.lastChild && child.nodeType == constants_1.TEXT_NODE) { - return exports.printHtmlNode(child, printFunction, depth); + return exports.printHtmlNode(child, depth); } else { let result = ""; while (child) { - result = result.concat(exports.printHtmlNode(child, printFunction, depth)); + result = result.concat(exports.printHtmlNode(child, depth)); child = child.nextSibling; } return result; } }; exports.printHtmlChildren = printHtmlChildren; -const printHtmlNode = (node, printFunction, depth) => { +const printHtmlNode = (node, depth) => { let result = ""; - if (printFunction) { - const customPrintout = printFunction(node); - if (customPrintout) { - return customPrintout; - } - } switch (node.nodeType) { case constants_1.TEXT_NODE: { - const text = utils_1.removeSpacesAndLinebreaks(node.textContent); + const text = utils_1.normalizeTextSpacings(node.textContent).trim(); if (text.length) { result += "<TEXT>"; result += text; @@ -111,24 +114,30 @@ const printHtmlNode = (node, printFunction, depth) => { break; } case constants_1.DOCUMENT_NODE: - result += exports.printHtmlChildren(node, printFunction, depth); + result += exports.printHtmlChildren(node, depth); break; case constants_1.ELEMENT_NODE: result += "<" + node.nodeName; Array.from(node.attributes) + .filter((a) => common_1.ATTRIBUTES_TO_KEEP[a.name]) .sort((a, b) => a.name.localeCompare(b.name)) .forEach((attribute) => { result += ` ${attribute.name}`; if (attribute.value) { - result += `="${exports.escapeHtmlString(attribute.value)}"`; + result += `="${attribute.value}"`; } }); if (node.firstChild) { - result += ">"; - result += "\n"; - const printout = exports.printHtmlChildren(node, printFunction, depth + 1); - result += printout; - result += "</" + node.nodeName + ">"; + const printout = exports.printHtmlChildren(node, depth + 1); + if (printout.trim().length === 0) { + result += "/>"; + } + else { + result += ">"; + result += "\n"; + result += printout; + result += "</" + node.nodeName + ">"; + } } else { result += "/>"; @@ -139,87 +148,3 @@ const printHtmlNode = (node, printFunction, depth) => { return result; }; exports.printHtmlNode = printHtmlNode; -const cleanupHtmlNodeAttributes = (node, cleanupElementAttributes) => { - if (node.nodeType === constants_1.ELEMENT_NODE) { - cleanupElementAttributes(node); - } - let child = node.firstChild; - while (child) { - exports.cleanupHtmlNodeAttributes(child, cleanupElementAttributes); - child = child.nextSibling; - } -}; -exports.cleanupHtmlNodeAttributes = cleanupHtmlNodeAttributes; -const pruneHtmlNode = (node, pruneElement) => { - let toBeRemoved = false; - switch (node.nodeType) { - case constants_1.COMMENT_NODE: - case constants_1.DOCUMENT_TYPE_NODE: - toBeRemoved = true; - break; - case constants_1.TEXT_NODE: { - const trimmedText = node.textContent.trim(); - if (trimmedText === "") { - toBeRemoved = true; - } - else { - node.textContent = trimmedText; - } - break; - } - case constants_1.ELEMENT_NODE: - toBeRemoved = pruneElement(node); - } - if (toBeRemoved) { - return true; - } - const childrenToRemove = []; - let child = node.firstChild; - while (child) { - exports.pruneHtmlNode(child, pruneElement) && childrenToRemove.push(child); - child = child.nextSibling; - } - childrenToRemove.forEach((child) => node.removeChild(child)); - return false; -}; -exports.pruneHtmlNode = pruneHtmlNode; -const escapeHtmlString = (string) => { - const matchHtmlRegExp = /["'&<>]/; - const str = "" + string; - const match = matchHtmlRegExp.exec(str); - if (!match) { - return str; - } - let escape; - let html = ""; - let index = 0; - let lastIndex = 0; - for (index = match.index; index < str.length; index++) { - switch (str.charCodeAt(index)) { - case 34: // " - escape = """; - break; - case 38: // & - escape = "&"; - break; - case 39: // ' - escape = "'"; - break; - case 60: // < - escape = "<"; - break; - case 62: // > - escape = ">"; - break; - default: - continue; - } - if (lastIndex !== index) { - html += str.substring(lastIndex, index); - } - lastIndex = index + 1; - html += escape; - } - return lastIndex !== index ? html + str.substring(lastIndex, index) : html; -}; -exports.escapeHtmlString = escapeHtmlString; diff --git a/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.d.ts b/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.d.ts index edaf502f86786f53b7de96549eb34fa40f391896..3138d95617b5615395083a4f32ded746ae94aad3 100644 --- a/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.d.ts +++ b/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.d.ts @@ -1 +1 @@ -export declare const unwindTags: (node: Element | Document, tagName: string) => void; +export declare const unwindTags: (nodes: Array<Node>) => void; diff --git a/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.js b/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.js index da12e8e0ec9d4230830c359e6ae1cc8259e26c82..39cfcf7affd9b07d2e6c055ee983979c9abdc339 100644 --- a/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.js +++ b/dist/HTMLNormalizer/strategies/nodesAmendingFunctions.js @@ -1,11 +1,10 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.unwindTags = void 0; -const unwindTags = (node, tagName) => { - const tags = node.getElementsByTagName(tagName); - //Sort tags by depth to start unwinding the deepest ones, which does not contain nested spans +const unwindTags = (nodes) => { + //Sort nodes by depth to start unwinding the deepest ones const tagsDepths = {}; - Array.from(tags).forEach((span) => { + Array.from(nodes).forEach((span) => { let descendant = span; let parent = descendant.parentNode; let depth = 0; diff --git a/dist/HTMLNormalizer/strategies/outlook.js b/dist/HTMLNormalizer/strategies/outlook.js index 2356919cda8c677993e63fbbf1e132925c8a511e..6e7f859d68e3f049c4b842280da7e9efe5d1b502 100644 --- a/dist/HTMLNormalizer/strategies/outlook.js +++ b/dist/HTMLNormalizer/strategies/outlook.js @@ -1,30 +1,9 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.cleanupOutlookElementAttributes = exports.amendOutlookNodes = exports.pruneOutlookElement = exports.printOutlookElement = void 0; -// TODO: Move this logic to amendOutlookNodes -const HTMLNormalizer_1 = require("../HTMLNormalizer"); +exports.normalizeOutlookDocument = void 0; const constants_1 = require("../../constants"); const common_1 = require("./common"); -const printOutlookElement = (node) => { - if (node.nodeType === constants_1.ELEMENT_NODE) { - if (node.classList.contains("WordSection1")) { - return HTMLNormalizer_1.printHtmlChildren(node, null, 0); - } - } -}; -exports.printOutlookElement = printOutlookElement; -/** - * Returns true if element should be completely removed - * @param element - */ -const pruneOutlookElement = (element) => { - if (common_1.pruneElement(element)) { - return true; - } - // Remove Outlook generic <o:*> tags - return !!element.nodeName.toLowerCase().startsWith("o:"); -}; -exports.pruneOutlookElement = pruneOutlookElement; +const nodesAmendingFunctions_1 = require("./nodesAmendingFunctions"); const qrCodeContainerIds = { "test-for-us": 1, }; @@ -49,57 +28,44 @@ const removeQrCodeNodes = (document) => { const elementsToRemove = remove(document.body); elementsToRemove.forEach((element) => element.parentNode.removeChild(element)); }; -const amendOutlookNodes = (document) => { - /** - * Remove quoted text - */ - // Quoted text in web apps - // const appendOnSend = document.querySelector( - // "[id*='appendonsend']" - // ) as Node; - // - // if (appendOnSend) { - // let child = appendOnSend; - // while (child) { - // const nextSibling = child.nextSibling; - // child.parentNode.removeChild(child); - // child = nextSibling as Node; - // } - // } - // Quoted text in desktop apps - // let mailOriginal = document.querySelector("[name*='_MailOriginal']") as HTMLElement; - // if (mailOriginal) { - // let removeCurrent = true; - // while (mailOriginal !== document.body) { - // while (mailOriginal.nextSibling) { - // mailOriginal.nextSibling.remove(); - // } - // const currentNode = mailOriginal; - // mailOriginal = mailOriginal.parentElement; - // if (removeCurrent && currentNode.previousSibling) { - // currentNode.remove(); - // removeCurrent = false; - // } - // } - // } - // if (mailOriginal) { - // const separatorCandidate = mailOriginal.parentNode as Node; - // - // // while (!(separatorCandidate.parentNode as Element).classList.contains("WordSection1")) { - // // separatorCandidate = separatorCandidate.parentNode; - // // } - // - // let child = separatorCandidate; - // while (child) { - // const nextSibling = child.nextSibling; - // child.parentNode.removeChild(child); - // child = nextSibling as Node; - // } - // } +const normalizeOutlookDocument = (document) => { /** * Remove QR code entries */ removeQrCodeNodes(document); + /** + * Unwind Word o:p paragraphs + */ + const ops = document.getElementsByTagName("o:p"); + nodesAmendingFunctions_1.unwindTags(Array.from(ops)); + /** + * Remove empty paragraphs + */ + const ps = document.getElementsByTagName("p"); + Array.from(ps).forEach((p) => { + if (p.childNodes.length === 0) { + p.parentNode.removeChild(p); + } + if (p.childNodes.length === 1 && p.childNodes[0].nodeType === constants_1.TEXT_NODE) { + const text = p.childNodes[0].textContent; + if (!text.replace(/\u00A0/g, "").trim()) { + p.parentNode.removeChild(p); + } + } + }); + /** + * Unwind all MSONormal, because outlook might wrap them into <div></div> + */ + const msoNormalWrappers = document.getElementsByClassName("MsoNormal"); + const msoNormalParents = Array.from(msoNormalWrappers) + .map((node) => node.parentNode) + .filter((node, index, self) => self.indexOf(node) === index); + nodesAmendingFunctions_1.unwindTags(msoNormalParents); + /** + * Unwind WordSection1 tags + */ + const wordSectionWrappers = document.getElementsByClassName("WordSection1"); + nodesAmendingFunctions_1.unwindTags(Array.from(wordSectionWrappers)); /** * Get rid of attachments panes */ @@ -120,51 +86,6 @@ const amendOutlookNodes = (document) => { * Unwind spans, because sometimes Outlook wraps everything into span after sending */ const spans = document.getElementsByTagName("span"); - //Sort spans by depth to start unwinding the deepest ones, which does not contain nested spans - const spansDepths = {}; - Array.from(spans).forEach((span) => { - let descendant = span; - let parent = descendant.parentNode; - let depth = 0; - while (parent && descendant !== parent) { - descendant = parent; - parent = descendant.parentNode; - depth++; - } - if (!spansDepths[depth]) { - spansDepths[depth] = []; - } - spansDepths[depth].push(span); - }); - Object.keys(spansDepths) - .sort((a, b) => parseInt(b) - parseInt(a)) - .forEach((depth) => { - spansDepths[depth].forEach((span) => { - let child = span.firstChild; - const parent = span.parentNode; - while (child) { - parent.insertBefore(child.cloneNode(true), span); - child = child.nextSibling; - } - span.parentNode.removeChild(span); - }); - }); -}; -exports.amendOutlookNodes = amendOutlookNodes; -const cleanupOutlookElementAttributes = (element) => { - if (element.attributes.length > 0) { - for (const attribute of element.attributes) { - let valueSplit = attribute.value.split(" "); - valueSplit = valueSplit.map((value) => value.startsWith("x_") ? value.replace("x_", "") : value); - element.setAttribute(attribute.name, valueSplit.join(" ")); - } - for (let i = 0; i < element.attributes.length; i++) { - const attribute = element.attributes[i]; - if (!common_1.ATTRIBUTES_TO_KEEP[attribute.name]) { - element.removeAttribute(attribute.name); - i--; - } - } - } + nodesAmendingFunctions_1.unwindTags(Array.from(spans)); }; -exports.cleanupOutlookElementAttributes = cleanupOutlookElementAttributes; +exports.normalizeOutlookDocument = normalizeOutlookDocument; diff --git a/dist/PlainNormalizer/PlainNormalizer.d.ts b/dist/PlainNormalizer/PlainNormalizer.d.ts index d9447858e7ca1c3b615e23cb0a4318627150a85b..aaf70efeaee2e98d60ec066f1d62ab6b27d41d7d 100644 --- a/dist/PlainNormalizer/PlainNormalizer.d.ts +++ b/dist/PlainNormalizer/PlainNormalizer.d.ts @@ -1 +1,2 @@ export declare const normalizePlainPart: (text: string) => string; +export declare const cleanupHiddenCharacters: (s: string) => string; diff --git a/dist/PlainNormalizer/PlainNormalizer.js b/dist/PlainNormalizer/PlainNormalizer.js index a30288b0bad49d16dbeac81429be53bf9d8ea8f1..cfb447d0ee36130b53180dc99f4086225ceeac26 100644 --- a/dist/PlainNormalizer/PlainNormalizer.js +++ b/dist/PlainNormalizer/PlainNormalizer.js @@ -1,15 +1,16 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.normalizePlainPart = void 0; +exports.cleanupHiddenCharacters = exports.normalizePlainPart = void 0; // this is a Node module. require is a must to work across different envs const URL = require("url-parse"); const utils_1 = require("../utils"); const normalizePlainPart = (text) => { + text = exports.cleanupHiddenCharacters(text); text = removeListBullets(text); - text = utils_1.removeSpacesAndLinebreaks(text); text = removeQRCodes(text); + text = utils_1.normalizeTextSpacings(text); text = patchOutlookSafelinksWrappers(text); - return text; + return text.trim(); }; exports.normalizePlainPart = normalizePlainPart; const patchOutlookSafelinksWrappers = (text) => { @@ -25,9 +26,14 @@ const patchOutlookSafelinksWrappers = (text) => { }; const removeQRCodes = (s) => { return s - .replace(/\[(image:)*qrcode.png]\s*<https:\/\/.+?>/g, "") + .replace(/\[(image:\s)*qrcode.png]\s*<https:\/\/.+?>/g, "") .replace(/<https:\/\/.+?>\s*\[(image: )*qrcode.png]/g, ""); }; const removeListBullets = (s) => { return s.replace("\n[o§]\n+/g", ""); }; +const cleanupHiddenCharacters = (s) => { + const removeSymbols = new RegExp(/[\u200B]+/g); + return s.replace(removeSymbols, ""); +}; +exports.cleanupHiddenCharacters = cleanupHiddenCharacters; diff --git a/dist/utils.d.ts b/dist/utils.d.ts index 4a534c0b2cea6597399f003f3872310489fc7306..0cfde28094b311309c1e28f69b51813ae482d049 100644 --- a/dist/utils.d.ts +++ b/dist/utils.d.ts @@ -1 +1 @@ -export declare const removeSpacesAndLinebreaks: (s: string) => string; +export declare const normalizeTextSpacings: (s: string) => string; diff --git a/dist/utils.js b/dist/utils.js index 3030ed27ee0506f9f3c3d118b607339781fb0520..47c164d95db73b04ae0081f33f1e03ff871c5c83 100644 --- a/dist/utils.js +++ b/dist/utils.js @@ -1,8 +1,7 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.removeSpacesAndLinebreaks = void 0; -const removeSpacesAndLinebreaks = (s) => { - const removeSymbols = new RegExp(/[\r\n\v\s\u200B]+/g); - return s.replace(removeSymbols, "").trim(); +exports.normalizeTextSpacings = void 0; +const normalizeTextSpacings = (s) => { + return s.replace(/[\r\n\v\s\u00A0]+/g, " "); }; -exports.removeSpacesAndLinebreaks = removeSpacesAndLinebreaks; +exports.normalizeTextSpacings = normalizeTextSpacings;