From ebfb5bc14090ff370411eb2194cbb0a4430b7ad0 Mon Sep 17 00:00:00 2001
From: igor <igor.markin@vereign.com>
Date: Fri, 25 Dec 2020 12:04:26 +0300
Subject: [PATCH] Optimize attributes normalisation

---
 .../outlook-gmail/chrome-chrome/README.md     |   2 +-
 src/HTMLNormalizer/HTMLNormalizer.ts          | 194 ++++++------------
 src/HTMLNormalizer/strategies/common.ts       |  16 +-
 src/HTMLNormalizer/strategies/gmail.ts        |  27 +--
 src/HTMLNormalizer/strategies/outlook.ts      |  28 +--
 5 files changed, 80 insertions(+), 187 deletions(-)

diff --git a/__tests__/files/outlook-gmail/chrome-chrome/README.md b/__tests__/files/outlook-gmail/chrome-chrome/README.md
index a34356d..04bd6e4 100644
--- a/__tests__/files/outlook-gmail/chrome-chrome/README.md
+++ b/__tests__/files/outlook-gmail/chrome-chrome/README.md
@@ -15,5 +15,5 @@
 | 13          | 4 replies with test case 05                                                                                                                                                                                                                                                                      | ok              | ok               | ok                   |
 | 14          | 4 replies with test case 06                                                                                                                                                                                                                                                                      | ok              | ok               | ok                   |
 | 15          | 4 replies with test case 07 (with both OneDrive and Google Drive attachments)                                                                                                                                                                                                                    | ok              | ok               | ok                   |
-| 16          | 4 replies with test case 08 (with both OneDrive and Google Drive attachments)                                                                                                                                                                                                                    | fail            | fail             | fail                 |
+| 16          | 4 replies with test case 08 (with both OneDrive and Google Drive attachments)                                                                                                                                                                                                                    | ok              | ok               | ok                   |
 | 17          | Multiple forwarded emails using cases 01-08:<br><br>E.g. <br>- User A sends "case 01" to User B<br>- User B forwards "case 02" to User A<br>- User A forwards "case 03" to User B<br>...<br>- User B forwards "case 08" to User A<br>- User A completes circle by forwarding "case 01" to User B | ok              | ok               | ok                   |
diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts
index a137404..b58b574 100644
--- a/src/HTMLNormalizer/HTMLNormalizer.ts
+++ b/src/HTMLNormalizer/HTMLNormalizer.ts
@@ -5,27 +5,20 @@ import {
   ELEMENT_NODE,
   TEXT_NODE,
 } from "../constants";
-import {
-  amendOutlookNodes,
-  cleanupOutlookElementAttributes,
-} from "./strategies/outlook";
+import { normalizeOutlookDocument } from "./strategies/outlook";
 import { EMAIL_VENDORS } from "../constants";
 import { normalizeTextSpacings } from "../utils";
-import {
-  amendGmailNodes,
-  cleanupGMailElementAttributes,
-} from "./strategies/gmail";
+import { normalizeGmailDocument } from "./strategies/gmail";
 import { PlainNormalizer } from "../index";
-import { amendNodes, pruneElement } from "./strategies/common";
-
-const nodesAmendingFunctions = {
-  [EMAIL_VENDORS.GMAIL]: amendGmailNodes,
-  [EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes,
-};
-
-const attributesCleanupFunctions = {
-  [EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes,
-  [EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes,
+import {
+  ATTRIBUTES_TO_KEEP,
+  normalizeDocumentCommon,
+  pruneElement,
+} from "./strategies/common";
+
+const documentNormalizationFunctions = {
+  [EMAIL_VENDORS.GMAIL]: normalizeGmailDocument,
+  [EMAIL_VENDORS.OUTLOOK]: normalizeOutlookDocument,
 };
 
 export const normalizeVendorHtml = (
@@ -37,21 +30,18 @@ export const normalizeVendorHtml = (
    */
   pruneHtmlNode(document.body);
 
-  amendNodes(document);
-  const amendNodesFunction = nodesAmendingFunctions[vendor];
-  if (amendNodesFunction) {
-    amendNodesFunction(document);
-  }
-
   /**
-   * Cleanup unnecessary attributes of nodes
+   * Apply document normalisations
    */
-  const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor];
-
-  if (elementAttributesCleanupFunction) {
-    cleanupHtmlNodeAttributes(document.body, elementAttributesCleanupFunction);
+  normalizeDocumentCommon(document.body);
+  const normalizeDocument = documentNormalizationFunctions[vendor];
+  if (normalizeDocument) {
+    normalizeDocument(document);
   }
 
+  /**
+   * Final printout
+   */
   return printHtmlChildren(document.body, 0);
 };
 
@@ -73,11 +63,55 @@ export const extractPseudoPlainPart = (document: HTMLDocument): string => {
 
   urls = urls.filter((attr) => !!attr).sort();
 
-  normalizedTextContent += urls.sort((a, b) => a.localeCompare(b)).join(",");
+  normalizedTextContent += urls.join(",");
 
   return normalizedTextContent;
 };
 
+export const pruneHtmlNode = (node: Node): boolean => {
+  let toBeRemoved = false;
+
+  switch (node.nodeType) {
+    case COMMENT_NODE:
+    case DOCUMENT_TYPE_NODE:
+      toBeRemoved = true;
+      break;
+    case ELEMENT_NODE:
+      toBeRemoved = pruneElement(node as HTMLElement);
+      break;
+  }
+
+  if (toBeRemoved) {
+    return true;
+  }
+
+  const childrenToRemove = [];
+  let child = node.firstChild;
+  while (child) {
+    pruneHtmlNode(child) && childrenToRemove.push(child);
+    child = child.nextSibling;
+  }
+
+  childrenToRemove.forEach((child) => node.removeChild(child));
+
+  return false;
+};
+
+export const normalizeBodyAttributes = (
+  node: Node,
+  cleanupElementAttributes: (element: HTMLElement) => void
+): void => {
+  if (node.nodeType === ELEMENT_NODE) {
+    cleanupElementAttributes(node as HTMLElement);
+  }
+
+  let child = node.firstChild;
+  while (child) {
+    normalizeBodyAttributes(child as HTMLElement, cleanupElementAttributes);
+    child = child.nextSibling;
+  }
+};
+
 export const printHtmlChildren = (node: Node, depth: number): string => {
   let child = node.firstChild;
   if (!child) {
@@ -117,11 +151,12 @@ export const printHtmlNode = (node: Node, depth: number): string => {
     case ELEMENT_NODE:
       result += "<" + node.nodeName;
       Array.from((node as HTMLElement).attributes)
+        .filter((a) => ATTRIBUTES_TO_KEEP[a.name])
         .sort((a, b) => a.name.localeCompare(b.name))
         .forEach((attribute) => {
           result += ` ${attribute.name}`;
           if (attribute.value) {
-            result += `="${escapeHtmlString(attribute.value)}"`;
+            result += `="${attribute.value}"`;
           }
         });
 
@@ -144,100 +179,3 @@ export const printHtmlNode = (node: Node, depth: number): string => {
 
   return result;
 };
-
-export const cleanupHtmlNodeAttributes = (
-  node: Node,
-  cleanupElementAttributes: (element: HTMLElement) => void
-): void => {
-  if (node.nodeType === ELEMENT_NODE) {
-    cleanupElementAttributes(node as HTMLElement);
-  }
-
-  let child = node.firstChild;
-  while (child) {
-    cleanupHtmlNodeAttributes(child as HTMLElement, cleanupElementAttributes);
-    child = child.nextSibling;
-  }
-};
-
-export const pruneHtmlNode = (node: Node): boolean => {
-  let toBeRemoved = false;
-
-  switch (node.nodeType) {
-    case COMMENT_NODE:
-    case DOCUMENT_TYPE_NODE:
-      toBeRemoved = true;
-      break;
-    case TEXT_NODE: {
-      if (node.textContent === "") {
-        toBeRemoved = true;
-      }
-      break;
-    }
-    case ELEMENT_NODE:
-      toBeRemoved = pruneElement(node as HTMLElement);
-      break;
-  }
-
-  if (toBeRemoved) {
-    return true;
-  }
-
-  const childrenToRemove = [];
-  let child = node.firstChild;
-  while (child) {
-    pruneHtmlNode(child) && childrenToRemove.push(child);
-    child = child.nextSibling;
-  }
-
-  childrenToRemove.forEach((child) => node.removeChild(child));
-
-  return false;
-};
-
-export const escapeHtmlString = (string: string): string => {
-  const matchHtmlRegExp = /["'&<>]/;
-
-  const str = "" + string;
-  const match = matchHtmlRegExp.exec(str);
-
-  if (!match) {
-    return str;
-  }
-
-  let escape;
-  let html = "";
-  let index;
-  let lastIndex = 0;
-
-  for (index = match.index; index < str.length; index++) {
-    switch (str.charCodeAt(index)) {
-      case 34: // "
-        escape = "&quot;";
-        break;
-      case 38: // &
-        escape = "&amp;";
-        break;
-      case 39: // '
-        escape = "&#39;";
-        break;
-      case 60: // <
-        escape = "&lt;";
-        break;
-      case 62: // >
-        escape = "&gt;";
-        break;
-      default:
-        continue;
-    }
-
-    if (lastIndex !== index) {
-      html += str.substring(lastIndex, index);
-    }
-
-    lastIndex = index + 1;
-    html += escape;
-  }
-
-  return lastIndex !== index ? html + str.substring(lastIndex, index) : html;
-};
diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts
index e41656a..3c3a613 100644
--- a/src/HTMLNormalizer/strategies/common.ts
+++ b/src/HTMLNormalizer/strategies/common.ts
@@ -2,10 +2,10 @@
 const URL = require("url-parse");
 
 export const ELEMENT_TYPES_TO_REMOVE = {
-  br: true,
-  hr: true,
-  use: true,
-  svg: true,
+  BR: true,
+  HR: true,
+  USE: true,
+  SVG: true,
 };
 
 export const ATTRIBUTES_TO_KEEP = {
@@ -18,11 +18,11 @@ export const ATTRIBUTES_TO_KEEP = {
   value: true,
 };
 
-export const amendNodes = (document: HTMLDocument): void => {
+export const normalizeDocumentCommon = (body: HTMLElement): void => {
   /**
    * Unwind Outlook safelink wrappers
    */
-  const anchors = document.getElementsByTagName("a");
+  const anchors = body.getElementsByTagName("a");
   for (const anchor of anchors) {
     const url = new URL(anchor.getAttribute("href"), true);
 
@@ -34,7 +34,7 @@ export const amendNodes = (document: HTMLDocument): void => {
   /**
    * Unwind Gmail "googleusercontent" wrappers
    */
-  const images = document.getElementsByTagName("img");
+  const images = body.getElementsByTagName("img");
   for (const image of images) {
     let url;
     try {
@@ -57,7 +57,7 @@ export const amendNodes = (document: HTMLDocument): void => {
  * @param element
  */
 export const pruneElement = (element: HTMLElement): boolean => {
-  return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName.toLowerCase()];
+  return !!ELEMENT_TYPES_TO_REMOVE[element.nodeName];
 };
 
 export const cloneAnchorFromPane = (
diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts
index efd990e..058b35d 100644
--- a/src/HTMLNormalizer/strategies/gmail.ts
+++ b/src/HTMLNormalizer/strategies/gmail.ts
@@ -1,4 +1,4 @@
-import { ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common";
+import { cloneAnchorFromPane } from "./common";
 import { ELEMENT_NODE } from "../../constants";
 
 const qrCodeContainerIds = { vereignWrapperLink: 1 };
@@ -36,15 +36,12 @@ const removeQrCodeNodes = (document: HTMLDocument) => {
   );
 };
 
-export const amendGmailNodes = (document: HTMLDocument): void => {
-  // unwindTags(document, "span");
-
+export const normalizeGmailDocument = (document: HTMLDocument): void => {
   removeQrCodeNodes(document);
 
   /**
-   * Look for attachments panes and remove everything but links
+   * Look for attachments panes and extract <a> tags from them
    */
-
   const attachmentsPanes = Array.from(
     document.getElementsByClassName("gmail_chip")
   );
@@ -60,21 +57,3 @@ export const amendGmailNodes = (document: HTMLDocument): void => {
     pane.parentNode.removeChild(pane);
   });
 };
-
-export const cleanupGMailElementAttributes = (element: HTMLElement): void => {
-  if (element.attributes.length > 0) {
-    for (const attribute of element.attributes) {
-      if (attribute.name === "data-surl") {
-        element.setAttribute("src", attribute.value);
-      }
-    }
-
-    for (let i = 0; i < element.attributes.length; i++) {
-      const attribute = element.attributes[i];
-      if (!ATTRIBUTES_TO_KEEP[attribute.name]) {
-        element.removeAttribute(attribute.name);
-        i--;
-      }
-    }
-  }
-};
diff --git a/src/HTMLNormalizer/strategies/outlook.ts b/src/HTMLNormalizer/strategies/outlook.ts
index 4584d7f..e286494 100644
--- a/src/HTMLNormalizer/strategies/outlook.ts
+++ b/src/HTMLNormalizer/strategies/outlook.ts
@@ -1,5 +1,5 @@
 import { ELEMENT_NODE, TEXT_NODE } from "../../constants";
-import { ATTRIBUTES_TO_KEEP, cloneAnchorFromPane } from "./common";
+import { cloneAnchorFromPane } from "./common";
 import { unwindTags } from "./nodesAmendingFunctions";
 
 const qrCodeContainerIds = {
@@ -39,11 +39,10 @@ const removeQrCodeNodes = (document: HTMLDocument) => {
   );
 };
 
-export const amendOutlookNodes = (document: HTMLDocument): void => {
+export const normalizeOutlookDocument = (document: HTMLDocument): void => {
   /**
    * Remove QR code entries
    */
-
   removeQrCodeNodes(document);
 
   /**
@@ -76,7 +75,6 @@ export const amendOutlookNodes = (document: HTMLDocument): void => {
   const msoNormalParents = Array.from(msoNormalWrappers)
     .map((node) => node.parentNode)
     .filter((node, index, self) => self.indexOf(node) === index);
-
   unwindTags(msoNormalParents);
 
   /**
@@ -119,25 +117,3 @@ export const amendOutlookNodes = (document: HTMLDocument): void => {
   const spans = document.getElementsByTagName("span");
   unwindTags(Array.from(spans));
 };
-
-export const cleanupOutlookElementAttributes = (element: HTMLElement): void => {
-  if (element.attributes.length > 0) {
-    for (const attribute of element.attributes) {
-      let valueSplit = attribute.value.split(" ");
-
-      valueSplit = valueSplit.map((value) =>
-        value.startsWith("x_") ? value.replace("x_", "") : value
-      );
-
-      element.setAttribute(attribute.name, valueSplit.join(" "));
-    }
-
-    for (let i = 0; i < element.attributes.length; i++) {
-      const attribute = element.attributes[i];
-      if (!ATTRIBUTES_TO_KEEP[attribute.name]) {
-        element.removeAttribute(attribute.name);
-        i--;
-      }
-    }
-  }
-};
-- 
GitLab