From 5cef614c4f87d42ef5d2917ea6d9175910e112ec Mon Sep 17 00:00:00 2001 From: igor <igor.markin@vereign.com> Date: Wed, 2 Dec 2020 17:28:15 +0300 Subject: [PATCH] Fixes for DOM gmail parser --- __tests__/utils.ts | 32 +++++++++++-------- package.json | 3 +- src/HTMLNormalizer/HTMLNormalizer.ts | 41 ++++++++++++++++--------- src/HTMLNormalizer/strategies/common.ts | 2 +- src/HTMLNormalizer/strategies/gmail.ts | 4 +-- src/constants.ts | 2 ++ yarn.lock | 17 ++++++++++ 7 files changed, 70 insertions(+), 31 deletions(-) diff --git a/__tests__/utils.ts b/__tests__/utils.ts index 1d60560..a4e76a0 100644 --- a/__tests__/utils.ts +++ b/__tests__/utils.ts @@ -1,4 +1,5 @@ import { JSDOM } from "jsdom"; +import { DOM } from "@vereign/dom"; const fs = require("fs"); const SENT_HTML_NAME = "s_htmlContent.html"; @@ -50,7 +51,7 @@ export const getNormalizedHtml = ( .readFileSync(`${testCasePath}/${RECEIVED_HTML_NAME}`) .toString(); - const sentDOM = new JSDOM(sentHtml); + const sentDOM = new DOM(sentHtml); const receivedDOM = new JSDOM(receivedHtml); const sentNormalizedHtml = HTMLNormalizer.normalizeVendorHtml( @@ -76,22 +77,27 @@ export const createDescribeHtmlTestCases = ( * @param casesGroupName - name of the folder with cases * @param failingCases - a list of cases that are failing and ignored. Pending to be fixed */ - (casesGroupName: string, failingCases: Array<string> = []) => (): void => { + ( + casesGroupName: string, + failingCases?: Array<string>, + casesToCheckOnly?: Array<string> + ) => (): void => { const testsCasesPath = testsPath + "/" + casesGroupName; - const testCasesDirs = getTestCasesDirs(testsCasesPath).filter( - (dir) => !failingCases.includes(dir) - ); + let testCasesDirs = getTestCasesDirs(testsCasesPath); + + if (casesToCheckOnly) { + testCasesDirs = testCasesDirs.filter((dir) => + casesToCheckOnly.includes(dir) + ); + } + + if (failingCases) { + testCasesDirs.filter((dir) => !failingCases.includes(dir)); + } test.each(testCasesDirs)("Case %s", (dirName: string) => { const testCasePath = testsCasesPath + "/" + dirName; - let normalizedHtmls; - try { - normalizedHtmls = getNormalizedHtml(testCasePath, vendor); - } catch (e) { - console.log(`Invalid test case: ${casesGroupName}/${dirName}`); - return; - } - + const normalizedHtmls = getNormalizedHtml(testCasePath, vendor); const { sentHtml, receivedHtml } = normalizedHtmls; // expect(receivedHtml.length).toBeGreaterThan(0); diff --git a/package.json b/package.json index 1fcd574..becd0b5 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,8 @@ "jest": "^26.4.2", "lint-staged": "^10.2.13", "prettier": "^2.1.1", - "typescript": "^4.0.2" + "typescript": "^4.0.2", + "@vereign/dom": "git+ssh://git@code.vereign.com:code/js-toolbox/gsdom.git#rework-with-parse-5" }, "scripts": { "prepare": "yarn build", diff --git a/src/HTMLNormalizer/HTMLNormalizer.ts b/src/HTMLNormalizer/HTMLNormalizer.ts index 754d67b..291336c 100644 --- a/src/HTMLNormalizer/HTMLNormalizer.ts +++ b/src/HTMLNormalizer/HTMLNormalizer.ts @@ -1,13 +1,23 @@ -import { DOCUMENT_NODE, ELEMENT_NODE, TEXT_NODE } from "../constants"; +import { + COMMENT_NODE, + DOCUMENT_NODE, + DOCUMENT_TYPE_NODE, + ELEMENT_NODE, + TEXT_NODE, +} from "../constants"; import { amendOutlookNodes, cleanupOutlookElementAttributes, printOutlookElement, - pruneOutlookElement + pruneOutlookElement, } from "./strategies/outlook"; -import {EMAIL_VENDORS} from "../constants"; -import {removeSpacesAndLinebreaks} from "../utils"; -import {amendGmailNodes, cleanupGMailElementAttributes, pruneGmailElement} from "./strategies/gmail"; +import { EMAIL_VENDORS } from "../constants"; +import { removeSpacesAndLinebreaks } from "../utils"; +import { + amendGmailNodes, + cleanupGMailElementAttributes, + pruneGmailElement, +} from "./strategies/gmail"; const nodesAmendingFunctions = { [EMAIL_VENDORS.GMAIL]: amendGmailNodes, @@ -28,7 +38,10 @@ const vendorPrintingFunctions = { [EMAIL_VENDORS.OUTLOOK]: printOutlookElement, }; -export const normalizeVendorHtml = (document: HTMLDocument, vendor: string): string => { +export const normalizeVendorHtml = ( + document: HTMLDocument, + vendor: string +): string => { const mimeBody = document.body; const amendNodesFunction = nodesAmendingFunctions[vendor]; @@ -52,8 +65,7 @@ export const normalizeVendorHtml = (document: HTMLDocument, vendor: string): str /** * Cleanup unnecessary attributes of nodes */ - const elementAttributesCleanupFunction = - attributesCleanupFunctions[vendor]; + const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor]; if (elementAttributesCleanupFunction) { cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction); @@ -132,7 +144,8 @@ export const printHtmlNode = ( if (node.firstChild) { result += ">"; result += "\n"; - result += printHtmlChildren(node, printFunction, depth + 1); + const printout = printHtmlChildren(node, printFunction, depth + 1); + result += printout; result += "</" + node.nodeName + ">"; } else { result += "/>"; @@ -148,7 +161,7 @@ export const cleanupHtmlNodeAttributes = ( node: Node, cleanupElementAttributes: (element: HTMLElement) => void ): void => { - if (node.nodeType === node.ELEMENT_NODE) { + if (node.nodeType === ELEMENT_NODE) { cleanupElementAttributes(node as HTMLElement); } @@ -166,8 +179,8 @@ export const pruneHtmlNode = ( let toBeRemoved = false; switch (node.nodeType) { - case node.COMMENT_NODE: - case node.DOCUMENT_TYPE_NODE: + case COMMENT_NODE: + case DOCUMENT_TYPE_NODE: toBeRemoved = true; break; case node.TEXT_NODE: { @@ -179,7 +192,7 @@ export const pruneHtmlNode = ( } break; } - case node.ELEMENT_NODE: + case ELEMENT_NODE: toBeRemoved = pruneElement(node as HTMLElement); } @@ -215,7 +228,7 @@ export const escapeHtmlString = (string: string): string => { let index = 0; let lastIndex = 0; - for (let index = match.index; index < str.length; index++) { + for (index = match.index; index < str.length; index++) { switch (str.charCodeAt(index)) { case 34: // " escape = """; diff --git a/src/HTMLNormalizer/strategies/common.ts b/src/HTMLNormalizer/strategies/common.ts index 55c701e..d07e8dc 100644 --- a/src/HTMLNormalizer/strategies/common.ts +++ b/src/HTMLNormalizer/strategies/common.ts @@ -51,7 +51,7 @@ export const cloneAnchorFromPane = ( pane: HTMLElement ): void => { try { - const url = new URL(a.href); + const url = new URL(a.getAttribute("href")); // If this is external url if (url.host && url.protocol) { pane.parentNode.insertBefore(a.cloneNode(false), pane); diff --git a/src/HTMLNormalizer/strategies/gmail.ts b/src/HTMLNormalizer/strategies/gmail.ts index ee4aa87..fa82935 100644 --- a/src/HTMLNormalizer/strategies/gmail.ts +++ b/src/HTMLNormalizer/strategies/gmail.ts @@ -20,8 +20,8 @@ export const amendGmailNodes = (document: HTMLDocument): void => { ); attachmentsPanes.forEach((pane) => { - const as = pane.querySelectorAll("a"); - as.forEach((a) => { + const as = pane.getElementsByTagName("a"); + Array.from(as).forEach((a) => { cloneAnchorFromPane(a, pane as HTMLElement); }); }); diff --git a/src/constants.ts b/src/constants.ts index a639830..4e81cd9 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -1,4 +1,6 @@ export const ELEMENT_NODE = 1; +export const COMMENT_NODE = 8; +export const DOCUMENT_TYPE_NODE = 10; export const TEXT_NODE = 3; export const DOCUMENT_NODE = 9; diff --git a/yarn.lock b/yarn.lock index 24d4a89..ee55475 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1168,6 +1168,11 @@ resolved "https://registry.yarnpkg.com/@types/parse-json/-/parse-json-4.0.0.tgz#2f8bb441434d163b35fb8ffdccd7138927ffb8c0" integrity sha512-//oorEZjL6sbPcKUaCdIGlIUeH26mgzimjBB77G6XRgnDl/L5wOnpyBGRe/Mmf5CVW3PwEBE1NjiMZ/ssFh4wA== +"@types/parse5@^5.0.3": + version "5.0.3" + resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-5.0.3.tgz#e7b5aebbac150f8b5fdd4a46e7f0bd8e65e19109" + integrity sha512-kUNnecmtkunAoQ3CnjmMkzNU/gtxG8guhi+Fk2U/kOpIKjIMKnXGp4IJCgQJrXSgMsWYimYG4TGjz/UzbGEBTw== + "@types/prettier@^2.0.0": version "2.1.5" resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.1.5.tgz#b6ab3bba29e16b821d84e09ecfaded462b816b00" @@ -1250,6 +1255,13 @@ dependencies: eslint-visitor-keys "^1.1.0" +"@vereign/dom@git+ssh://git@code.vereign.com:code/js-toolbox/gsdom.git#rework-with-parse-5": + version "1.0.0" + resolved "git+ssh://git@code.vereign.com:code/js-toolbox/gsdom.git#b14f5560fc58bd3c1f0095b67166b1771442a1cb" + dependencies: + "@types/parse5" "^5.0.3" + parse5 "^6.0.1" + abab@^2.0.3: version "2.0.5" resolved "https://registry.yarnpkg.com/abab/-/abab-2.0.5.tgz#c0b678fb32d60fc1219c784d6a826fe385aeb79a" @@ -3857,6 +3869,11 @@ parse5@5.1.1: resolved "https://registry.yarnpkg.com/parse5/-/parse5-5.1.1.tgz#f68e4e5ba1852ac2cadc00f4555fff6c2abb6178" integrity sha512-ugq4DFI0Ptb+WWjAdOK16+u/nHfiIrcE+sh8kZMaM0WllQKLI9rOUq6c2b7cwPkXdzfQESqvoqK6ug7U/Yyzug== +parse5@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/parse5/-/parse5-6.0.1.tgz#e1a1c085c569b3dc08321184f19a39cc27f7c30b" + integrity sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw== + pascalcase@^0.1.1: version "0.1.1" resolved "https://registry.yarnpkg.com/pascalcase/-/pascalcase-0.1.1.tgz#b363e55e8006ca6fe21784d2db22bd15d7917f14" -- GitLab