Skip to content
Snippets Groups Projects
Commit 52421c70 authored by Igor Markin's avatar Igor Markin
Browse files

Implement initial version of HTML normalizer

parent 428a68e0
No related branches found
No related tags found
1 merge request!1Initial
Pipeline #35301 canceled
This commit is part of merge request !1. Comments created here will be created in the context of that merge request.
# MIME Verifier # MIME Normalizer
{ {
"name": "@vereign/mime-verifier", "name": "@vereign/mime-normalizer",
"author": "Igor Markin <igor.markin@vereign.com>", "author": "Igor Markin <igor.markin@vereign.com>",
"description": "A library that handles verification of MIME messages", "description": "A library that handles normalization of MIME plain and html parts",
"version": "1.0.0", "version": "1.0.0",
"license": "MIT", "license": "MIT",
"main": "./dist/index.js", "main": "./dist/index.js",
......
import { DOCUMENT_NODE, ELEMENT_NODE, TEXT_NODE } from "../constants";
import {
amendOutlookNodes,
cleanupOutlookElementAttributes,
printOutlookElement,
pruneOutlookElement
} from "./strategies/outlook";
import {EMAIL_VENDORS} from "../constants";
import {removeSpacesAndLinebreaks} from "../utils";
import {amendGmailNodes, cleanupGMailElementAttributes, pruneGmailElement} from "./strategies/gmail";
/**
 * Vendor-specific hooks, keyed by EMAIL_VENDORS identifiers.
 * normalizeVendorHtml looks each table up with its `vendor` argument.
 */

// Called with the whole document before pruning; skipped when the vendor has no entry.
const nodesAmendingFunctions = {
[EMAIL_VENDORS.GMAIL]: amendGmailNodes,
[EMAIL_VENDORS.OUTLOOK]: amendOutlookNodes,
};
// Per-element removal predicates; normalizeVendorHtml throws for a vendor missing here.
const nodesPruningFunctions = {
[EMAIL_VENDORS.GMAIL]: pruneGmailElement,
[EMAIL_VENDORS.OUTLOOK]: pruneOutlookElement,
};
// Per-element attribute cleanup; skipped when the vendor has no entry.
const attributesCleanupFunctions = {
[EMAIL_VENDORS.GMAIL]: cleanupGMailElementAttributes,
[EMAIL_VENDORS.OUTLOOK]: cleanupOutlookElementAttributes,
};
// Optional custom node printers; only Outlook registers one, other vendors use the default printout.
const vendorPrintingFunctions = {
[EMAIL_VENDORS.OUTLOOK]: printOutlookElement,
};
/**
 * Normalizes a vendor-produced HTML document and returns its printout.
 *
 * Steps, in order: optional vendor-specific amendment of the document,
 * mandatory pruning of unwanted nodes, optional attribute cleanup, and finally
 * printing of the children of the document body.
 *
 * @param document parsed HTML document; it is mutated in place
 * @param vendor key used to look up the vendor-specific hooks
 * @returns the printout of the children of `document.body`
 * @throws Error when no pruning function is registered for `vendor`
 */
export const normalizeVendorHtml = (document: HTMLDocument, vendor: string): string => {
  // The body is captured before any mutation; it is the root of the printout.
  const body = document.body;

  // Step 1: vendor-specific restructuring, when the vendor provides one.
  const amend = nodesAmendingFunctions[vendor];
  if (amend) {
    amend(document);
  }

  // Step 2: pruning is mandatory, so an unknown vendor is rejected here.
  const shouldPrune = nodesPruningFunctions[vendor];
  if (!shouldPrune) {
    throw new Error(
      `Vendor "${vendor}" is not supported. Please, develop a pruning function for it.`
    );
  }
  pruneHtmlNode(document, shouldPrune);

  // Step 3: strip unnecessary attributes, when the vendor provides a cleaner.
  const cleanupAttributes = attributesCleanupFunctions[vendor];
  if (cleanupAttributes) {
    cleanupHtmlNodeAttributes(document, cleanupAttributes);
  }

  // Step 4: print; the vendor printer may be undefined, which selects the default printout.
  return printHtmlChildren(body, vendorPrintingFunctions[vendor], 0);
};
/**
 * Prints every child of `node`, in document order, and concatenates the results.
 *
 * @param node parent whose children are printed
 * @param printFunction optional custom printer forwarded to printHtmlNode
 * @param depth nesting level forwarded unchanged to printHtmlNode
 * @returns the concatenated printouts, or "" when `node` has no children
 */
export const printHtmlChildren = (
  node: Node,
  printFunction: (node: Node) => string,
  depth: number
): string => {
  const first = node.firstChild;
  if (!first) {
    return "";
  }

  // A lone text child is printed directly.
  const isOnlyTextChild = first === node.lastChild && first.nodeType === TEXT_NODE;
  if (isOnlyTextChild) {
    return printHtmlNode(first, printFunction, depth);
  }

  let printed = "";
  for (let current = node.firstChild; current; current = current.nextSibling) {
    printed = printed.concat(printHtmlNode(current, printFunction, depth));
  }
  return printed;
};
/**
 * Prints a single node.
 *
 * - If `printFunction` is given and returns a truthy printout, that printout wins.
 * - Text nodes print as `<TEXT>…</TEXT>` plus a newline, with all whitespace
 *   removed; text that becomes empty prints nothing.
 * - Document nodes print their children.
 * - Elements print their node name and attributes sorted by name (values
 *   escaped, empty values omitted), then either their children and a closing
 *   tag, or a self-closing `/>`.
 * - Any other node type prints nothing.
 *
 * @param node node to print
 * @param printFunction optional custom printer consulted first
 * @param depth nesting level; incremented for element children
 * @returns the printout of the node
 */
export const printHtmlNode = (
  node: Node,
  printFunction: (node: Node) => string,
  depth: number
): string => {
  const custom = printFunction ? printFunction(node) : undefined;
  if (custom) {
    return custom;
  }

  if (node.nodeType === TEXT_NODE) {
    const text = removeSpacesAndLinebreaks(node.textContent);
    return text.length ? "<TEXT>" + text + "</TEXT>" + "\n" : "";
  }

  if (node.nodeType === DOCUMENT_NODE) {
    return printHtmlChildren(node, printFunction, depth);
  }

  if (node.nodeType !== ELEMENT_NODE) {
    return "";
  }

  // Attributes are sorted so the printout does not depend on source order.
  const attributes = Array.from((node as HTMLElement).attributes)
    .sort((a, b) => a.name.localeCompare(b.name))
    .map((attribute) =>
      attribute.value
        ? ` ${attribute.name}="${escapeHtmlString(attribute.value)}"`
        : ` ${attribute.name}`
    )
    .join("");
  const openTag = "<" + node.nodeName + attributes;

  if (!node.firstChild) {
    return openTag + "/>" + "\n";
  }
  return (
    openTag +
    ">" +
    "\n" +
    printHtmlChildren(node, printFunction, depth + 1) +
    "</" +
    node.nodeName +
    ">" +
    "\n"
  );
};
/**
 * Walks the subtree rooted at `node` depth-first (parent before children) and
 * applies `cleanupElementAttributes` to every element node encountered.
 *
 * @param node root of the subtree to visit
 * @param cleanupElementAttributes callback invoked once per element node
 */
export const cleanupHtmlNodeAttributes = (
  node: Node,
  cleanupElementAttributes: (element: HTMLElement) => void
): void => {
  const isElement = node.nodeType === node.ELEMENT_NODE;
  if (isElement) {
    cleanupElementAttributes(node as HTMLElement);
  }

  for (let current = node.firstChild; current; current = current.nextSibling) {
    cleanupHtmlNodeAttributes(current, cleanupElementAttributes);
  }
};
/**
 * Decides whether `node` must be removed and, if it stays, prunes its subtree.
 *
 * - Comments and doctype nodes are always reported for removal.
 * - Text nodes are reported for removal when blank after trimming; otherwise
 *   their content is replaced with the trimmed text.
 * - Elements are reported for removal when `pruneElement` says so.
 * - A node that stays has its children pruned recursively; children reported
 *   for removal are detached only after all siblings have been visited.
 *
 * @param node node to examine; its subtree is mutated in place
 * @param pruneElement predicate deciding whether an element is removed
 * @returns true when the caller should remove `node`, false otherwise
 */
export const pruneHtmlNode = (
  node: Node,
  pruneElement: (element: HTMLElement) => boolean
): boolean => {
  const kind = node.nodeType;

  if (kind === node.COMMENT_NODE || kind === node.DOCUMENT_TYPE_NODE) {
    return true;
  }

  if (kind === node.TEXT_NODE) {
    const trimmed = node.textContent.trim();
    if (trimmed === "") {
      return true;
    }
    node.textContent = trimmed;
  } else if (kind === node.ELEMENT_NODE && pruneElement(node as HTMLElement)) {
    return true;
  }

  // Collect first, detach afterwards, so sibling traversal is not disturbed.
  const doomed: Node[] = [];
  for (let current = node.firstChild; current; current = current.nextSibling) {
    if (pruneHtmlNode(current, pruneElement)) {
      doomed.push(current);
    }
  }
  for (const child of doomed) {
    node.removeChild(child);
  }
  return false;
};
// Replacement entity for each character that escapeHtmlString escapes.
const HTML_ESCAPE_ENTITIES: Record<string, string> = {
  '"': "&quot;",
  "&": "&amp;",
  "'": "&#39;",
  "<": "&lt;",
  ">": "&gt;",
};

/**
 * Escapes the characters `"`, `'`, `&`, `<` and `>` so the value can be
 * embedded in a quoted attribute of the printout.
 *
 * The previous hand-rolled loop declared its counter with `let` inside the
 * `for` header, shadowing the outer `index` that the final `substring` call
 * relied on. As a result the text after the last escaped character was lost
 * and a prefix of the input was appended instead. A single global replace
 * avoids that bookkeeping altogether.
 *
 * @param string value to escape; coerced to a string first
 * @returns the escaped string (the input unchanged when nothing needs escaping)
 */
export const escapeHtmlString = (string: string): string => {
  const str = "" + string;
  return str.replace(/["'&<>]/g, (char) => HTML_ESCAPE_ENTITIES[char]);
};
import {normalizeVendorHtml} from "./HTMLNormalizer";
export default normalizeVendorHtml;
// Id of the dummy QR code element, which pruneElement always removes.
const DUMMY_QR_CODE_ID = "dummyQrCode";

// Lower-case tag names of elements that pruneElement removes.
export const ELEMENT_TYPES_TO_REMOVE = { br: true, hr: true };

// Attribute names preserved by the vendor attribute-cleanup functions.
export const ATTRIBUTES_TO_KEEP = {
  alt: true,
  src: true,
  cite: true,
  data: true,
  datetime: true,
  href: true,
  value: true,
};

/**
 * Tells whether the element is the dummy QR code, identified by its id.
 * It only tests; the removal itself is performed by the caller.
 *
 * @param element element to inspect
 * @returns true when the element's id equals DUMMY_QR_CODE_ID, false otherwise
 */
const isDummyQrCode = (element: HTMLElement): boolean =>
  element.id === DUMMY_QR_CODE_ID;

/**
 * Decides whether an element should be removed from the document.
 *
 * @param element element to inspect
 * @returns true for the dummy QR code and for elements whose lower-cased
 *          node name is listed in ELEMENT_TYPES_TO_REMOVE
 */
export const pruneElement = (element: HTMLElement): boolean => {
  if (isDummyQrCode(element)) {
    return true;
  }
  const tagName = element.nodeName.toLowerCase();
  // Own-property check: a tag called e.g. "constructor" must not match
  // members inherited from Object.prototype.
  return (
    Object.prototype.hasOwnProperty.call(ELEMENT_TYPES_TO_REMOVE, tagName) &&
    Boolean(
      ELEMENT_TYPES_TO_REMOVE[tagName as keyof typeof ELEMENT_TYPES_TO_REMOVE]
    )
  );
};
/**
 * Inserts a shallow copy of the anchor (no children) right before `pane`,
 * but only when the anchor's href parses as a URL with a non-empty host and
 * protocol. Any exception raised along the way — for instance an href that is
 * not a valid URL — is swallowed and nothing is inserted.
 *
 * @param a anchor to copy
 * @param pane element the copy is inserted before
 */
export const cloneAnchorFromPane = (a: HTMLAnchorElement, pane: HTMLElement): void => {
  try {
    const { host, protocol } = new URL(a.href);
    const isExternal = Boolean(host && protocol);
    if (isExternal) {
      pane.parentNode.insertBefore(a.cloneNode(false), pane);
    }
  } catch {
    // Deliberate best effort: an anchor that cannot be processed is skipped.
  }
};
import {ATTRIBUTES_TO_KEEP, cloneAnchorFromPane, pruneElement} from "./common";
/**
 * Gmail pruning predicate; Gmail has no rules of its own, so this defers to
 * the shared pruneElement.
 *
 * @param element element to inspect
 * @returns true when the element should be removed
 */
export const pruneGmailElement = (element: HTMLElement): boolean =>
  pruneElement(element);
/**
 * Replaces Gmail attachment panes (elements with class "gmail_chip") by the
 * links they contain: each anchor inside a pane is passed to
 * cloneAnchorFromPane, then every pane is removed from the document.
 *
 * @param document document to amend in place
 */
export const amendGmailNodes = (document: HTMLDocument): void => {
  // Snapshot the live collection, since panes are removed below.
  const chips = Array.from(document.getElementsByClassName("gmail_chip"));

  // First pass: keep everything but links out of the panes.
  for (const chip of chips) {
    chip.querySelectorAll("a").forEach((anchor) => {
      cloneAnchorFromPane(anchor, chip as HTMLElement);
    });
  }

  // Second pass: drop the panes themselves.
  for (const chip of chips) {
    chip.parentNode.removeChild(chip);
  }
};
/**
 * Cleans up the attributes of an element coming from Gmail:
 * a `data-surl` attribute is copied into `src`, then every attribute whose
 * name is not listed in ATTRIBUTES_TO_KEEP is removed.
 *
 * Both passes iterate over a snapshot of `element.attributes`: the collection
 * is live, and mutating it while iterating forces index fix-ups. The keep-list
 * lookup checks own properties only, so attribute names such as `constructor`
 * or `__proto__` no longer match members inherited from Object.prototype.
 *
 * @param element element whose attributes are cleaned in place
 */
export const cleanupGMailElementAttributes = (element: HTMLElement): void => {
  for (const attribute of Array.from(element.attributes)) {
    if (attribute.name === "data-surl") {
      element.setAttribute("src", attribute.value);
    }
  }

  const keepList = ATTRIBUTES_TO_KEEP as Record<string, boolean>;
  for (const { name } of Array.from(element.attributes)) {
    const keep =
      Object.prototype.hasOwnProperty.call(keepList, name) && Boolean(keepList[name]);
    if (!keep) {
      element.removeAttribute(name);
    }
  }
};
// TODO: Move this logic to amendOutlookNodes
import {printHtmlChildren} from "../HTMLNormalizer";
import {ELEMENT_NODE} from "../../constants";
import {ATTRIBUTES_TO_KEEP, cloneAnchorFromPane, pruneElement} from "./common";
/**
 * Custom printer for Outlook. For an element carrying the class
 * "WordSection1" it prints only the element's children (default printing,
 * depth 0), leaving the wrapper itself out of the printout. For every other
 * node it returns nothing, which printHtmlNode treats as "no custom printout".
 *
 * @param node node about to be printed
 * @returns the children printout for a WordSection1 element, otherwise undefined
 */
export const printOutlookElement = (node: Node): string => {
  const isWordSection =
    node.nodeType === ELEMENT_NODE &&
    (node as HTMLElement).classList.contains("WordSection1");
  if (isWordSection) {
    return printHtmlChildren(node, null, 0);
  }
};
/**
 * Outlook pruning predicate.
 *
 * @param element element to inspect
 * @returns true when the shared pruneElement rule matches, or when the element
 *          is one of Outlook's generic `<o:*>` tags
 */
export const pruneOutlookElement = (element: HTMLElement): boolean =>
  pruneElement(element) || element.nodeName.toLowerCase().startsWith("o:");
/**
 * Restructures an Outlook document before pruning.
 *
 * 1. Attachment panes: every anchor inside the element with id
 *    "OwaReferenceAttachments" is passed to cloneAnchorFromPane, then that
 *    container and the "OwaReferenceAttachmentsEnd" marker are removed.
 * 2. Spans: every `<span>` is unwound, i.e. replaced by its own children,
 *    because Outlook sometimes wraps content into spans after sending.
 *
 * Children are moved out of each span rather than deep-cloned: the resulting
 * tree is the same, but nested content is no longer copied once per nesting
 * level.
 *
 * @param document document to amend in place
 */
export const amendOutlookNodes = (document: HTMLDocument): void => {
  const attachmentsContainer = document.getElementById("OwaReferenceAttachments");
  const attachmentsContainerEnd = document.getElementById(
    "OwaReferenceAttachmentsEnd"
  );

  if (attachmentsContainer) {
    Array.from(attachmentsContainer.getElementsByTagName("a")).forEach((anchor) => {
      cloneAnchorFromPane(anchor, attachmentsContainer);
    });
    attachmentsContainer.parentNode.removeChild(attachmentsContainer);
  }
  if (attachmentsContainerEnd) {
    attachmentsContainerEnd.parentNode.removeChild(attachmentsContainerEnd);
  }

  // Number of ancestors between a node and the top of the tree.
  const depthOf = (node: Node): number => {
    let depth = 0;
    for (let ancestor = node.parentNode; ancestor; ancestor = ancestor.parentNode) {
      depth++;
    }
    return depth;
  };

  // Depths are measured up front, before any span is unwound. Deepest spans
  // go first so that a span never contains another span when it is unwound.
  const spansDeepestFirst = Array.from(document.getElementsByTagName("span"))
    .map((span) => ({ span, depth: depthOf(span) }))
    .sort((a, b) => b.depth - a.depth);

  for (const { span } of spansDeepestFirst) {
    const parent = span.parentNode;
    // insertBefore moves an already-attached node, so this drains the span.
    while (span.firstChild) {
      parent.insertBefore(span.firstChild, span);
    }
    parent.removeChild(span);
  }
};
/**
 * Cleans up the attributes of an element coming from Outlook:
 * in every attribute value, each space-separated token starting with "x_"
 * loses that prefix; then every attribute whose name is not listed in
 * ATTRIBUTES_TO_KEEP is removed.
 *
 * Both passes iterate over a snapshot of `element.attributes`: the collection
 * is live, and mutating it while iterating forces index fix-ups. The keep-list
 * lookup checks own properties only, so attribute names such as `constructor`
 * or `__proto__` no longer match members inherited from Object.prototype.
 *
 * @param element element whose attributes are cleaned in place
 */
export const cleanupOutlookElementAttributes = (element: HTMLElement): void => {
  const prefix = "x_";

  for (const attribute of Array.from(element.attributes)) {
    const cleanedValue = attribute.value
      .split(" ")
      .map((token) => (token.startsWith(prefix) ? token.slice(prefix.length) : token))
      .join(" ");
    element.setAttribute(attribute.name, cleanedValue);
  }

  const keepList = ATTRIBUTES_TO_KEEP as Record<string, boolean>;
  for (const { name } of Array.from(element.attributes)) {
    const keep =
      Object.prototype.hasOwnProperty.call(keepList, name) && Boolean(keepList[name]);
    if (!keep) {
      element.removeAttribute(name);
    }
  }
};
// Numeric node-type codes compared against Node.nodeType by the printing code.
export const ELEMENT_NODE = 1;
export const TEXT_NODE = 3;
export const DOCUMENT_NODE = 9;
// Identifiers of email vendors; used as keys of the vendor-specific hook tables.
export const EMAIL_VENDORS = {
GMAIL: "GMAIL",
OUTLOOK: "OUTLOOK",
ROUNDCUBE: "ROUNDCUBE",
GENERIC_MIME: "GENERIC_MIME",
};
export const add = (a: number, b: number) => {
  return a + b;
}
export { default as HTMLNormalizer } from "./HTMLNormalizer";
src/utils.ts 0 → 100644
/**
 * Strips line breaks, every whitespace character and zero-width spaces
 * (U+200B) from a string.
 *
 * @param s text to compact
 * @returns `s` without any of those characters
 */
export const removeSpacesAndLinebreaks = (s: string): string =>
  s.replace(/[\r\n\v]+/g, "").replace(/\s+|\u200B/g, "");
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment