Skip to content
Snippets Groups Projects
Commit 97b04859 authored by Igor Markin
Browse files

Add build

parent 41a5a9cf
No related branches found
No related tags found
No related merge requests found
export declare const normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string;
export declare const extractPseudoPlainPart: (document: HTMLDocument) => string;
export declare const printHtmlChildren: (node: Node, printFunction: (node: Node) => string, depth: number) => string;
export declare const printHtmlNode: (node: Node, printFunction: (node: Node) => string, depth: number) => string;
export declare const cleanupHtmlNodeAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void;
export declare const pruneHtmlNode: (node: Node, pruneElement: (element: HTMLElement) => boolean) => boolean;
export declare const escapeHtmlString: (string: string) => string;
export declare const pruneHtmlNode: (node: Node) => boolean;
export declare const normalizeBodyAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void;
export declare const printHtmlChildren: (node: Node, depth: number) => string;
export declare const printHtmlNode: (node: Node, depth: number) => string;
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.escapeHtmlString = exports.pruneHtmlNode = exports.cleanupHtmlNodeAttributes = exports.printHtmlNode = exports.printHtmlChildren = exports.extractPseudoPlainPart = exports.normalizeVendorHtml = void 0;
exports.printHtmlNode = exports.printHtmlChildren = exports.normalizeBodyAttributes = exports.pruneHtmlNode = exports.extractPseudoPlainPart = exports.normalizeVendorHtml = void 0;
const constants_1 = require("../constants");
const outlook_1 = require("./strategies/outlook");
const constants_2 = require("../constants");
......@@ -8,100 +8,103 @@ const utils_1 = require("../utils");
const gmail_1 = require("./strategies/gmail");
const index_1 = require("../index");
const common_1 = require("./strategies/common");
const nodesAmendingFunctions = {
[constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.amendGmailNodes,
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.amendOutlookNodes,
};
const nodesPruningFunctions = {
[constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.pruneGmailElement,
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.pruneOutlookElement,
};
const attributesCleanupFunctions = {
[constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.cleanupGMailElementAttributes,
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.cleanupOutlookElementAttributes,
};
const vendorPrintingFunctions = {
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.printOutlookElement,
const documentNormalizationFunctions = {
[constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.normalizeGmailDocument,
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.normalizeOutlookDocument,
};
const normalizeVendorHtml = (document, vendor) => {
const mimeBody = document.body;
common_1.amendNodes(document);
const amendNodesFunction = nodesAmendingFunctions[vendor];
if (amendNodesFunction) {
amendNodesFunction(document);
}
/**
* Remove unnecessary nodes
*/
const elementPruningFunction = nodesPruningFunctions[vendor];
if (!elementPruningFunction) {
throw new Error(`Vendor "${vendor}" is not supported. Please, develop a pruning function for it.`);
}
exports.pruneHtmlNode(document, elementPruningFunction);
exports.pruneHtmlNode(document.body);
/**
* Cleanup unnecessary attributes of nodes
* Apply document normalisations
*/
const elementAttributesCleanupFunction = attributesCleanupFunctions[vendor];
if (elementAttributesCleanupFunction) {
exports.cleanupHtmlNodeAttributes(document, elementAttributesCleanupFunction);
common_1.normalizeDocumentCommon(document.body);
const normalizeDocument = documentNormalizationFunctions[vendor];
if (normalizeDocument) {
normalizeDocument(document);
}
/**
* Print nodes
* Final printout
*/
const vendorPrintFunction = vendorPrintingFunctions[vendor];
return exports.printHtmlChildren(mimeBody, vendorPrintFunction, 0);
return exports.printHtmlChildren(document.body, 0);
};
exports.normalizeVendorHtml = normalizeVendorHtml;
const extractPseudoPlainPart = (document
/*vendor: string*/
) => {
const textContent = index_1.PlainNormalizer.normalizePlain(document.body.textContent);
// const anchors = document.getElementsByTagName("a");
// const images = document.getElementsByTagName("img");
// let meaningfulAttributes = [];
//
// Array.from(anchors).forEach((a) => {
// meaningfulAttributes.push(a.getAttribute("href"));
// });
// Array.from(images).forEach((img) => {
// meaningfulAttributes.push(img.getAttribute("src"));
// meaningfulAttributes.push(img.getAttribute("alt"));
// });
//
// meaningfulAttributes = meaningfulAttributes.filter((attr) => !!attr).sort();
// console.log(meaningfulAttributes);
return textContent;
/**
 * Builds a pseudo plain-text part from an HTML document.
 *
 * The body's text content is passed through `PlainNormalizer.normalizePlain`, and the
 * non-empty `href` values of all <a> elements plus the non-empty `src` values of all
 * <img> elements are sorted, joined with "," and appended directly to that text.
 *
 * @param document HTML document to read from.
 * @returns The normalized body text followed by the sorted, comma-joined URLs.
 */
const extractPseudoPlainPart = (document) => {
    const bodyText = index_1.PlainNormalizer.normalizePlain(document.body.textContent);
    const linkElements = document.getElementsByTagName("a");
    const imageElements = document.getElementsByTagName("img");
    const hrefs = Array.from(linkElements).map((a) => a.getAttribute("href"));
    const sources = Array.from(imageElements).map((img) => img.getAttribute("src"));
    // Drop missing/empty attribute values, then sort so document order does not matter.
    const sortedUrls = [...hrefs, ...sources].filter((url) => !!url).sort();
    return bodyText + sortedUrls.join(",");
};
exports.extractPseudoPlainPart = extractPseudoPlainPart;
const printHtmlChildren = (node, printFunction, depth) => {
/**
 * Recursively prunes a DOM subtree in place.
 *
 * Comment and doctype nodes are always reported for removal. Element nodes are
 * reported when `common_1.pruneElement` returns a truthy value. Every other node is
 * kept, and its children are pruned recursively.
 *
 * @param node Root of the subtree to inspect.
 * @returns `true` when the caller should remove `node` itself, `false` otherwise.
 */
const pruneHtmlNode = (node) => {
    const type = node.nodeType;
    if (type === constants_1.COMMENT_NODE || type === constants_1.DOCUMENT_TYPE_NODE) {
        return true;
    }
    if (type === constants_1.ELEMENT_NODE && common_1.pruneElement(node)) {
        return true;
    }
    // Collect first and detach afterwards so the sibling walk is not disturbed.
    const doomed = [];
    for (let current = node.firstChild; current; current = current.nextSibling) {
        if (exports.pruneHtmlNode(current)) {
            doomed.push(current);
        }
    }
    for (const orphan of doomed) {
        node.removeChild(orphan);
    }
    return false;
};
exports.pruneHtmlNode = pruneHtmlNode;
/**
 * Walks a DOM subtree depth-first and invokes a callback on every element node,
 * the root included when it is an element.
 *
 * @param node Root of the subtree to walk.
 * @param cleanupElementAttributes Callback invoked with each element node found.
 */
const normalizeBodyAttributes = (node, cleanupElementAttributes) => {
    const isElement = node.nodeType === constants_1.ELEMENT_NODE;
    if (isElement) {
        cleanupElementAttributes(node);
    }
    for (let current = node.firstChild; current; current = current.nextSibling) {
        exports.normalizeBodyAttributes(current, cleanupElementAttributes);
    }
};
exports.normalizeBodyAttributes = normalizeBodyAttributes;
const printHtmlChildren = (node, depth) => {
let child = node.firstChild;
if (!child) {
return "";
}
if (child == node.lastChild && child.nodeType == constants_1.TEXT_NODE) {
return exports.printHtmlNode(child, printFunction, depth);
return exports.printHtmlNode(child, depth);
}
else {
let result = "";
while (child) {
result = result.concat(exports.printHtmlNode(child, printFunction, depth));
result = result.concat(exports.printHtmlNode(child, depth));
child = child.nextSibling;
}
return result;
}
};
exports.printHtmlChildren = printHtmlChildren;
const printHtmlNode = (node, printFunction, depth) => {
const printHtmlNode = (node, depth) => {
let result = "";
if (printFunction) {
const customPrintout = printFunction(node);
if (customPrintout) {
return customPrintout;
}
}
switch (node.nodeType) {
case constants_1.TEXT_NODE: {
const text = utils_1.removeSpacesAndLinebreaks(node.textContent);
const text = utils_1.normalizeTextSpacings(node.textContent).trim();
if (text.length) {
result += "<TEXT>";
result += text;
......@@ -111,24 +114,30 @@ const printHtmlNode = (node, printFunction, depth) => {
break;
}
case constants_1.DOCUMENT_NODE:
result += exports.printHtmlChildren(node, printFunction, depth);
result += exports.printHtmlChildren(node, depth);
break;
case constants_1.ELEMENT_NODE:
result += "<" + node.nodeName;
Array.from(node.attributes)
.filter((a) => common_1.ATTRIBUTES_TO_KEEP[a.name])
.sort((a, b) => a.name.localeCompare(b.name))
.forEach((attribute) => {
result += ` ${attribute.name}`;
if (attribute.value) {
result += `="${exports.escapeHtmlString(attribute.value)}"`;
result += `="${attribute.value}"`;
}
});
if (node.firstChild) {
result += ">";
result += "\n";
const printout = exports.printHtmlChildren(node, printFunction, depth + 1);
result += printout;
result += "</" + node.nodeName + ">";
const printout = exports.printHtmlChildren(node, depth + 1);
if (printout.trim().length === 0) {
result += "/>";
}
else {
result += ">";
result += "\n";
result += printout;
result += "</" + node.nodeName + ">";
}
}
else {
result += "/>";
......@@ -139,87 +148,3 @@ const printHtmlNode = (node, printFunction, depth) => {
return result;
};
exports.printHtmlNode = printHtmlNode;
/**
 * Applies an attribute-cleanup callback to every element node of a DOM subtree,
 * visiting the root first and then each child subtree in sibling order.
 *
 * @param node Root of the subtree to process.
 * @param cleanupElementAttributes Callback invoked with each element node.
 */
const cleanupHtmlNodeAttributes = (node, cleanupElementAttributes) => {
    if (node.nodeType === constants_1.ELEMENT_NODE) {
        cleanupElementAttributes(node);
    }
    let cursor = node.firstChild;
    for (; cursor; cursor = cursor.nextSibling) {
        exports.cleanupHtmlNodeAttributes(cursor, cleanupElementAttributes);
    }
};
exports.cleanupHtmlNodeAttributes = cleanupHtmlNodeAttributes;
/**
 * Recursively prunes a DOM subtree in place using a caller-supplied element predicate.
 *
 * - Comment and doctype nodes are always reported for removal.
 * - Text nodes are reported when their trimmed text is empty; otherwise their
 *   `textContent` is overwritten with the trimmed text.
 * - Element nodes are reported when `pruneElement` returns a truthy value.
 *
 * Children of a kept node are pruned recursively and detached from it.
 *
 * @param node Root of the subtree to inspect.
 * @param pruneElement Predicate deciding whether an element node must be removed.
 * @returns `true` when the caller should remove `node` itself, `false` otherwise.
 */
const pruneHtmlNode = (node, pruneElement) => {
    const type = node.nodeType;
    if (type === constants_1.COMMENT_NODE || type === constants_1.DOCUMENT_TYPE_NODE) {
        return true;
    }
    if (type === constants_1.TEXT_NODE) {
        const stripped = node.textContent.trim();
        if (stripped === "") {
            return true;
        }
        node.textContent = stripped;
    }
    else if (type === constants_1.ELEMENT_NODE) {
        if (pruneElement(node)) {
            return true;
        }
    }
    // Collect first and detach afterwards so the sibling walk is not disturbed.
    const doomed = [];
    for (let current = node.firstChild; current; current = current.nextSibling) {
        if (exports.pruneHtmlNode(current, pruneElement)) {
            doomed.push(current);
        }
    }
    for (const orphan of doomed) {
        node.removeChild(orphan);
    }
    return false;
};
exports.pruneHtmlNode = pruneHtmlNode;
/**
 * Escapes the HTML-significant characters `"`, `&`, `'`, `<` and `>` in a value.
 *
 * The input is coerced to a string with `"" + string`. When none of the five
 * characters is present the coerced string is returned unchanged.
 *
 * @param string Value to escape.
 * @returns The string with `&quot;`, `&amp;`, `&#39;`, `&lt;` and `&gt;` substituted.
 */
const escapeHtmlString = (string) => {
    const text = "" + string;
    // Fast path: nothing to escape.
    if (!/["'&<>]/.test(text)) {
        return text;
    }
    const entityFor = (ch) => {
        switch (ch) {
            case '"':
                return "&quot;";
            case "&":
                return "&amp;";
            case "'":
                return "&#39;";
            case "<":
                return "&lt;";
            case ">":
                return "&gt;";
            default:
                return ch;
        }
    };
    let escaped = "";
    for (const ch of text) {
        escaped += entityFor(ch);
    }
    return escaped;
};
exports.escapeHtmlString = escapeHtmlString;
export declare const unwindTags: (node: Element | Document, tagName: string) => void;
export declare const unwindTags: (nodes: Array<Node>) => void;
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.unwindTags = void 0;
const unwindTags = (node, tagName) => {
const tags = node.getElementsByTagName(tagName);
//Sort tags by depth to start unwinding the deepest ones, which does not contain nested spans
const unwindTags = (nodes) => {
//Sort nodes by depth to start unwinding the deepest ones
const tagsDepths = {};
Array.from(tags).forEach((span) => {
Array.from(nodes).forEach((span) => {
let descendant = span;
let parent = descendant.parentNode;
let depth = 0;
......
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanupOutlookElementAttributes = exports.amendOutlookNodes = exports.pruneOutlookElement = exports.printOutlookElement = void 0;
// TODO: Move this logic to amendOutlookNodes
const HTMLNormalizer_1 = require("../HTMLNormalizer");
exports.normalizeOutlookDocument = void 0;
const constants_1 = require("../../constants");
const common_1 = require("./common");
/**
 * Custom printer for Outlook markup: an element carrying the `WordSection1` class is
 * printed as its children only (via `printHtmlChildren` with no print function and
 * depth 0).
 *
 * @param node Node being printed.
 * @returns The printout of the children for a `WordSection1` element, otherwise `undefined`.
 */
const printOutlookElement = (node) => {
    const isWordSection = node.nodeType === constants_1.ELEMENT_NODE && node.classList.contains("WordSection1");
    return isWordSection ? HTMLNormalizer_1.printHtmlChildren(node, null, 0) : undefined;
};
exports.printOutlookElement = printOutlookElement;
/**
 * Decides whether an Outlook element must be removed entirely.
 *
 * An element is removed when the shared `common_1.pruneElement` check says so, or
 * when its node name (compared case-insensitively) starts with "o:".
 *
 * @param element Element to check.
 * @returns `true` if the element should be completely removed, `false` otherwise.
 */
const pruneOutlookElement = (element) => {
    // Shared check first; only then look for Outlook generic <o:*> tags.
    return Boolean(common_1.pruneElement(element)) || element.nodeName.toLowerCase().startsWith("o:");
};
exports.pruneOutlookElement = pruneOutlookElement;
const nodesAmendingFunctions_1 = require("./nodesAmendingFunctions");
const qrCodeContainerIds = {
"test-for-us": 1,
};
......@@ -49,57 +28,44 @@ const removeQrCodeNodes = (document) => {
const elementsToRemove = remove(document.body);
elementsToRemove.forEach((element) => element.parentNode.removeChild(element));
};
const amendOutlookNodes = (document) => {
/**
* Remove quoted text
*/
// Quoted text in web apps
// const appendOnSend = document.querySelector(
// "[id*='appendonsend']"
// ) as Node;
//
// if (appendOnSend) {
// let child = appendOnSend;
// while (child) {
// const nextSibling = child.nextSibling;
// child.parentNode.removeChild(child);
// child = nextSibling as Node;
// }
// }
// Quoted text in desktop apps
// let mailOriginal = document.querySelector("[name*='_MailOriginal']") as HTMLElement;
// if (mailOriginal) {
// let removeCurrent = true;
// while (mailOriginal !== document.body) {
// while (mailOriginal.nextSibling) {
// mailOriginal.nextSibling.remove();
// }
// const currentNode = mailOriginal;
// mailOriginal = mailOriginal.parentElement;
// if (removeCurrent && currentNode.previousSibling) {
// currentNode.remove();
// removeCurrent = false;
// }
// }
// }
// if (mailOriginal) {
// const separatorCandidate = mailOriginal.parentNode as Node;
//
// // while (!(separatorCandidate.parentNode as Element).classList.contains("WordSection1")) {
// // separatorCandidate = separatorCandidate.parentNode;
// // }
//
// let child = separatorCandidate;
// while (child) {
// const nextSibling = child.nextSibling;
// child.parentNode.removeChild(child);
// child = nextSibling as Node;
// }
// }
const normalizeOutlookDocument = (document) => {
/**
* Remove QR code entries
*/
removeQrCodeNodes(document);
/**
* Unwind Word o:p paragraphs
*/
const ops = document.getElementsByTagName("o:p");
nodesAmendingFunctions_1.unwindTags(Array.from(ops));
/**
* Remove empty paragraphs
*/
const ps = document.getElementsByTagName("p");
Array.from(ps).forEach((p) => {
if (p.childNodes.length === 0) {
p.parentNode.removeChild(p);
}
if (p.childNodes.length === 1 && p.childNodes[0].nodeType === constants_1.TEXT_NODE) {
const text = p.childNodes[0].textContent;
if (!text.replace(/\u00A0/g, "").trim()) {
p.parentNode.removeChild(p);
}
}
});
/**
* Unwind all MSONormal, because outlook might wrap them into <div></div>
*/
const msoNormalWrappers = document.getElementsByClassName("MsoNormal");
const msoNormalParents = Array.from(msoNormalWrappers)
.map((node) => node.parentNode)
.filter((node, index, self) => self.indexOf(node) === index);
nodesAmendingFunctions_1.unwindTags(msoNormalParents);
/**
* Unwind WordSection1 tags
*/
const wordSectionWrappers = document.getElementsByClassName("WordSection1");
nodesAmendingFunctions_1.unwindTags(Array.from(wordSectionWrappers));
/**
* Get rid of attachments panes
*/
......@@ -120,51 +86,6 @@ const amendOutlookNodes = (document) => {
* Unwind spans, because sometimes Outlook wraps everything into span after sending
*/
const spans = document.getElementsByTagName("span");
//Sort spans by depth to start unwinding the deepest ones, which does not contain nested spans
const spansDepths = {};
Array.from(spans).forEach((span) => {
let descendant = span;
let parent = descendant.parentNode;
let depth = 0;
while (parent && descendant !== parent) {
descendant = parent;
parent = descendant.parentNode;
depth++;
}
if (!spansDepths[depth]) {
spansDepths[depth] = [];
}
spansDepths[depth].push(span);
});
Object.keys(spansDepths)
.sort((a, b) => parseInt(b) - parseInt(a))
.forEach((depth) => {
spansDepths[depth].forEach((span) => {
let child = span.firstChild;
const parent = span.parentNode;
while (child) {
parent.insertBefore(child.cloneNode(true), span);
child = child.nextSibling;
}
span.parentNode.removeChild(span);
});
});
};
exports.amendOutlookNodes = amendOutlookNodes;
const cleanupOutlookElementAttributes = (element) => {
if (element.attributes.length > 0) {
for (const attribute of element.attributes) {
let valueSplit = attribute.value.split(" ");
valueSplit = valueSplit.map((value) => value.startsWith("x_") ? value.replace("x_", "") : value);
element.setAttribute(attribute.name, valueSplit.join(" "));
}
for (let i = 0; i < element.attributes.length; i++) {
const attribute = element.attributes[i];
if (!common_1.ATTRIBUTES_TO_KEEP[attribute.name]) {
element.removeAttribute(attribute.name);
i--;
}
}
}
nodesAmendingFunctions_1.unwindTags(Array.from(spans));
};
exports.cleanupOutlookElementAttributes = cleanupOutlookElementAttributes;
exports.normalizeOutlookDocument = normalizeOutlookDocument;
export declare const normalizePlainPart: (text: string) => string;
export declare const cleanupHiddenCharacters: (s: string) => string;
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizePlainPart = void 0;
exports.cleanupHiddenCharacters = exports.normalizePlainPart = void 0;
// this is a Node module. require is a must to work across different envs
const URL = require("url-parse");
const utils_1 = require("../utils");
const normalizePlainPart = (text) => {
text = exports.cleanupHiddenCharacters(text);
text = removeListBullets(text);
text = utils_1.removeSpacesAndLinebreaks(text);
text = removeQRCodes(text);
text = utils_1.normalizeTextSpacings(text);
text = patchOutlookSafelinksWrappers(text);
return text;
return text.trim();
};
exports.normalizePlainPart = normalizePlainPart;
const patchOutlookSafelinksWrappers = (text) => {
......@@ -25,9 +26,14 @@ const patchOutlookSafelinksWrappers = (text) => {
};
const removeQRCodes = (s) => {
return s
.replace(/\[(image:)*qrcode.png]\s*<https:\/\/.+?>/g, "")
.replace(/\[(image:\s)*qrcode.png]\s*<https:\/\/.+?>/g, "")
.replace(/<https:\/\/.+?>\s*\[(image: )*qrcode.png]/g, "");
};
/**
 * Removes list bullets that stand alone on their own line.
 *
 * A bullet is a single "o" or "§" character preceded by a line feed and followed by
 * one or more line feeds; the whole match (line feeds included) is removed.
 *
 * The pattern must be a regular expression literal: a string pattern passed to
 * `String.prototype.replace` is matched literally, so the previous
 * `"\n[o§]\n+/g"` argument never matched real bullets.
 *
 * NOTE(review): the replacement is the empty string, as originally written, so the
 * lines around a bullet are joined without a separator — confirm this is intended.
 *
 * @param s Plain text to clean.
 * @returns The text with stand-alone bullet lines removed.
 */
const removeListBullets = (s) => {
    return s.replace(/\n[o§]\n+/g, "");
};
/**
 * Strips every run of zero-width space characters (U+200B) from a string.
 *
 * @param s Text to clean.
 * @returns The text without U+200B characters.
 */
const cleanupHiddenCharacters = (s) => s.replace(/[\u200B]+/g, "");
exports.cleanupHiddenCharacters = cleanupHiddenCharacters;
export declare const removeSpacesAndLinebreaks: (s: string) => string;
export declare const normalizeTextSpacings: (s: string) => string;
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.removeSpacesAndLinebreaks = void 0;
const removeSpacesAndLinebreaks = (s) => {
const removeSymbols = new RegExp(/[\r\n\v\s\u200B]+/g);
return s.replace(removeSymbols, "").trim();
exports.normalizeTextSpacings = void 0;
/**
 * Collapses every run of whitespace — line breaks, vertical tabs, regular
 * whitespace and non-breaking spaces (U+00A0) — into a single space character.
 * Leading and trailing whitespace is collapsed too, not trimmed.
 *
 * @param s Text to normalize.
 * @returns The text with each whitespace run replaced by one space.
 */
const normalizeTextSpacings = (s) => {
    const whitespaceRun = /[\r\n\v\s\u00A0]+/g;
    const singleSpace = " ";
    return s.replace(whitespaceRun, singleSpace);
};
exports.removeSpacesAndLinebreaks = removeSpacesAndLinebreaks;
exports.normalizeTextSpacings = normalizeTextSpacings;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment