Skip to content
Snippets Groups Projects
Commit 77bdb2a5 authored by Igor Markin
Browse files

Merge branch 'add-pseudoplain-tests' into 'master'

Implement pseudo plain parsing

See merge request !6
parents b26ccc5e 27cd21f6
No related branches found
No related tags found
1 merge request: !6 Implement pseudo plain parsing
Showing with 245 additions and 12 deletions
import { EMAIL_VENDORS } from "../src";
const path = require("path");
import { describe } from "@jest/globals";
import { createDescribePseudoPlainTestCases } from "./utils";
// Location of the gmail-gmail test cases, resolved relative to this test file.
const TESTS_GLOBAL_PATH = "/files/gmail-gmail";
const testsPath = path.resolve(__dirname, `.${TESTS_GLOBAL_PATH}`);

describe("[Pseudo PLAIN] GMail-GMail", () => {
  // Factory bound to the cases root and the GMAIL vendor; calling it with a
  // group folder name yields the callback passed to the nested describe().
  const runCasesGroup = createDescribePseudoPlainTestCases(
    testsPath,
    EMAIL_VENDORS.GMAIL
  );

  describe("One", runCasesGroup("one"));
});
import { describe } from "@jest/globals";
import { createDescribePseudoPlainTestCases } from "./utils";
import { EMAIL_VENDORS } from "../src";
const path = require("path");
// Location of the gmail-outlook test cases, resolved relative to this test file.
const TESTS_GLOBAL_PATH = "/files/gmail-outlook";
const testsPath = path.resolve(__dirname, `.${TESTS_GLOBAL_PATH}`);

describe("[Pseudo PLAIN] Gmail-Outlook normalization", () => {
  // Factory bound to the cases root and the GMAIL vendor.
  const runCasesGroup = createDescribePseudoPlainTestCases(
    testsPath,
    EMAIL_VENDORS.GMAIL
  );

  // Case directories excluded from the "one" group, with the reason for each.
  const ignoredCases = [
    "21forward", // missing file
    "23forward", // missing file
    "24forward", // missing file
  ];

  describe("One", runCasesGroup("one", ignoredCases));
});
import { describe } from "@jest/globals";
import { createDescribePseudoPlainTestCases } from "./utils";
import { EMAIL_VENDORS } from "../src";
const path = require("path");
// Location of the outlook-outlook test cases, resolved relative to this test file.
const TESTS_GLOBAL_PATH = "/files/outlook-outlook";
const testsPath = path.resolve(__dirname, `.${TESTS_GLOBAL_PATH}`);

describe("[Pseudo PLAIN] Outlook-Outlook normalization", () => {
  // Factory bound to the cases root and the OUTLOOK vendor.
  const describeFunction = createDescribePseudoPlainTestCases(
    testsPath,
    EMAIL_VENDORS.OUTLOOK
  );

  // Arguments: (group folder, cases to ignore, cases to check only).
  // The third argument is a filter: pass names of case directories
  // (e.g. ["01"]) to run only those. An empty list applies no filter.
  // The ignore list is passed as [] rather than null so the call matches the
  // declared `Array<string>` parameter type; both mean "ignore nothing".
  describe("Emails Chrome", describeFunction("chrome", [], []));

  describe(
    "Emails Edge",
    describeFunction("edge", [
      "08", // Files are missing for test case
      "10", // Files are missing for test case
    ])
  );

  describe("Emails Safari", describeFunction("safari", ["08"]));

  // Does not work at all
  describe(
    "Emails MacOS",
    describeFunction("macos", [
      "23", // has special character
    ])
  );

  describe(
    "Emails Windows",
    describeFunction("windows", [
      "10", // missing files
    ])
  );
});
...@@ -8,6 +8,7 @@ const RECEIVED_PLAIN_NAME = "r_plainContent.data"; ...@@ -8,6 +8,7 @@ const RECEIVED_PLAIN_NAME = "r_plainContent.data";
import { PlainNormalizer, HTMLNormalizer } from "../src"; import { PlainNormalizer, HTMLNormalizer } from "../src";
import { expect, test } from "@jest/globals"; import { expect, test } from "@jest/globals";
import { DOM } from "@vereign/dom"; import { DOM } from "@vereign/dom";
//import { diffStringsUnified } from "jest-diff";
export const getNormalizedPlain = ( export const getNormalizedPlain = (
testCasePath: string testCasePath: string
...@@ -111,12 +112,21 @@ export const createDescribeHtmlTestCases = ( ...@@ -111,12 +112,21 @@ export const createDescribeHtmlTestCases = (
export const createDescribePlainTestCases = (testsPath: string) => ( export const createDescribePlainTestCases = (testsPath: string) => (
casesName: string, casesName: string,
failingCases: Array<string> = [] failingCases: Array<string> = [],
casesToCheckOnly?: Array<string>
) => (): void => { ) => (): void => {
const testsCasesPath = testsPath + "/" + casesName; const testsCasesPath = testsPath + "/" + casesName;
const testCasesDirs = getTestCasesDirs(testsCasesPath).filter( let testCasesDirs = getTestCasesDirs(testsCasesPath);
(dir) => !failingCases.includes(dir)
); if (casesToCheckOnly && casesToCheckOnly.length) {
testCasesDirs = testCasesDirs.filter((dir) =>
casesToCheckOnly.includes(dir)
);
}
if (failingCases && failingCases.length) {
testCasesDirs = testCasesDirs.filter((dir) => !failingCases.includes(dir));
}
test.each(testCasesDirs)("Case %s", (dirName: string) => { test.each(testCasesDirs)("Case %s", (dirName: string) => {
const testCasePath = testsCasesPath + "/" + dirName; const testCasePath = testsCasesPath + "/" + dirName;
...@@ -127,5 +137,87 @@ export const createDescribePlainTestCases = (testsPath: string) => ( ...@@ -127,5 +137,87 @@ export const createDescribePlainTestCases = (testsPath: string) => (
// expect(sentPlain.length).toBeGreaterThan(0); // expect(sentPlain.length).toBeGreaterThan(0);
// expect(receivedPlain.length).toBeGreaterThan(0); // expect(receivedPlain.length).toBeGreaterThan(0);
expect(receivedPlain).toContain(sentPlain); expect(receivedPlain).toContain(sentPlain);
// const diff = diffStringsUnified(sentPlain, receivedPlain);
// console.log(diff);
}); });
}; };
/**
 * Builds a factory of describe() callbacks that compare the pseudo plain text
 * of the sent and received HTML for every case directory in a group.
 *
 * @param testsPath - root folder that contains the case group folders
 * @param vendor - vendor identifier forwarded to HTMLNormalizer.normalizeVendorHtml
 */
export const createDescribePseudoPlainTestCases = (
  testsPath: string,
  vendor: string
) =>
  /**
   * @param casesGroupName - name of the folder with cases
   * @param failingCases - cases that are known to fail and are skipped until fixed
   * @param casesToCheckOnly - when non-empty, only these cases are run
   */
  (
    casesGroupName: string,
    failingCases?: Array<string>,
    casesToCheckOnly?: Array<string>
  ) => (): void => {
    const groupPath = testsPath + "/" + casesGroupName;

    // A missing or empty "check only" list selects every directory.
    const isSelected = (dir: string): boolean =>
      !casesToCheckOnly ||
      casesToCheckOnly.length === 0 ||
      casesToCheckOnly.includes(dir);

    // A missing or empty "failing" list ignores nothing.
    const isNotIgnored = (dir: string): boolean =>
      !failingCases || !failingCases.includes(dir);

    const casesToRun = getTestCasesDirs(groupPath)
      .filter(isSelected)
      .filter(isNotIgnored);

    test.each(casesToRun)("Case %s", (dirName: string) => {
      const documents = getDOMDocuments(groupPath + "/" + dirName);
      const received = documents.receivedHtmlDocument;
      const sent = documents.sentHtmlDocument;

      // Vendor normalization is applied to both documents before extraction.
      HTMLNormalizer.normalizeVendorHtml(received, vendor);
      HTMLNormalizer.normalizeVendorHtml(sent, vendor);

      const receivedText = HTMLNormalizer.extractPseudoPlainPart(received);
      const sentText = HTMLNormalizer.extractPseudoPlainPart(sent);

      expect(receivedText).toEqual(sentText);

      // Debug aid, disabled:
      // const diff = diffStringsUnified(receivedText, sentText);
      // console.log(diff);
    });
  };
/**
 * Reads the sent and received HTML files of a test case and parses each one
 * into a document.
 *
 * @param testCasePath - directory holding the SENT_HTML_NAME and RECEIVED_HTML_NAME files
 * @returns the parsed sent and received documents
 */
export const getDOMDocuments = (
  testCasePath: string
): {
  sentHtmlDocument: HTMLDocument;
  receivedHtmlDocument: HTMLDocument;
} => {
  const readHtml = (fileName: string): string =>
    fs.readFileSync(`${testCasePath}/${fileName}`).toString();

  const sentMarkup = readHtml(SENT_HTML_NAME);
  const receivedMarkup = readHtml(RECEIVED_HTML_NAME);

  // NOTE(review): the sent HTML is parsed with DOM while the received HTML is
  // parsed with JSDOM; confirm this asymmetry is intentional.
  const sentParsed = new DOM(sentMarkup);
  const receivedParsed = new JSDOM(receivedMarkup);

  const { document: sentHtmlDocument } = sentParsed.window;
  const { document: receivedHtmlDocument } = receivedParsed.window;
  return { sentHtmlDocument, receivedHtmlDocument };
};
export declare const normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string; export declare const normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string;
export declare const extractPseudoPlainPart: (document: HTMLDocument) => string;
export declare const printHtmlChildren: (node: Node, printFunction: (node: Node) => string, depth: number) => string; export declare const printHtmlChildren: (node: Node, printFunction: (node: Node) => string, depth: number) => string;
export declare const printHtmlNode: (node: Node, printFunction: (node: Node) => string, depth: number) => string; export declare const printHtmlNode: (node: Node, printFunction: (node: Node) => string, depth: number) => string;
export declare const cleanupHtmlNodeAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void; export declare const cleanupHtmlNodeAttributes: (node: Node, cleanupElementAttributes: (element: HTMLElement) => void) => void;
......
"use strict"; "use strict";
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.escapeHtmlString = exports.pruneHtmlNode = exports.cleanupHtmlNodeAttributes = exports.printHtmlNode = exports.printHtmlChildren = exports.normalizeVendorHtml = void 0; exports.escapeHtmlString = exports.pruneHtmlNode = exports.cleanupHtmlNodeAttributes = exports.printHtmlNode = exports.printHtmlChildren = exports.extractPseudoPlainPart = exports.normalizeVendorHtml = void 0;
const constants_1 = require("../constants"); const constants_1 = require("../constants");
const outlook_1 = require("./strategies/outlook"); const outlook_1 = require("./strategies/outlook");
const constants_2 = require("../constants"); const constants_2 = require("../constants");
const utils_1 = require("../utils"); const utils_1 = require("../utils");
const gmail_1 = require("./strategies/gmail"); const gmail_1 = require("./strategies/gmail");
const index_1 = require("../index");
const nodesAmendingFunctions = { const nodesAmendingFunctions = {
[constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.amendGmailNodes, [constants_2.EMAIL_VENDORS.GMAIL]: gmail_1.amendGmailNodes,
[constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.amendOutlookNodes, [constants_2.EMAIL_VENDORS.OUTLOOK]: outlook_1.amendOutlookNodes,
...@@ -49,6 +50,27 @@ const normalizeVendorHtml = (document, vendor) => { ...@@ -49,6 +50,27 @@ const normalizeVendorHtml = (document, vendor) => {
return exports.printHtmlChildren(mimeBody, vendorPrintFunction, 0); return exports.printHtmlChildren(mimeBody, vendorPrintFunction, 0);
}; };
exports.normalizeVendorHtml = normalizeVendorHtml; exports.normalizeVendorHtml = normalizeVendorHtml;
/**
 * Returns the "pseudo plain" part of an HTML document: the text content of
 * its body passed through PlainNormalizer.normalizePlain.
 *
 * A draft that also gathered anchor href and image src/alt values was left
 * disabled in the original; only the text content contributes to the result.
 *
 * @param document - document whose body text is extracted
 */
const extractPseudoPlainPart = (document
/*vendor: string*/
) => {
    const bodyText = document.body.textContent;
    return index_1.PlainNormalizer.normalizePlain(bodyText);
};
exports.extractPseudoPlainPart = extractPseudoPlainPart;
const printHtmlChildren = (node, printFunction, depth) => { const printHtmlChildren = (node, printFunction, depth) => {
let child = node.firstChild; let child = node.firstChild;
if (!child) { if (!child) {
......
declare const _default: { declare const _default: {
normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string; normalizeVendorHtml: (document: HTMLDocument, vendor: string) => string;
extractPseudoPlainPart: (document: HTMLDocument) => string;
}; };
export default _default; export default _default;
...@@ -3,4 +3,5 @@ Object.defineProperty(exports, "__esModule", { value: true }); ...@@ -3,4 +3,5 @@ Object.defineProperty(exports, "__esModule", { value: true });
const HTMLNormalizer_1 = require("./HTMLNormalizer"); const HTMLNormalizer_1 = require("./HTMLNormalizer");
exports.default = { exports.default = {
normalizeVendorHtml: HTMLNormalizer_1.normalizeVendorHtml, normalizeVendorHtml: HTMLNormalizer_1.normalizeVendorHtml,
extractPseudoPlainPart: HTMLNormalizer_1.extractPseudoPlainPart,
}; };
...@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true }); ...@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizePlainPart = void 0; exports.normalizePlainPart = void 0;
const utils_1 = require("../utils"); const utils_1 = require("../utils");
const normalizePlainPart = (text) => { const normalizePlainPart = (text) => {
text = removeListBullets(text);
text = utils_1.removeSpacesAndLinebreaks(text); text = utils_1.removeSpacesAndLinebreaks(text);
return removeQRCodes(text); return removeQRCodes(text);
}; };
...@@ -12,3 +13,6 @@ const removeQRCodes = (s) => { ...@@ -12,3 +13,6 @@ const removeQRCodes = (s) => {
.replace(/\[qrcode.png]\s*<https:\/\/[\w./?=\-&]+>/g, "") .replace(/\[qrcode.png]\s*<https:\/\/[\w./?=\-&]+>/g, "")
.replace(/<https:\/\/[\w./?=\-&]+>\s*\[qrcode.png]/g, ""); .replace(/<https:\/\/[\w./?=\-&]+>\s*\[qrcode.png]/g, "");
}; };
/**
 * Removes list bullet markers from plain text: a line break, a single "o" or
 * "§" character, and the line break(s) that follow it.
 *
 * The pattern has to be a RegExp literal. Given as a string, replace() looks
 * for that exact text (including the "/g") and never matches.
 *
 * @param s - plain text to clean
 * @returns the text with bullet markers removed
 */
const removeListBullets = (s) => {
    return s.replace(/\n[o§]\n+/g, "");
};
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.removeSpacesAndLinebreaks = void 0; exports.removeSpacesAndLinebreaks = void 0;
const removeSpacesAndLinebreaks = (s) => { const removeSpacesAndLinebreaks = (s) => {
const regexNewlines = new RegExp(/[\r\n\v]+/g); const removeSymbols = new RegExp(/[\r\n\v\s\u200B]+/g);
const regexSpaces = new RegExp(/\s+|\u200B/g); return s.replace(removeSymbols, "").trim();
return s.replace(regexNewlines, "").replace(regexSpaces, "");
}; };
exports.removeSpacesAndLinebreaks = removeSpacesAndLinebreaks; exports.removeSpacesAndLinebreaks = removeSpacesAndLinebreaks;
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
"eslint": "^7.7.0", "eslint": "^7.7.0",
"husky": "^4.2.5", "husky": "^4.2.5",
"jest": "^26.4.2", "jest": "^26.4.2",
"jest-diff": "^26.6.2",
"lint-staged": "^10.2.13", "lint-staged": "^10.2.13",
"prettier": "^2.1.1", "prettier": "^2.1.1",
"typescript": "^4.0.2" "typescript": "^4.0.2"
......
...@@ -18,6 +18,7 @@ import { ...@@ -18,6 +18,7 @@ import {
cleanupGMailElementAttributes, cleanupGMailElementAttributes,
pruneGmailElement, pruneGmailElement,
} from "./strategies/gmail"; } from "./strategies/gmail";
import { PlainNormalizer } from "../index";
const nodesAmendingFunctions = { const nodesAmendingFunctions = {
[EMAIL_VENDORS.GMAIL]: amendGmailNodes, [EMAIL_VENDORS.GMAIL]: amendGmailNodes,
...@@ -79,6 +80,29 @@ export const normalizeVendorHtml = ( ...@@ -79,6 +80,29 @@ export const normalizeVendorHtml = (
return printHtmlChildren(mimeBody, vendorPrintFunction, 0); return printHtmlChildren(mimeBody, vendorPrintFunction, 0);
}; };
/**
 * Returns the "pseudo plain" part of an HTML document: the text content of
 * its body passed through PlainNormalizer.normalizePlain.
 *
 * A draft that also gathered anchor href and image src/alt values was left
 * disabled in the original; only the text content contributes to the result.
 *
 * @param document - document whose body text is extracted
 * @returns the normalized text content of the document body
 */
export const extractPseudoPlainPart = (
  document: HTMLDocument
  /*vendor: string*/
): string => {
  const bodyText = document.body.textContent;
  return PlainNormalizer.normalizePlain(bodyText);
};
export const printHtmlChildren = ( export const printHtmlChildren = (
node: Node, node: Node,
printFunction: (node: Node) => string, printFunction: (node: Node) => string,
......
import { normalizeVendorHtml } from "./HTMLNormalizer"; import { normalizeVendorHtml, extractPseudoPlainPart } from "./HTMLNormalizer";
export default { export default {
normalizeVendorHtml, normalizeVendorHtml,
extractPseudoPlainPart,
}; };
import { removeSpacesAndLinebreaks } from "../utils"; import { removeSpacesAndLinebreaks } from "../utils";
export const normalizePlainPart = (text: string): string => { export const normalizePlainPart = (text: string): string => {
text = removeListBullets(text);
text = removeSpacesAndLinebreaks(text); text = removeSpacesAndLinebreaks(text);
return removeQRCodes(text); return removeQRCodes(text);
}; };
...@@ -10,3 +11,7 @@ const removeQRCodes = (s: string): string => { ...@@ -10,3 +11,7 @@ const removeQRCodes = (s: string): string => {
.replace(/\[qrcode.png]\s*<https:\/\/[\w./?=\-&]+>/g, "") .replace(/\[qrcode.png]\s*<https:\/\/[\w./?=\-&]+>/g, "")
.replace(/<https:\/\/[\w./?=\-&]+>\s*\[qrcode.png]/g, ""); .replace(/<https:\/\/[\w./?=\-&]+>\s*\[qrcode.png]/g, "");
}; };
/**
 * Removes list bullet markers from plain text: a line break, a single "o" or
 * "§" character, and the line break(s) that follow it.
 *
 * The pattern has to be a RegExp literal. Given as a string, replace() looks
 * for that exact text (including the "/g") and never matches.
 *
 * @param s - plain text to clean
 * @returns the text with bullet markers removed
 */
const removeListBullets = (s: string): string => {
  return s.replace(/\n[o§]\n+/g, "");
};
export const removeSpacesAndLinebreaks = (s: string): string => { export const removeSpacesAndLinebreaks = (s: string): string => {
const regexNewlines = new RegExp(/[\r\n\v]+/g); const removeSymbols = new RegExp(/[\r\n\v\s\u200B]+/g);
const regexSpaces = new RegExp(/\s+|\u200B/g);
return s.replace(regexNewlines, "").replace(regexSpaces, ""); return s.replace(removeSymbols, "").trim();
}; };
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment