From 72f85d315d24589a939691c0651bec904eddc1fc Mon Sep 17 00:00:00 2001 From: igor <igor.markin@vereign.com> Date: Thu, 24 Dec 2020 10:57:17 +0300 Subject: [PATCH] Properly handle spaces in plain part --- .../outlook-outlook/chrome-chrome/07/received.eml | 2 +- .../outlook-outlook/chrome-chrome/08/received.eml | 2 +- .../outlook-outlook/chrome-chrome/15/received.eml | 2 +- src/PlainNormalizer/PlainNormalizer.ts | 15 +++++++++++---- src/utils.ts | 4 ++++ 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml index 68c2e4f..5b3539a 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml @@ -205,7 +205,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> [https://r1.res.office365.com/owa/prem/images/dc-generic_20.png]<https://1d= rv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> diff --git a/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml index 2dbf245..94f09f9 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml @@ -206,7 +206,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> [https://r1.res.office365.com/owa/prem/images/dc-generic_20.png]<https://1d= rv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> diff --git a/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml index d788cca..8f7b852 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml @@ -210,7 +210,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWlj-_pxZ0QmCUigL4> [https://r1.res.office365.com/owa/prem/images/dc-png_20.png]<https://1drv.m= s/u/s!Au0bzLX4nSlWlj-_pxZ0QmCUigL4> diff --git a/src/PlainNormalizer/PlainNormalizer.ts b/src/PlainNormalizer/PlainNormalizer.ts index ba47353..f230a11 100644 --- a/src/PlainNormalizer/PlainNormalizer.ts +++ b/src/PlainNormalizer/PlainNormalizer.ts @@ -1,14 +1,16 @@ // this is a Node module. require is a must to work across different envs const URL = require("url-parse"); -import { removeSpacesAndLinebreaks } from "../utils"; +import { normalizeTextSpacings } from "../utils"; export const normalizePlainPart = (text: string): string => { + text = cleanupHiddenCharacters(text); text = removeListBullets(text); - text = removeSpacesAndLinebreaks(text); text = removeQRCodes(text); + text = normalizeTextSpacings(text); text = patchOutlookSafelinksWrappers(text); - return text; + + return text.trim(); }; const patchOutlookSafelinksWrappers = (text: string) => { @@ -29,10 +31,15 @@ const patchOutlookSafelinksWrappers = (text: string) => { const removeQRCodes = (s: string): string => { return s - .replace(/\[(image:)*qrcode.png]\s*<https:\/\/.+?>/g, "") + .replace(/\[(image:\s)*qrcode.png]\s*<https:\/\/.+?>/g, "") .replace(/<https:\/\/.+?>\s*\[(image: )*qrcode.png]/g, ""); }; const removeListBullets = (s: string): string => { return s.replace("\n[o§]\n+/g", ""); }; + +export const cleanupHiddenCharacters = (s: string): string => { + const removeSymbols = new RegExp(/[\u200B]+/g); + return s.replace(removeSymbols, ""); +}; diff --git a/src/utils.ts b/src/utils.ts index c8bb6bb..51805e0 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -3,3 +3,7 @@ export const removeSpacesAndLinebreaks = (s: string): string => { return s.replace(removeSymbols, "").trim(); }; + +export const normalizeTextSpacings = (s: string): string => { + return s.replace(/[\r\n\v\s]+/g, " "); +}; -- GitLab