diff --git a/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml index 68c2e4f69347c3b94951935559cf66d3e5feb4b0..5b3539ab00e99abb9a73656068f5727baa9ab5ab 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/07/received.eml @@ -205,7 +205,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> [https://r1.res.office365.com/owa/prem/images/dc-generic_20.png]<https://1d= rv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> diff --git a/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml index 2dbf2452dd5f68e81c0b0e3c433b865f29409d5a..94f09f94eedab0d0c801a99aafde8c0927611d71 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/08/received.eml @@ -206,7 +206,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> [https://r1.res.office365.com/owa/prem/images/dc-generic_20.png]<https://1d= rv.ms/u/s!Au0bzLX4nSlWiDaBztIacMtR0TFd> diff --git a/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml b/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml index d788cca066658bef6e918835a85def8be83538f7..8f7b8523071a6c108096b6d3d59aeafed6d4b3aa 100644 --- a/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml +++ b/__tests__/files/outlook-outlook/chrome-chrome/15/received.eml @@ -210,7 +210,7 @@ Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable Stephan Morphis has shared OneDrive files with you. To view them, click the= -links below. + links below. <https://1drv.ms/u/s!Au0bzLX4nSlWlj-_pxZ0QmCUigL4> [https://r1.res.office365.com/owa/prem/images/dc-png_20.png]<https://1drv.m= s/u/s!Au0bzLX4nSlWlj-_pxZ0QmCUigL4> diff --git a/src/PlainNormalizer/PlainNormalizer.ts b/src/PlainNormalizer/PlainNormalizer.ts index ba473539832956f985b464b257b7d111e09010ce..f230a11e347ba5f819048fc49e375200445ca671 100644 --- a/src/PlainNormalizer/PlainNormalizer.ts +++ b/src/PlainNormalizer/PlainNormalizer.ts @@ -1,14 +1,16 @@ // this is a Node module. require is a must to work across different envs const URL = require("url-parse"); -import { removeSpacesAndLinebreaks } from "../utils"; +import { normalizeTextSpacings } from "../utils"; export const normalizePlainPart = (text: string): string => { + text = cleanupHiddenCharacters(text); text = removeListBullets(text); - text = removeSpacesAndLinebreaks(text); text = removeQRCodes(text); + text = normalizeTextSpacings(text); text = patchOutlookSafelinksWrappers(text); - return text; + + return text.trim(); }; const patchOutlookSafelinksWrappers = (text: string) => { @@ -29,10 +31,15 @@ const patchOutlookSafelinksWrappers = (text: string) => { const removeQRCodes = (s: string): string => { return s - .replace(/\[(image:)*qrcode.png]\s*<https:\/\/.+?>/g, "") + .replace(/\[(image:\s)*qrcode.png]\s*<https:\/\/.+?>/g, "") .replace(/<https:\/\/.+?>\s*\[(image: )*qrcode.png]/g, ""); }; const removeListBullets = (s: string): string => { return s.replace("\n[o§]\n+/g", ""); }; + +export const cleanupHiddenCharacters = (s: string): string => { + const removeSymbols = new RegExp(/[\u200B]+/g); + return s.replace(removeSymbols, ""); +}; diff --git a/src/utils.ts b/src/utils.ts index c8bb6bb80bbf0e10a5b9c8bbbe95b69b179bdfcf..51805e0b55168c2ecf577d0d154f94b940ebb7a3 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -3,3 +3,7 @@ export const removeSpacesAndLinebreaks = (s: string): string => { return s.replace(removeSymbols, "").trim(); }; + +export const normalizeTextSpacings = (s: string): string => { + return s.replace(/[\r\n\v\s]+/g, " "); +};