From 959661f9e5f58be882bb5dd84798b116b6ead3c8 Mon Sep 17 00:00:00 2001
From: Jared Hancock <jared@osticket.com>
Date: Thu, 30 Jan 2014 09:42:37 -0600
Subject: [PATCH] htmLawed: Fix corruption to UTF8 encoded text

On some combinations of operating system, PHP, and libpcre versions, `\s`
also matches the ISO-8859-x non-breaking space, 0xa0. The `\s+` replacement
in hl_tidy() then munges the UTF-8 encoded non-breaking space, 0xc2a0, into
0xc220, which is not a valid UTF-8 sequence.

When such text is inserted into a UTF-8 field in MySQL, it is truncated at
the first invalid character; everything from that point on is lost.
---
 include/htmLawed.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/htmLawed.php b/include/htmLawed.php
index 6d25f1f98..9d0cc9e95 100644
--- a/include/htmLawed.php
+++ b/include/htmLawed.php
@@ -644,7 +644,7 @@ return '';
 function hl_tidy($t, $w, $p){
 // Tidy/compact HTM
 if(strpos(' pre,script,textarea', "$p,")){return $t;}
-$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
+$t = preg_replace('`[ \t\r\n\f]+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
 if(($w = strtolower($w)) == -1){
  return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
 }
-- 
GitLab