From 959661f9e5f58be882bb5dd84798b116b6ead3c8 Mon Sep 17 00:00:00 2001 From: Jared Hancock <jared@osticket.com> Date: Thu, 30 Jan 2014 09:42:37 -0600 Subject: [PATCH] htmLawed: Fix corruption to UTF8 encoded text On some combinations of operating systems, PHP and libpcre versions, `\s` will match the ISO-8859-x non-breaking space, 0xa0. The regular expression will then munge the UTF-8 encoded version, 0xc2a0, into 0xc220, which is not a valid UTF-8 byte sequence. When inserted into a UTF-8 field in MySQL, the text will be truncated from the first invalid character onward. --- include/htmLawed.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/htmLawed.php b/include/htmLawed.php index 6d25f1f98..9d0cc9e95 100644 --- a/include/htmLawed.php +++ b/include/htmLawed.php @@ -644,7 +644,7 @@ return ''; function hl_tidy($t, $w, $p){ // Tidy/compact HTM if(strpos(' pre,script,textarea', "$p,")){return $t;} -$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)); +$t = preg_replace('`[ \t\r\n\f]+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)); if(($w = strtolower($w)) == -1){ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); } -- GitLab