/**
 * Charset utility class.
 *
 * Cleans up the bogus, mis-spelled or ambiguous charset labels commonly
 * found in email headers before they are handed to a converter such as
 * iconv or mbstring.
 */
class Charset {

    /**
     * Normalize a charset label.
     *
     * Charset labels are case-insensitive, so every comparison below
     * ignores case. Labels which are not recognized are handed back
     * (trimmed) so the caller can still attempt the conversion.
     *
     * Thanks in part to https://github.com/mikel/mail/commit/88457e
     *
     * @param string|null $charset Charset label as received; may be empty
     * @return string 'Windows-NNNN' for Windows code pages, 'cp949' for
     *      ks_c_5601-1987, 'ISO-8859-1' for empty or bogus labels,
     *      otherwise the trimmed label itself
     */
    public static function normalize($charset) {

        // Tolerate NULL and surrounding whitespace so that callers are
        // not required to sanitize the label beforehand
        $charset = trim((string) $charset);

        // Empty charset - ISO-8859-1 is assumed
        if ($charset === '')
            return 'ISO-8859-1';

        $match = array();

        // Windows charsets - force correct format (Windows-NNNN)
        if (preg_match('`^Windows-?(\d+)$`i', $charset, $match))
            return 'Windows-'.$match[1];

        // ks_c_5601-1987: Korean alias for cp949
        if (preg_match('`^ks_c_5601-1987`i', $charset))
            return 'cp949';

        // Incorrect, bogus or ambiguous charsets - ISO-8859-1 is assumed.
        // The pattern is anchored at both ends on purpose: a bare "iso"
        // is bogus, whereas "iso-8859-2" or "iso-2022-jp" are perfectly
        // valid charsets and must be left alone.
        if (preg_match('`^(default|x-user-defined|iso|us-ascii)$`i', $charset))
            return 'ISO-8859-1';

        // Unknown to us - leave it for the converter to deal with
        return $charset;
    }
}
sts=4: **********************************************************************/ +include_once INCLUDE_DIR.'class.charset.php'; class Format { @@ -47,12 +48,8 @@ class Format { if (!$charset && function_exists('mb_detect_encoding')) $charset = mb_detect_encoding($text); - // Cleanup - incorrect, bogus, or ambiguous charsets - // ISO-8859-1 is assumed for empty charset. - if (!$charset || in_array(strtolower(trim($charset)), - array('default','x-user-defined','iso','us-ascii'))) - $charset = 'ISO-8859-1'; - + // Normalize bogus or ambiguous charsets + $charset = Charset::normalize(trim($charset)); $original = $text; if (function_exists('iconv')) $text = iconv($charset, $encoding.'//IGNORE', $text); -- GitLab