diff --git a/include/class.api.php b/include/class.api.php
index 2382a70e406d2b3f114fc4f1fd280b5ba67aed91..082cc178d5704598742df3b7dfda646261e71967 100644
--- a/include/class.api.php
+++ b/include/class.api.php
@@ -345,7 +345,7 @@ class ApiXmlDataParser extends XmlDataParser {
                     unset($value[":text"]);
                 }
                 if (isset($value['encoding']))
-                    $value['body'] = Format::utf8encode($value['body'], $value['encoding']);
+                    $value['body'] = Charset::utf8($value['body'], $value['encoding']);
 
                 if (!strcasecmp($value['type'], 'text/html'))
                     $value = new HtmlThreadBody($value['body']);
diff --git a/include/class.charset.php b/include/class.charset.php
new file mode 100644
index 0000000000000000000000000000000000000000..5411e6b4abcb1347a0e22998678b13d57fe119d3
--- /dev/null
+++ b/include/class.charset.php
@@ -0,0 +1,103 @@
+<?php
+/*********************************************************************
+    class.charset.php
+
+    Charset util class
+
+    Copyright (c) 2015 osTicket
+    http://www.osticket.com
+
+    Released under the GNU General Public License WITHOUT ANY WARRANTY.
+    See LICENSE.TXT for details.
+
+    vim: expandtab sw=4 ts=4 sts=4:
+**********************************************************************/
+
+class Charset {
+
+    const UTF8 = 'utf-8';
+
+    /**
+     * Clean up invalid, bogus or ambiguous charset names.
+     * Thanks in part to https://github.com/mikel/mail/commit/88457e
+     *
+     * Matching is case-insensitive, as charset names are not
+     * case-sensitive ("US-ASCII" and "us-ascii" name the same charset).
+     *
+     * @param string $charset Charset name to clean up
+     * @return string Normalized name, or $charset if no rule applies
+     */
+    static function normalize($charset) {
+
+        $match = array();
+        switch (true) {
+        // Windows charsets - force correct format
+        case preg_match('`^Windows-?(\d+)$`i', $charset, $match):
+            return 'Windows-'.$match[1];
+        // ks_c_5601-1987: Korean alias for cp949
+        case preg_match('`^ks_c_5601-1987`i', $charset):
+            return 'cp949';
+        // Incorrect, bogus, ambiguous or empty charsets
+        // ISO-8859-1 is assumed. The pattern is anchored at both ends so
+        // valid charsets that merely start with one of these names
+        // (iso-8859-2, iso-2022-jp, ...) are left untouched.
+        case preg_match('`^(default|x-user-defined|iso|us-ascii)$`i', $charset):
+        case preg_match('`^\s*$`', $charset):
+            return 'ISO-8859-1';
+        }
+
+        // Hmmmm
+        return $charset;
+    }
+
+    /**
+     * Translate characters ($text) from one encoding ($from) to another
+     * ($to).
+     *
+     * @param string $text Text to convert
+     * @param string $from Source charset; when empty, it is detected with
+     *      mb_detect_encoding() if that function is available
+     * @param string $to Target charset
+     * @return string Converted text, or the original text if the
+     *      conversion produced an empty or false result
+     */
+    static function transcode($text, $from, $to) {
+
+        //Try auto-detecting charset/encoding
+        if (!$from && function_exists('mb_detect_encoding'))
+            $from = mb_detect_encoding($text);
+
+        // Normalize bogus or ambiguous charsets
+        $from = self::normalize(trim($from));
+        $to = self::normalize(trim($to));
+
+        $original = $text;
+        if (function_exists('iconv'))
+            $text = iconv($from, $to.'//IGNORE', $text);
+        elseif (function_exists('mb_convert_encoding'))
+            $text = mb_convert_encoding($text, $to, $from);
+        elseif (!strcasecmp($to, 'utf-8')
+                && function_exists('utf8_encode')
+                && !strcasecmp($from, 'ISO-8859-1'))
+            $text = utf8_encode($text);
+
+        // If $text is false, then we have a (likely) invalid charset, use
+        // the original text and assume 8-bit (latin-1 / iso-8859-1)
+        // encoding
+        return (!$text && $original) ? $original : $text;
+    }
+
+    /**
+     * Wrapper for utf-8 transcoding.
+     *
+     * Static, because callers invoke it as Charset::utf8().
+     *
+     * @param string $text Text to convert
+     * @param string $charset Source charset; auto-detected when empty
+     * @return string Text transcoded to utf-8
+     */
+    static function utf8($text, $charset=null) {
+        return self::transcode($text, $charset, self::UTF8);
+    }
+}
+?>
diff --git a/include/class.format.php b/include/class.format.php
index a124a577f60a0cc88beff06dce0c3953fa8d96d6..e597de6185788bab00167f6e866623388f6ffa6b 100644
--- a/include/class.format.php
+++ b/include/class.format.php
@@ -14,6 +14,7 @@
     vim: expandtab sw=4 ts=4 sts=4:
 **********************************************************************/
 
+include_once INCLUDE_DIR.'class.charset.php';
 class Format {
 
 
@@ -40,47 +41,13 @@ class Format {
         return $size;
     }
 
-    /* encode text into desired encoding - taking into accout charset when available. */
-    function encode($text, $charset=null, $encoding='utf-8') {
-
-        //Try auto-detecting charset/encoding
-        if (!$charset && function_exists('mb_detect_encoding'))
-            $charset = mb_detect_encoding($text);
-
-        // Cleanup - incorrect, bogus, or ambiguous charsets
-        // ISO-8859-1 is assumed for empty charset.
-        if (!$charset || in_array(strtolower(trim($charset)),
-                array('default','x-user-defined','iso','us-ascii')))
-            $charset = 'ISO-8859-1';
-
-        $original = $text;
-        if (function_exists('iconv'))
-            $text = iconv($charset, $encoding.'//IGNORE', $text);
-        elseif (function_exists('mb_convert_encoding'))
-            $text = mb_convert_encoding($text, $encoding, $charset);
-        elseif (!strcasecmp($encoding, 'utf-8')
-                && function_exists('utf8_encode')
-                && !strcasecmp($charset, 'ISO-8859-1'))
-            $text = utf8_encode($text);
-
-        // If $text is false, then we have a (likely) invalid charset, use
-        // the original text and assume 8-bit (latin-1 / iso-8859-1)
-        // encoding
-        return (!$text && $original) ? $original : $text;
-    }
-
-    //Wrapper for utf-8 encoding.
-    function utf8encode($text, $charset=null) {
-        return Format::encode($text, $charset, 'utf-8');
-    }
-
     function mimedecode($text, $encoding='UTF-8') {
 
         if(function_exists('imap_mime_header_decode')
                 && ($parts = imap_mime_header_decode($text))) {
             $str ='';
             foreach ($parts as $part)
-                $str.= Format::encode($part->text, $part->charset, $encoding);
+                $str.= Charset::transcode($part->text, $part->charset, $encoding);
 
             $text = $str;
         } elseif($text[0] == '=' && function_exists('iconv_mime_decode')) {
@@ -105,7 +72,7 @@ class Format {
                 $filename, $match))
             // XXX: Currently we don't care about the language component.
             // The encoding hint is sufficient.
-            return self::utf8encode(urldecode($match[3]), $match[1]);
+            return Charset::utf8(urldecode($match[3]), $match[1]);
         else
             return $filename;
     }
@@ -555,7 +522,7 @@ class Format {
             $contents = base64_decode($contents);
         }
         if ($output_encoding && $charset)
-            $contents = Format::encode($contents, $charset, $output_encoding);
+            $contents = Charset::transcode($contents, $charset, $output_encoding);
 
         return array(
             'data' => $contents,
diff --git a/include/class.mailfetch.php b/include/class.mailfetch.php
index 1210bffa3f3bfd924169e87939d369c229a4a9ce..363589b46ab2f9720960ac4a8084e80793d9f67c 100644
--- a/include/class.mailfetch.php
+++ b/include/class.mailfetch.php
@@ -206,7 +206,7 @@ class MailFetcher {
 
     //Convert text to desired encoding..defaults to utf8
     function mime_encode($text, $charset=null, $encoding='utf-8') { //Thank in part to afterburner
-        return Format::encode($text, $charset, $encoding);
+        return Charset::transcode($text, $charset, $encoding);
     }
 
     function mailbox_encode($mailbox) {
@@ -240,7 +240,7 @@ class MailFetcher {
         if (function_exists('mb_detect_encoding'))
             if (($src_enc = mb_detect_encoding($text))
                     && (strcasecmp($src_enc, 'ASCII') !== 0))
-                return Format::encode($text, $src_enc, $encoding);
+                return Charset::transcode($text, $src_enc, $encoding);
 
         // Handle ASCII text and RFC-2047 encoding
         $str = '';
diff --git a/include/class.mailparse.php b/include/class.mailparse.php
index 4038f1226fa53b3a038bbea912ada03e3ceaf108..2a91faf7f451b789a107e4fbe691b4ff50ac0bb5 100644
--- a/include/class.mailparse.php
+++ b/include/class.mailparse.php
@@ -335,7 +335,7 @@ class Mail_Parse {
                 $content = $struct->body;
                 //Encode to desired encoding - ONLY if charset is known??
                 if (isset($struct->ctype_parameters['charset']))
-                    $content = Format::encode($content,
+                    $content = Charset::transcode($content,
                         $struct->ctype_parameters['charset'], $this->charset);
 
                 return $content;
@@ -358,7 +358,7 @@ class Mail_Parse {
 
 
     function mime_encode($text, $charset=null, $encoding='utf-8') {
-        return Format::encode($text, $charset, $encoding);
+        return Charset::transcode($text, $charset, $encoding);
     }
 
     function getAttachments($part=null){
diff --git a/include/class.translation.php b/include/class.translation.php
index cc21848a31e9153f8940c939a48d624bed36d461..0f0924ed299a2adabae8e56156085d6b0e7781a1 100644
--- a/include/class.translation.php
+++ b/include/class.translation.php
@@ -576,7 +576,7 @@ class Translation extends gettext_reader implements Serializable {
         if (!$this->encode)
             return $string;
 
-        return Format::encode($string, 'utf-8', $this->charset);
+        return Charset::transcode($string, 'utf-8', $this->charset);
     }
 
     static function buildHashFile($mofile, $outfile=false, $return=false) {
@@ -626,9 +626,8 @@ class Translation extends gettext_reader implements Serializable {
         }
         if ($charset && strcasecmp($charset, 'utf-8') !== 0) {
             foreach ($table as $orig=>$trans) {
-                // Format::encode defaults to UTF-8 output
-                $table[Format::encode($orig, $charset)] =
-                    Format::encode($trans, $charset);
+                $table[Charset::utf8($orig, $charset)] =
+                    Charset::utf8($trans, $charset);
                 unset($table[$orig]);
             }
         }
diff --git a/include/tnef_decoder.php b/include/tnef_decoder.php
index e0f7869ba82b4ce554ada486efbfb1ae9cc4bb67..70e455851fc69d236f822ed033fdb7e731219f57 100644
--- a/include/tnef_decoder.php
+++ b/include/tnef_decoder.php
@@ -341,7 +341,7 @@ class TnefAttributeStreamReader extends TnefStreamReader {
         /* Read and truncate to length. */
         $text = substr($this->_getx($datalen), 0, $length);
         if ($type == self::TypeUnicode) {
-            $text = Format::encode($text, 'ucs2');
+            $text = Charset::utf8($text, 'ucs2');
         }
 
         return $text;
@@ -543,7 +543,7 @@ class TnefMessage extends AbstractTnefObject {
 
         // Transcode it
         if ($encoding && $charset)
-            $body = Format::encode($body, $charset, $encoding);
+            $body = Charset::transcode($body, $charset, $encoding);
 
         return $body;
     }