From 0495d5cd4fa72c705bb4560b831029f32a2f74fc Mon Sep 17 00:00:00 2001 From: Jared Hancock <jared@osticket.com> Date: Fri, 7 Mar 2014 23:51:04 -0600 Subject: [PATCH] html2text: Properly wrap and pad text with diacritics --- include/html2text.php | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/html2text.php b/include/html2text.php index 705da931f..fac166b4d 100644 --- a/include/html2text.php +++ b/include/html2text.php @@ -55,6 +55,10 @@ function convert_html_to_text($html, $width=74) { 'b' => array('text-transform' => 'uppercase'), 'strong' => array('text-transform' => 'uppercase'), 'h4' => array('text-transform' => 'uppercase'), + + // Crazy M$ styles + '.MsoNormal' => array('margin' => 0, 'margin-bottom' => 0.0001), + '.MsoPlainText' => array('margin' => 0, 'margin-bottom' => 0.0001), )) ); $options = array(); @@ -188,9 +192,9 @@ class HtmlInlineElement { $this->ws = $this->getStyle('white-space', 'normal'); // Direction $dir = $this->node->getAttribute('dir'); - // Ensure we have a value, but don't a control char unless direction - // is declared - $this->dir = $dir ?: 'left'; + // Ensure we have a value, but don't emit a control char unless + // direction is declared + $this->dir = $dir ?: 'ltr'; switch (strtolower($dir)) { case 'ltr': $output .= "\xE2\x80\x8E"; # LEFT-TO-RIGHT MARK @@ -234,7 +238,7 @@ class HtmlInlineElement { } switch ($this->getStyle('text-decoration', 'none')) { case 'underline': - // Remove diacritics and underline chars which do not go below + // Split diacritics and underline chars which do not go below // the baseline if (class_exists('Normalizer')) $output = Normalizer::normalize($output, Normalizer::FORM_D); @@ -903,7 +907,8 @@ function mb_wordwrap($string, $width=75, $break="\n", $cut=false) { if ($cut) { // Match anything 1 to $width chars long followed by whitespace or EOS, // otherwise match anything $width chars long - $search = '/(.{1,'.$width.'})(?:\s|$|(\p{Ps}))|(.{'.$width.'})/uS'; + $search = '/((?>[^\n\p{M}]\p{M}*){1,'.$width.'})(?:[ \n]|$|(\p{Ps}))|((?>[^\n\p{M}]\p{M}*){' + .$width.'})/uS'; # <?php $replace = '$1$3'.$break.'$2'; } else { // Anchor the beginning of the pattern with a lookahead @@ -917,8 +922,9 @@ function mb_wordwrap($string, $width=75, $break="\n", $cut=false) { // Thanks http://www.php.net/manual/en/ref.mbstring.php#90611 function mb_str_pad($input, $pad_length, $pad_string=" ", $pad_style=STR_PAD_RIGHT) { + $marks = preg_match_all('/\p{M}/u', $input, $match); return str_pad($input, - strlen($input)-mb_strwidth($input)+$pad_length, $pad_string, + strlen($input)-mb_strwidth($input)+$marks+$pad_length, $pad_string, $pad_style); } -- GitLab