Skip to content
Snippets Groups Projects
Commit 98018391 authored by Jared Hancock
Browse files

html2text: Fix several Unicode rendering issues

parent db9cecf4
No related branches found
No related tags found
No related merge requests found
......@@ -28,9 +28,20 @@ function convert_html_to_text($html, $width=74) {
$html = fix_newlines($html);
$doc = new DOMDocument('1.0', 'utf-8');
if (strpos($html, '<?xml ') === false)
$html = '<?xml encoding="utf-8"?>'.$html; # <?php (4vim)
if (!@$doc->loadHTML($html))
return $html;
// Thanks, http://us3.php.net/manual/en/domdocument.loadhtml.php#95251
// dirty fix -- remove the inserted processing instruction
foreach ($doc->childNodes as $item) {
if ($item->nodeType == XML_PI_NODE) {
$doc->removeChild($item); // remove hack
break;
}
}
$elements = identify_node($doc);
// Add the default stylesheet
......@@ -216,7 +227,7 @@ class HtmlInlineElement {
if ($c instanceof HtmlInlineElement)
$this->weight += $c->getWeight();
elseif ($c instanceof DomText)
$this->weight += strlen($c->wholeText);
$this->weight += mb_strwidth($c->wholeText);
}
}
return $this->weight;
......@@ -281,7 +292,7 @@ class HtmlBlockElement extends HtmlInlineElement {
return new PreFormattedText("\n" . $output);
$output = trim($output);
if (!strlen(trim($output)))
if (!strlen($output))
return "";
// Wordwrap the content to the width
......@@ -293,11 +304,12 @@ class HtmlBlockElement extends HtmlInlineElement {
case 'pre-wrap':
case 'normal':
default:
$output = wordwrap($output, $width, "\n", true);
$output = mb_wordwrap($output, $width, "\n", true);
}
// Apply stylesheet styles
// TODO: Padding
// TODO: Justification
// Border
if ($bw)
$output = self::borderize($output, $width);
......@@ -311,7 +323,7 @@ class HtmlBlockElement extends HtmlInlineElement {
/**
 * Draw an ASCII-art box around a block of text.
 *
 * @param string $what  Text to frame; split on "\n" into rows.
 * @param int    $width Width every row is padded to before the right-hand
 *                      border is appended.
 * @return string The framed text, one row per input line, terminated by "\n".
 */
function borderize($what, $width) {
    $output = ',-'.str_repeat('-', $width)."-.\n";
    foreach (explode("\n", $what) as $l)
        // Each line is emitted exactly once, padded with the multi-byte
        // aware helper so the right border stays aligned for non-ASCII text
        $output .= '| '.mb_str_pad($l, $width)." |\n";
    $output .= '`-'.str_repeat('-', $width)."-'\n";
    return $output;
}
......@@ -371,7 +383,7 @@ class HtmlHeadlineElement extends HtmlBlockElement {
default:
return $headline;
}
$length = max(array_map('strlen', explode("\n", $headline)));
$length = max(array_map('mb_strwidth', explode("\n", $headline)));
$headline .= "\n" . str_repeat($line, $length) . "\n";
return $headline;
}
......@@ -392,7 +404,7 @@ class HtmlCiteElement extends HtmlBlockElement {
$lines[0] = "-- " . $lines[0];
// Right justification
foreach ($lines as &$l)
$l = str_pad($l, $width, " ", STR_PAD_LEFT);
$l = mb_str_pad($l, $width, " ", STR_PAD_LEFT);
unset($l);
return implode("\n", $lines);
}
......@@ -408,7 +420,7 @@ class HtmlImgElement extends HtmlInlineElement {
return "[image:$alt$title] ";
}
/**
 * Estimate how many columns the image placeholder occupies.
 *
 * @return int Display width of the "alt" attribute plus 8 columns of fixed
 *             overhead (presumably the literal "[image:" and "]" emitted by
 *             render() -- confirm against render()).
 */
function getWeight() {
    // Measure display width rather than bytes so multi-byte alt text is
    // not over-counted
    return mb_strwidth($this->node->getAttribute("alt")) + 8;
}
}
......@@ -422,7 +434,7 @@ class HtmlAElement extends HtmlInlineElement {
if ($this->node->getAttribute("name") != null) {
$output = "[$output]";
}
} elseif (strlen($href) > $width / 2) {
} elseif (mb_strwidth($href) > $width / 2) {
if ($href != $output)
$this->getRoot()->addFootnote($output, $href);
$output = "[$output]";
......@@ -471,16 +483,20 @@ class HtmlListItem extends HtmlBlockElement {
/**
 * Render a list item with its marker and a hanging indent.
 *
 * The marker is produced from the $options['marker'] format string and the
 * item number. The body is rendered by the parent in the remaining width,
 * and continuation lines are indented by the marker's display width.
 *
 * @param int   $width   Total columns available for the item.
 * @param array $options Rendering options; 'marker' is read as a sprintf()
 *                       format.
 * @return PreFormattedText The rendered item, terminated by "\n".
 */
function render($width, $options) {
    $prefix = sprintf($options['marker'], $this->number);
    // Use the display width of the marker (not its byte length) both for
    // the space left to the content and for the hanging indent
    $indent = mb_strwidth($prefix);
    $lines = explode("\n", trim(parent::render($width-$indent, $options)));
    $lines[0] = $prefix . $lines[0];
    return new PreFormattedText(
        implode("\n".str_repeat(" ", $indent), $lines)."\n");
}
}
/**
 * Inline element rendered as Markdown-style code.
 */
class HtmlCodeElement extends HtmlInlineElement {
    /**
     * Render the content wrapped in backticks.
     *
     * Content containing a line break is emitted as a fenced block
     * (three backticks on their own lines); anything else is wrapped in
     * single backticks.
     *
     * @param int   $width   Columns available; two are reserved for the
     *                       surrounding backticks.
     * @param array $options Rendering options, passed through to the parent.
     * @return string
     */
    function render($width, $options) {
        $content = parent::render($width-2, $options);
        // strpos() yields 0 -- which is falsy -- when the content starts
        // with a newline, so compare against false explicitly
        if (strpos($content, "\n") !== false)
            return "```\n".$content."\n```";
        return "`$content`";
    }
}
......@@ -672,11 +688,7 @@ class HtmlTable extends HtmlBlockElement {
foreach ($r as $x=>$cell) {
$content = (isset($contents[$y][$x][$k]))
? $contents[$y][$x][$k] : "";
$pad = $cell->width - mb_strlen($content, 'utf8');
$output .= " ".$content;
if ($pad > 0)
$output .= str_repeat(" ", $pad);
$output .= " |";
$output .= " ".mb_str_pad($content, $cell->width)." |";
$x += $cell->cols;
}
$output .= "\n";
......@@ -821,6 +833,41 @@ class PreFormattedText {
}
}
// Fallback for installations where mb_strwidth() is unavailable. It counts
// characters, so it cannot account for double-width (e.g. East Asian)
// glyphs.
if (!function_exists('mb_strwidth')) {
    function mb_strwidth($string) {
        // mb_strlen() ships with the same extension as mb_strwidth(), so
        // only rely on it when it (or a project-level substitute) exists
        if (function_exists('mb_strlen'))
            return mb_strlen($string);
        // Count UTF-8 code points with PCRE; preg_match_all() returns
        // false when the subject is not valid UTF-8, in which case the
        // byte length is the best remaining estimate
        $count = preg_match_all('/./us', $string, $matches);
        return ($count === false) ? strlen($string) : $count;
    }
}
// Thanks http://www.php.net/manual/en/function.wordwrap.php#107570
// @see http://www.tads.org/t3doc/doc/htmltads/linebrk.htm
//      for some more line breaking characters and rules
// XXX: This does not wrap Chinese characters well
// @see http://xml.ascc.net/en/utf-8/faq/zhl10n-faq-xsl.html#qb1
//      for some more rules concerning Chinese chars
/**
 * UTF-8 aware counterpart of wordwrap().
 *
 * Lines are broken at whitespace (which is consumed) or directly before an
 * opening punctuation character (\p{Ps}), which is carried over to the next
 * line. The width is counted in UTF-8 characters, not in display columns.
 *
 * @param string $string Text to wrap.
 * @param int    $width  Maximum characters per line; values below 1 are
 *                       treated as 1.
 * @param string $break  Separator inserted at each break point.
 * @param bool   $cut    When true, runs longer than $width with no break
 *                       opportunity are split at $width characters.
 * @return string The wrapped text with trailing $break characters removed.
 *                If the regular expression fails (for instance on invalid
 *                UTF-8 input) the text is returned unwrapped rather than
 *                being dropped.
 */
function mb_wordwrap($string, $width=75, $break="\n", $cut=false) {
    // The width becomes a regex quantifier, so it must be a positive integer
    $width = max(1, (int) $width);
    if ($cut) {
        // Match anything 1 to $width chars long followed by whitespace or EOS,
        // otherwise match anything $width chars long
        $search = '/(.{1,'.$width.'})(?:\s|$|(\p{Ps}))|(.{'.$width.'})/uS';
        $replace = '$1$3'.$break.'$2';
    } else {
        // Anchor the beginning of the pattern with a lookahead
        // to avoid crazy backtracking when words are longer than $width
        $search = '/(?=[\s\p{Ps}])(.{1,'.$width.'})(?:\s|$|(\p{Ps}))/uS';
        $replace = '$1'.$break.'$2';
    }
    $wrapped = preg_replace($search, $replace, $string);
    // preg_replace() returns null on error; keep the caller's text intact
    if ($wrapped === null)
        return $string;
    return rtrim($wrapped, $break);
}
// Thanks http://www.php.net/manual/en/ref.mbstring.php#90611
//
// Pads $input to $pad_length display columns. str_pad() counts bytes, so the
// target length is enlarged by the difference between the byte length and
// the display width of the input.
//
// PHP 8.3 ships a native mb_str_pad(); declaring the function
// unconditionally is a fatal "cannot redeclare" error there, hence the
// guard. NOTE(review): the native function pads by character count rather
// than display width, so wide glyphs may align differently -- confirm this
// is acceptable for callers.
if (!function_exists('mb_str_pad')) {
    function mb_str_pad($input, $pad_length, $pad_string=" ",
            $pad_style=STR_PAD_RIGHT) {
        return str_pad($input,
            strlen($input)-mb_strwidth($input)+$pad_length, $pad_string,
            $pad_style);
    }
}
// Enable use of html2text from command line
// The syntax is the following: php html2text.php file.html
......@@ -832,5 +879,9 @@ do {
$width = 74;
if (isset($argv[2]))
$width = (int) $argv[2];
elseif (isset($ENV['COLUMNS']))
$width = $ENV['COLUMNS'];
require_once(dirname(__file__).'/../bootstrap.php');
Bootstrap::i18n_prep();
echo convert_html_to_text (file_get_contents ($file), $width);
} while (0);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment