From a71bdfe4d5f1c9d62c36944c0b6be6a362b74b53 Mon Sep 17 00:00:00 2001 From: Jared Hancock <jared@osticket.com> Date: Sat, 16 May 2015 09:22:49 -0500 Subject: [PATCH] html2text: Better rendering for tables * Support @border and @cellpadding attributes * Support padding for display:block elements --- include/html2text.php | 156 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 128 insertions(+), 28 deletions(-) diff --git a/include/html2text.php b/include/html2text.php index d859d428b..8da0755fc 100644 --- a/include/html2text.php +++ b/include/html2text.php @@ -48,6 +48,7 @@ function convert_html_to_text($html, $width=74) { $elements->getRoot()->addStylesheet( HtmlStylesheet::fromArray(array( 'html' => array('white-space' => 'pre'), # Don't wrap footnotes + 'center' => array('text-align' => 'center'), 'p' => array('margin-bottom' => '1em'), 'pre' => array('white-space' => 'pre'), 'u' => array('text-decoration' => 'underline'), @@ -117,6 +118,7 @@ function identify_node($node, $parent=null) { case "head": case "html": case "body": + case "center": case "div": case "p": case "pre": @@ -172,9 +174,10 @@ class HtmlInlineElement { $this->parent = $parent; $this->node = $node; $this->traverse($node); + $this->style = new CssStyleRules(); if ($node instanceof DomElement && ($style = $this->node->getAttribute('style'))) - $this->style = new CssStyleRules($style); + $this->style->add($style); } function traverse($node) { @@ -216,6 +219,10 @@ class HtmlInlineElement { case 'normal': default: if ($after_block) $more = ltrim($more); + if ($this instanceof HtmlBlockElement && trim($more) == '') + // Ignore pure whitespace in-between elements inside + // block elements + $more = ''; $more = preg_replace('/[ \r\n\t\f]+/mu', ' ', $more); } } @@ -272,6 +279,10 @@ class HtmlInlineElement { return $this->weight; } + function setStyle($property, $value) { + $this->style->set($property, $value); + } + function getStyle($property, $default=null, $tag=false, $classes=false) { if 
($this->style && $this->style->has($property)) return $this->style->get($property, $default); @@ -318,6 +329,14 @@ class HtmlInlineElement { class HtmlBlockElement extends HtmlInlineElement { var $min_width = false; + var $pad_left; + var $pad_right; + + function __construct($node, $parent) { + parent::__construct($node, $parent); + $this->pad_left = str_repeat(' ', max(0, (int) $this->getStyle('padding-left', 0.0))); /* whole, non-negative column count */ + $this->pad_right = str_repeat(' ', max(0, (int) $this->getStyle('padding-right', 0.0))); /* whole, non-negative column count */ + } function render($width, $options) { // Allow room for the border. @@ -329,12 +348,16 @@ class HtmlBlockElement extends HtmlInlineElement { $output = parent::render($width, $options); if ($output instanceof PreFormattedText) // TODO: Consider CSS rules - return new PreFormattedText("\n" . $output); + return $output; + // Leading and trailing whitespace is ignored in block elements $output = trim($output); if (!strlen($output)) return ""; + // Padding + $width -= strlen($this->pad_left) + strlen($this->pad_right); + // Wordwrap the content to the width switch ($this->ws) { case 'nowrap': @@ -347,9 +370,6 @@ class HtmlBlockElement extends HtmlInlineElement { $output = mb_wordwrap($output, $width, "\n", true); } - // Apply stylesheet styles - // TODO: Padding - // Justification static $aligns = array( 'left' => STR_PAD_RIGHT, @@ -357,10 +377,19 @@ class HtmlBlockElement extends HtmlInlineElement { 'center' => STR_PAD_BOTH, ); $talign = $this->getStyle('text-align', 'none'); + $self = $this; if (isset($aligns[$talign])) { // Explode lines, justify, implode again - $output = array_map(function($l) use ($talign, $aligns, $width) { - return mb_str_pad($l, $width, ' ', $aligns[$talign]); + $output = array_map(function($l) use ($talign, $aligns, $width, $self) { + return $self->pad_left.mb_str_pad($l, $width, ' ', $aligns[$talign]).$self->pad_right; + }, explode("\n", $output) + ); + $output = implode("\n", $output); + } + // Apply left and right padding, if specified + elseif ($this->pad_left
|| $this->pad_right) { + $output = array_map(function($l) use ($self) { + return $self->pad_left.$l.$self->pad_right; }, explode("\n", $output) ); $output = implode("\n", $output); @@ -371,7 +400,8 @@ class HtmlBlockElement extends HtmlInlineElement { $output = self::borderize($output, $width); // Margin - $mb = $this->getStyle('margin-bottom', 0); + $mb = $this->getStyle('margin-bottom', 0.0) + + $this->getStyle('padding-bottom', 0.0); $output .= str_repeat("\n", (int)$mb); return $output."\n"; @@ -395,7 +425,7 @@ class HtmlBlockElement extends HtmlInlineElement { explode(' ', $c->wholeText))), $this->min_width); } } - return $this->min_width; + return $this->min_width + strlen($this->pad_left) + strlen($this->pad_right); } } @@ -553,11 +583,23 @@ class HtmlCodeElement extends HtmlInlineElement { } class HtmlTable extends HtmlBlockElement { + var $body; + var $foot; + var $rows; + var $border = true; + var $padding = true; + function __construct($node, $parent) { $this->body = array(); $this->foot = array(); $this->rows = &$this->body; parent::__construct($node, $parent); + $A = $this->node->getAttribute('border'); + if ($this->node->hasAttribute('border')) /* getAttribute() yields '' (never null) when absent, so keep the default unless the attribute exists */ + $this->border = (bool) $A; + $A = $this->node->getAttribute('cellpadding'); + if ($this->node->hasAttribute('cellpadding')) /* same: only override the default for an explicit attribute */ + $this->padding = (bool) $A; } function getMinWidth() { @@ -566,7 +608,7 @@ class HtmlTable extends HtmlBlockElement { foreach ($r as $cell) $this->min_width = max($this->min_width, $cell->getMinWidth()); } - return $this->min_width + 4; + return $this->min_width + ($this->border ? 2 : 0) + ($this->padding ?
2 : 0); } function getWeight() { @@ -665,6 +707,7 @@ class HtmlTable extends HtmlBlockElement { $i = 0; foreach ($r as $cell) { for ($j=0; $j<$cell->cols; $j++) { + // TODO: Use cell-specified width $weights[$i] = max($weights[$i], $cell->getWeight()); $mins[$i] = max($mins[$i], $cell->getMinWidth()); } @@ -673,7 +716,8 @@ class HtmlTable extends HtmlBlockElement { } # Subtract internal padding and borders from the available width - $inner_width = $width - $cols*3 - 1; + $inner_width = $width - ($this->border ? $cols + 1 : 0) + - ($this->padding ? $cols*2 : 0); # Optimal case, where the preferred width of all the columns is # doable @@ -692,7 +736,9 @@ class HtmlTable extends HtmlBlockElement { $widths[] = (int)($inner_width * $c / $total); $this->_fixupWidths($widths, $mins); } - $outer_width = array_sum($widths) + $cols*3 + 1; + $outer_width = array_sum($widths) + + ($this->border ? $cols + 1 : 0) + + ($this->padding ? $cols * 2 : 0); $contents = array(); $heights = array(); @@ -708,7 +754,8 @@ class HtmlTable extends HtmlBlockElement { # Compute the effective cell width for spanned columns # Add extra space for the unneeded border padding for # spanned columns - $cwidth = ($cell->cols - 1) * 3; + $cwidth = ($this->border ? ($cell->cols - 1) : 0) + + ($this->padding ? ($cell->cols - 1) * 2 : 0); for ($j = 0; $j < $cell->cols; $j++) $cwidth += $widths[$x+$j]; # Stash the computed width so it doesn't need to be @@ -716,7 +763,8 @@ class HtmlTable extends HtmlBlockElement { $cell->width = $cwidth; unset($data); $data = explode("\n", $cell->render($cwidth, $options)); - $heights[$y] = max(count($data), $heights[$y]); + // NOTE: block elements have trailing newline + $heights[$y] = max(count($data)-1, $heights[$y]); $contents[$y][$i] = &$data; $x += $cell->cols; } @@ -724,29 +772,34 @@ class HtmlTable extends HtmlBlockElement { # Build the header $header = ""; - for ($i = 0; $i < $cols; $i++) - $header .= "+-" . str_repeat("-", $widths[$i]) . 
"-"; - $header .= "+"; + if ($this->border) { + $padding = $this->padding ? '-' : ''; + for ($i = 0; $i < $cols; $i++) { + $header .= '+'.$padding.str_repeat("-", $widths[$i]).$padding; + } + $header .= "+\n"; + } # Emit the rows - $output = "\n"; if (isset($this->caption)) { $this->caption = $this->caption->render($outer_width, $options); } + $border = $this->border ? '|' : ''; + $padding = $this->padding ? ' ' : ''; foreach ($rows as $y=>$r) { - $output .= $header . "\n"; + $output .= $header; for ($x = 0, $k = 0; $k < $heights[$y]; $k++) { - $output .= "|"; + $output .= $border; foreach ($r as $x=>$cell) { $content = (isset($contents[$y][$x][$k])) ? $contents[$y][$x][$k] : ""; - $output .= " ".mb_str_pad($content, $cell->width)." |"; + $output .= $padding.mb_str_pad($content, $cell->width).$padding.$border; $x += $cell->cols; } $output .= "\n"; } } - $output .= $header . "\n"; + $output .= $header; return new PreFormattedText($output); } } @@ -759,10 +812,14 @@ class HtmlTableCell extends HtmlBlockElement { if (!$this->cols) $this->cols = 1; if (!$this->rows) $this->rows = 1; + + // Upgrade old attributes + if ($A = $this->node->getAttribute('align')) + $this->setStyle('text-align', $A); } function render($width, $options) { - return ltrim(parent::render($width, $options)); + return parent::render($width, $options); } function getWeight() { @@ -821,13 +878,48 @@ class HtmlStylesheet { class CssStyleRules { var $rules = array(); - function __construct($rules) { + static $compact_rules = array( + 'padding' => 1, + ); + + function __construct($rules='') { + if ($rules) + $this->add($rules); + } + + function add($rules) { foreach (explode(';', $rules) as $r) { if (strpos($r, ':') === false) continue; list($prop, $val) = explode(':', $r); - $this->rules[trim($prop)] = trim($val); + $prop = trim($prop); // TODO: Explode compact rules, like 'border', 'margin', etc. 
+ if (isset(self::$compact_rules[$prop])) + $this->expand($prop, trim($val)); + else + $this->rules[$prop] = trim($val); + } + } + + function expand($prop, $val) { + switch (strtolower($prop)) { + case 'padding': + list($a, $b, $c, $d) = array_pad(preg_split('/\s+/', $val), 4, null); /* pad to 4 so omitted shorthand values are null */ + if (!isset($b)) { + $d = $c = $b = $a; + } + elseif (!isset($c)) { + $d = $b; + $c = $a; + } + elseif (!isset($d)) { + $d = $b; + } + $this->rules['padding-top'] = $a; + $this->rules['padding-right'] = $b; + $this->rules['padding-bottom'] = $c; + $this->rules['padding-left'] = $d; + } } } @@ -853,18 +945,26 @@ class CssStyleRules { return $val; } - static function convert($value, $units) { + function set($prop, $value) { + $this->rules[$prop] = $value; + } + + static function convert($value, $units, $max=0) { if ($value === null) return $value; // Converts common CSS units to units of characters switch ($units) { + default: + if (substr($units, -1) == '%') { + return ((float) $value) * 0.01 * $max; + } case 'px': - return $value / 20.0; + // 600px =~ 60chars + return (int) ($value / 10.0); case 'pt': return $value / 12.0; case 'em': - default: return $value; } } -- GitLab