Skip to content
Snippets Groups Projects
Commit 430459df authored by Jared Hancock's avatar Jared Hancock
Browse files

html2text: Add underline support for <u> and <a>

Also remove `*` chars from <b> and <strong> elements and change <hr> to use
box drawing Unicode chars.
parent 204100b2
No related branches found
No related tags found
No related merge requests found
...@@ -50,6 +50,11 @@ function convert_html_to_text($html, $width=74) { ...@@ -50,6 +50,11 @@ function convert_html_to_text($html, $width=74) {
'html' => array('white-space' => 'pre'), # Don't wrap footnotes 'html' => array('white-space' => 'pre'), # Don't wrap footnotes
'p' => array('margin-bottom' => '1em'), 'p' => array('margin-bottom' => '1em'),
'pre' => array('white-space' => 'pre'), 'pre' => array('white-space' => 'pre'),
'u' => array('text-decoration' => 'underline'),
'a' => array('text-decoration' => 'underline'),
'b' => array('text-transform' => 'uppercase'),
'strong' => array('text-transform' => 'uppercase'),
'h4' => array('text-transform' => 'uppercase'),
)) ))
); );
$options = array(); $options = array();
...@@ -129,13 +134,6 @@ function identify_node($node, $parent=null) { ...@@ -129,13 +134,6 @@ function identify_node($node, $parent=null) {
case "a": case "a":
return new HtmlAElement($node, $parent); return new HtmlAElement($node, $parent);
case "b":
case "strong":
return new HtmlBElement($node, $parent);
case "u":
return new HtmlUElement($node, $parent);
case "ol": case "ol":
return new HtmlListElement($node, $parent); return new HtmlListElement($node, $parent);
case "ul": case "ul":
...@@ -152,8 +150,8 @@ function identify_node($node, $parent=null) { ...@@ -152,8 +150,8 @@ function identify_node($node, $parent=null) {
default: default:
// print out contents of unknown tags // print out contents of unknown tags
if ($node->hasChildNodes() && $node->childNodes->length == 1) //if ($node->hasChildNodes() && $node->childNodes->length == 1)
return identify_node($node->childNodes->item(0), $parent); // return identify_node($node->childNodes->item(0), $parent);
return new HtmlInlineElement($node, $parent); return new HtmlInlineElement($node, $parent);
} }
...@@ -188,6 +186,19 @@ class HtmlInlineElement { ...@@ -188,6 +186,19 @@ class HtmlInlineElement {
$output = ''; $output = '';
$after_block = false; $after_block = false;
$this->ws = $this->getStyle('white-space', 'normal'); $this->ws = $this->getStyle('white-space', 'normal');
// Direction
$dir = $this->node->getAttribute('dir');
// Ensure we have a value, but don't a control char unless direction
// is declared
$this->dir = $dir ?: 'left';
switch (strtolower($dir)) {
case 'ltr':
$output .= "\xE2\x80\x8E"; # LEFT-TO-RIGHT MARK
break;
case 'rtl':
$output .= "\xE2\x80\x8F"; # RIGHT-TO-LEFT MARK
break;
}
foreach ($this->children as $c) { foreach ($this->children as $c) {
if ($c instanceof DOMText) { if ($c instanceof DOMText) {
// Collapse white-space // Collapse white-space
...@@ -216,6 +227,20 @@ class HtmlInlineElement { ...@@ -216,6 +227,20 @@ class HtmlInlineElement {
elseif (is_string($more)) elseif (is_string($more))
$output .= $more; $output .= $more;
} }
switch ($this->getStyle('text-transform', 'none')) {
case 'uppercase':
$output = mb_strtoupper($output);
break;
}
switch ($this->getStyle('text-decoration', 'none')) {
case 'underline':
// Remove diacritics and underline chars which do not go below
// the baseline
if (class_exists('Normalizer'))
$output = Normalizer::normalize($output, Normalizer::FORM_D);
$output = preg_replace("/[a-fhik-or-xzA-PR-Z0-9#]/u", "$0\xcc\xb2", $output);
break;
}
if ($this->footnotes) { if ($this->footnotes) {
$output .= "\n\n" . str_repeat('-', $width/2) . "\n"; $output .= "\n\n" . str_repeat('-', $width/2) . "\n";
$id = 1; $id = 1;
...@@ -232,7 +257,7 @@ class HtmlInlineElement { ...@@ -232,7 +257,7 @@ class HtmlInlineElement {
if ($c instanceof HtmlInlineElement) if ($c instanceof HtmlInlineElement)
$this->weight += $c->getWeight(); $this->weight += $c->getWeight();
elseif ($c instanceof DomText) elseif ($c instanceof DomText)
$this->weight += mb_strwidth($c->wholeText); $this->weight += mb_strwidth2($c->wholeText);
} }
} }
return $this->weight; return $this->weight;
...@@ -240,7 +265,7 @@ class HtmlInlineElement { ...@@ -240,7 +265,7 @@ class HtmlInlineElement {
function getStyle($property, $default=null, $tag=false, $classes=false) { function getStyle($property, $default=null, $tag=false, $classes=false) {
if ($this->style && $this->style->has($property)) if ($this->style && $this->style->has($property))
return $this->style->get($property); return $this->style->get($property, $default);
if ($tag === false) if ($tag === false)
$tag = $this->node->nodeName; $tag = $this->node->nodeName;
...@@ -315,15 +340,39 @@ class HtmlBlockElement extends HtmlInlineElement { ...@@ -315,15 +340,39 @@ class HtmlBlockElement extends HtmlInlineElement {
// Apply stylesheet styles // Apply stylesheet styles
// TODO: Padding // TODO: Padding
// TODO: Justification
// Justification
static $aligns = array(
'left' => STR_PAD_RIGHT,
'right' => STR_PAD_LEFT,
'center' => STR_PAD_BOTH,
);
$talign = $this->getStyle('text-align', 'none');
if (isset($aligns[$talign])) {
// Explode lines, justify, implode again
$output = array_map(function($l) use ($talign, $aligns, $width) {
return mb_str_pad($l, $width, ' ', $aligns[$talign]);
}, explode("\n", $output)
);
$output = implode("\n", $output);
}
// Border // Border
if ($bw) if ($bw)
$output = self::borderize($output, $width); $output = self::borderize($output, $width);
// Margin // Margin
$mb = $this->getStyle('margin-bottom', 0); $mb = $this->getStyle('margin-bottom', 0);
$output .= str_repeat("\n", (int)$mb); $output .= str_repeat("\n", (int)$mb);
return "\n" . $output; // Add leading newline if not preceed by <br/>
if (!($pn = $this->node->previousSibling)
|| !in_array(strtolower($pn->nodeName), array('br','hr'))
) {
$output = "\n" . $output;
}
return $output;
} }
function borderize($what, $width) { function borderize($what, $width) {
...@@ -340,7 +389,7 @@ class HtmlBlockElement extends HtmlInlineElement { ...@@ -340,7 +389,7 @@ class HtmlBlockElement extends HtmlInlineElement {
if ($c instanceof HtmlBlockElement) if ($c instanceof HtmlBlockElement)
$this->min_width = max($c->getMinWidth(), $this->min_width); $this->min_width = max($c->getMinWidth(), $this->min_width);
elseif ($c instanceof DomText) elseif ($c instanceof DomText)
$this->min_width = max(max(array_map('mb_strwidth', $this->min_width = max(max(array_map('mb_strwidth2',
explode(' ', $c->wholeText))), $this->min_width); explode(' ', $c->wholeText))), $this->min_width);
} }
} }
...@@ -353,25 +402,10 @@ class HtmlBrElement extends HtmlBlockElement { ...@@ -353,25 +402,10 @@ class HtmlBrElement extends HtmlBlockElement {
return "\n"; return "\n";
} }
} }
class HtmlUElement extends HtmlInlineElement {
function render($width, $options) {
$output = parent::render($width, $options);
return "_".str_replace(" ", "_", $output)."_";
}
function getWeight() { return parent::getWeight() + 2; }
}
class HtmlBElement extends HtmlInlineElement {
function render($width, $options) {
$output = parent::render($width, $options);
return "*".$output."*";
}
function getWeight() { return parent::getWeight() + 2; }
}
class HtmlHrElement extends HtmlBlockElement { class HtmlHrElement extends HtmlBlockElement {
function render($width, $options) { function render($width, $options) {
return "\n".str_repeat('-', $width)."\n"; return "\n".str_repeat("\xE2\x94\x80", $width)."\n";
} }
function getWeight() { return 1; } function getWeight() { return 1; }
function getMinWidth() { return 0; } function getMinWidth() { return 0; }
...@@ -384,17 +418,18 @@ class HtmlHeadlineElement extends HtmlBlockElement { ...@@ -384,17 +418,18 @@ class HtmlHeadlineElement extends HtmlBlockElement {
return ""; return "";
switch ($this->node->nodeName) { switch ($this->node->nodeName) {
case 'h1': case 'h1':
$line = "\xE2\x95\x90"; # U+2505
break;
case 'h2': case 'h2':
$line = '='; $line = "\xE2\x94\x81"; # U+2501
break; break;
case 'h3': case 'h3':
case 'h4': $line = "\xE2\x94\x80"; # U+2500
$line = '-';
break; break;
default: default:
return $headline; return $headline;
} }
$length = max(array_map('mb_strwidth', explode("\n", $headline))); $length = max(array_map('mb_strwidth2', explode("\n", $headline)));
$headline .= "\n" . str_repeat($line, $length) . "\n"; $headline .= "\n" . str_repeat($line, $length) . "\n";
return $headline; return $headline;
} }
...@@ -430,7 +465,7 @@ class HtmlImgElement extends HtmlInlineElement { ...@@ -430,7 +465,7 @@ class HtmlImgElement extends HtmlInlineElement {
return "[image:$alt$title] "; return "[image:$alt$title] ";
} }
function getWeight() { function getWeight() {
return mb_strwidth($this->node->getAttribute("alt")) + 8; return mb_strwidth2($this->node->getAttribute("alt")) + 8;
} }
} }
...@@ -447,8 +482,8 @@ class HtmlAElement extends HtmlInlineElement { ...@@ -447,8 +482,8 @@ class HtmlAElement extends HtmlInlineElement {
} elseif (strpos($href, 'mailto:') === 0) { } elseif (strpos($href, 'mailto:') === 0) {
$href = substr($href, 7); $href = substr($href, 7);
$output = (($href != $output) ? "$href " : '') . "<$output>"; $output = (($href != $output) ? "$href " : '') . "<$output>";
} elseif (mb_strwidth($href) > $width / 2) { } elseif (mb_strwidth2($href) > $width / 2) {
if (mb_strwidth($output) > $width / 2) { if (mb_strwidth2($output) > $width / 2) {
// Parse URL and use relative path part // Parse URL and use relative path part
if ($PU = parse_url($output)) if ($PU = parse_url($output))
$output = $PU['host'] . $PU['path']; $output = $PU['host'] . $PU['path'];
...@@ -498,10 +533,10 @@ class HtmlListItem extends HtmlBlockElement { ...@@ -498,10 +533,10 @@ class HtmlListItem extends HtmlBlockElement {
function render($width, $options) { function render($width, $options) {
$prefix = sprintf($options['marker'], $this->number); $prefix = sprintf($options['marker'], $this->number);
$lines = explode("\n", trim(parent::render($width-mb_strwidth($prefix), $options))); $lines = explode("\n", trim(parent::render($width-mb_strwidth2($prefix), $options)));
$lines[0] = $prefix . $lines[0]; $lines[0] = $prefix . $lines[0];
return new PreFormattedText( return new PreFormattedText(
implode("\n".str_repeat(" ", mb_strwidth($prefix)), $lines)."\n"); implode("\n".str_repeat(" ", mb_strwidth2($prefix)), $lines)."\n");
} }
} }
...@@ -853,6 +888,10 @@ if (!function_exists('mb_strwidth')) { ...@@ -853,6 +888,10 @@ if (!function_exists('mb_strwidth')) {
return mb_strlen($string); return mb_strlen($string);
} }
} }
function mb_strwidth2($string) {
$junk = array();
return mb_strwidth($string) - preg_match_all("/\p{M}/u", $string, $junk);
}
// Thanks http://www.php.net/manual/en/function.wordwrap.php#107570 // Thanks http://www.php.net/manual/en/function.wordwrap.php#107570
// @see http://www.tads.org/t3doc/doc/htmltads/linebrk.htm // @see http://www.tads.org/t3doc/doc/htmltads/linebrk.htm
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment