Skip to content
Snippets Groups Projects
html2text.php 33.5 KiB
Newer Older
// XXX: This does not wrap Chinese characters well
// @see http://xml.ascc.net/en/utf-8/faq/zhl10n-faq-xsl.html#qb1
//      for some more rules concerning Chinese chars
/**
 * UTF-8 aware counterpart of wordwrap(), implemented with a single
 * preg_replace() pass.
 *
 * @param string $string Text to wrap; matched with the /u modifier, so it
 *                       is expected to be valid UTF-8.
 * @param int    $width  Upper bound on the length of each matched run; it
 *                       is interpolated into the pattern as a repetition
 *                       count.
 * @param string $break  Text inserted at each wrap point.
 * @param bool   $cut    When true, a run of $width characters with no
 *                       break opportunity is split anyway; when false
 *                       such runs are left untouched.
 * @return string The wrapped text with trailing characters of $break
 *                trimmed; the unwrapped input if the regex engine fails.
 */
function mb_wordwrap($string, $width=75, $break="\n", $cut=false) {
  if ($cut) {
    // Match 1 to $width characters (a base character together with its
    // combining marks counts once) followed by a space, a newline, the
    // end of the string or opening punctuation; otherwise match exactly
    // $width characters so over-long runs are cut.
    $search = '/((?>[^\n\p{M}]\p{M}*){1,'.$width.'})(?:[ \n]|$|(\p{Ps}))|((?>[^\n\p{M}]\p{M}*){'
          .$width.'})/uS'; # <?php
    $replace = '$1$3'.$break.'$2';
  } else {
    // Anchor the beginning of the pattern with a lookahead
    // to avoid crazy backtracking when words are longer than $width.
    // This must be stored in $search: that is the variable handed to
    // preg_replace() below.
    $search = '/(?=[\s\p{Ps}])(.{1,'.$width.'})(?:\s|$|(\p{Ps}))/uS';
    $replace = '$1'.$break.'$2';
  }

  $wrapped = preg_replace($search, $replace, $string);
  if ($wrapped === null) {
    // preg_replace() reports failure (e.g. malformed UTF-8 or an invalid
    // $width) with null; keep the text rather than returning nothing.
    $wrapped = (string) $string;
  }
  return rtrim($wrapped, $break);
}

// Thanks http://www.php.net/manual/en/ref.mbstring.php#90611
function mb_str_pad($input, $pad_length, $pad_string=" ",
        $pad_style=STR_PAD_RIGHT) {
    $marks = preg_match_all('/\p{M}/u', $input, $match);
    return str_pad($input,
        strlen($input)-mb_strwidth($input)+$marks+$pad_length, $pad_string,
// Enable use of html2text from the command line.
// Usage: php html2text.php file.html [width]

// Command-line entry point. The do/while(0) wrapper lets every guard bail
// out with `break`, so including this file from another script does not
// run the converter.
do {
  // Run only under the CLI SAPI, with at least one argument, and only
  // when the executing script path mentions html2text.php.
  if (PHP_SAPI != 'cli') break;
  if (empty ($_SERVER['argc']) || $_SERVER['argc'] < 2) break;
  if (empty ($_SERVER['PHP_SELF']) || FALSE === strpos ($_SERVER['PHP_SELF'], 'html2text.php') ) break;
  $file = $argv[1];

  // Wrap width: explicit second argument, else the COLUMNS environment
  // variable, else 74.
  $width = 74;
  if (isset($argv[2])) {
      $width = (int) $argv[2];
  } else {
      // $ENV is not a PHP superglobal, so the environment has to be read
      // through getenv(); it returns false when the variable is unset.
      $columns = getenv('COLUMNS');
      if ($columns !== false && (int) $columns > 0)
          $width = (int) $columns;
  }

  require_once(dirname(__file__).'/../bootstrap.php');
  Bootstrap::i18n_prep();

  // file_get_contents() returns false on failure; report that instead of
  // handing a boolean to the converter.
  $html = file_get_contents ($file);
  if ($html === false) {
      fwrite(STDERR, "html2text: unable to read '$file'\n");
      exit(1);
  }
  echo convert_html_to_text ($html, $width);
} while (0);