Skip to content
Snippets Groups Projects
Commit 46c705f4 authored by Peter Rotich's avatar Peter Rotich
Browse files

html: Decode html entities before sanitizing

Encoded entities can be used to bypass safety checks
Don't remove iframe when using xml_dom to balance tags
parent 69f8a0a3
No related branches found
No related tags found
No related merge requests found
...@@ -136,7 +136,8 @@ class Format { ...@@ -136,7 +136,8 @@ class Format {
// Remove empty nodes // Remove empty nodes
$xpath = new DOMXPath($doc); $xpath = new DOMXPath($doc);
static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1,
'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); 'iframe' => 1, 'hr'=>1, 'img'=>1, 'input'=>1,
'isindex'=>1, 'param'=>1);
do { do {
$done = true; $done = true;
$nodes = $xpath->query('//*[not(text()) and not(node())]'); $nodes = $xpath->query('//*[not(text()) and not(node())]');
...@@ -218,6 +219,17 @@ class Format { ...@@ -218,6 +219,17 @@ class Format {
static function __html_cleanup($el, $attributes=0) { static function __html_cleanup($el, $attributes=0) {
static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1,
'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
// We're dealing with closing tag
if ($attributes === 0)
return "</{$el}>";
// Remove iframe and embed without src (perhaps stripped by spec)
// It would be awesome to rickroll such entry :)
if (in_array($el, array('iframe', 'embed'))
&& (!isset($attributes['src']) || empty($attributes['src'])))
return '';
// Clean unexpected class values // Clean unexpected class values
if (isset($attributes['class'])) { if (isset($attributes['class'])) {
$classes = explode(' ', $attributes['class']); $classes = explode(' ', $attributes['class']);
...@@ -268,7 +280,20 @@ class Format { ...@@ -268,7 +280,20 @@ class Format {
} }
} }
function safe_html($html, $balance=1) { function safe_html($html, $options=array()) {
$options = array_merge(array(
// Balance html tags
'balance' => 1,
// Decoding special html char like &lt; and &gt; which
// can be used to skip cleaning
'decode' => true
),
$options);
if ($options['decode'])
$html = Format::htmldecode($html);
// Remove HEAD and STYLE sections // Remove HEAD and STYLE sections
$html = preg_replace( $html = preg_replace(
array(':<(head|style|script).+?</\1>:is', # <head> and <style> sections array(':<(head|style|script).+?</\1>:is', # <head> and <style> sections
...@@ -278,9 +303,11 @@ class Format { ...@@ -278,9 +303,11 @@ class Format {
), ),
array('', '', '', ''), array('', '', '', ''),
$html); $html);
// HtmLawed specific config only
$config = array( $config = array(
'safe' => 1, //Exclude applet, embed, iframe, object and script tags. 'safe' => 1, //Exclude applet, embed, iframe, object and script tags.
'balance' => $balance, 'balance' => $options['balance'],
'comment' => 1, //Remove html comments (OUTLOOK LOVE THEM) 'comment' => 1, //Remove html comments (OUTLOOK LOVE THEM)
'tidy' => -1, 'tidy' => -1,
'deny_attribute' => 'id', 'deny_attribute' => 'id',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment