%PDF- %PDF-
Direktori : /home/lightco1/upgrade.lightco.com.au/libraries/regularlabs/helpers/ |
Current File : /home/lightco1/upgrade.lightco.com.au/libraries/regularlabs/helpers/htmlfixer.class.php |
<?php
// -------------------------------------------------
// HTML FIXER v.2.05 15/07/2010
// clean dirty html and make it better, fix open tags
// bad nesting, bad quotes, bad autoclosing tags.
//
// by Giulio Pons, http://www.barattalo.it
// -------------------------------------------------
// usage:
// -------------------------------------------------
// $a = new HtmlFixer();
// $clean_html = $a->getFixedHtml($dirty_html);
// -------------------------------------------------

/**
 * Repairs "dirty" HTML fragments.
 *
 * The input is scanned character by character into a flat list of nodes
 * (see charByCharJob()), a few illegal nestings are undone (see checkTree())
 * and the markup is then serialised again (see findSonsOf()). The whole
 * process is repeated until a pass makes no more corrections, up to
 * MAX_PASSES-equivalent of ten passes.
 *
 * Known limitation: a tag is tokenised by splitting on single spaces, so
 * attribute values that contain spaces are processed word by word.
 */
class HtmlFixer
{
	/** @var string HTML handed to the current pass. */
	public $dirtyhtml;

	/** @var string Result of the last getFixedHtml() call (mirror of $fixedxhtml). */
	public $fixedhtml;

	/**
	 * @var string Working/result buffer. The code has always written to this
	 *             name; it is declared so it is no longer a dynamic property.
	 */
	public $fixedxhtml;

	/** @var array Inline style whitelist; each entry is matched as a substring. Empty means everything is allowed. */
	public $allowed_styles;

	/** @var array Flat node list; every node has the keys pre, tag, post, parentTag and tagType. */
	private $matrix;

	/** @var bool When true, diagnostic HTML is echoed while fixing. */
	public $debug;

	/** @var string Kept for backward compatibility; mirrors $fixedxhtmlDisplayCode. */
	private $fixedhtmlDisplayCode;

	/** @var string Debug rendering of the node tree (previously a dynamic property). */
	public $fixedxhtmlDisplayCode;

	/** @var array Tag types that never get a closing tag. */
	private static $voidTags = array("br", "img", "hr", "input");

	public function __construct()
	{
		$this->dirtyhtml             = "";
		$this->fixedhtml             = "";
		$this->fixedxhtml            = "";
		$this->debug                 = false;
		$this->fixedhtmlDisplayCode  = "";
		$this->fixedxhtmlDisplayCode = "";
		$this->allowed_styles        = array();
		$this->matrix                = array();
	}

	/**
	 * Clean the given HTML and return the repaired markup.
	 *
	 * @param string $dirtyhtml HTML fragment to repair.
	 *
	 * @return string Repaired HTML ("" for empty input).
	 */
	public function getFixedHtml($dirtyhtml)
	{
		$this->dirtyhtml = (string) $dirtyhtml;

		// Reset every buffer so nothing from a previous call can leak into this one.
		$this->fixedhtml             = "";
		$this->fixedxhtml            = "";
		$this->fixedhtmlDisplayCode  = "";
		$this->fixedxhtmlDisplayCode = "";
		$this->matrix                = array();

		/* iterations, every time it's getting better... */
		for ($pass = 0; $pass < 10; $pass++) {
			if ($pass > 0) {
				$this->dirtyhtml = $this->fixedxhtml;
			}

			$errorsFound = $this->charByCharJob();

			$this->fixedxhtml = str_replace('<root>', '', $this->fixedxhtml);
			$this->fixedxhtml = str_replace('</root>', '', $this->fixedxhtml);
			$this->fixedxhtml = $this->removeSpacesAndBadTags($this->fixedxhtml);

			// if no corrections made, stops iteration
			if (!$errorsFound) {
				break;
			}
		}

		$this->fixedhtml            = $this->fixedxhtml;
		$this->fixedhtmlDisplayCode = $this->fixedxhtmlDisplayCode;

		return $this->fixedxhtml;
	}

	/**
	 * Lowercase everything before the first "=" of a tag token.
	 *
	 * Tokens without "=" (tag names, bare words) are lowercased entirely.
	 *
	 * @param string $m One space-separated piece of a tag.
	 *
	 * @return string
	 */
	private function fixStrToLower($m)
	{
		$eq = strpos($m, '=');
		if ($eq === false) {
			return strtolower($m);
		}

		return strtolower(substr($m, 0, $eq)) . substr($m, $eq);
	}

	/**
	 * Make sure the value of an attr=value token is quoted.
	 *
	 * The token is rebuilt as name + "=" + value instead of patching it with
	 * str_replace(), which used to damage the attribute name whenever the
	 * value text also occurred inside it (selected=selected, class=c, ...).
	 *
	 * @param string $s Tag token, possibly ending in ">" or "/>".
	 *
	 * @return string
	 */
	private function fixQuotes($s)
	{
		$eq = strpos($s, '=');
		if ($eq === false) {
			return $s;
		}

		$name  = substr($s, 0, $eq);
		$value = trim(substr($s, $eq + 1));
		if ($value === '') {
			return $s;
		}

		// Split off the end of the tag so it is not quoted together with the value.
		$tail = '';
		if (substr($value, -2) === '/>') {
			$tail = '/>';
		} elseif (substr($value, -1) === '>') {
			$tail = '>';
		}
		if ($tail !== '') {
			$value = (string) substr($value, 0, -strlen($tail));
		}

		// attr=> or attr=/> : there is no value at all, emit an empty quoted one.
		if ($value === '') {
			return $name . '=""' . $tail;
		}

		$first = $value[0];
		if ($first === '"' || $first === "'") {
			// close with the same kind of quote that opened the value
			$quote = $first;
		} else {
			//missing " or ' at the beginning
			$quote = '"'; // thanks to emmanuel@evobilis.com
			$value = '"' . $value;
		}

		//missing " or ' at the end
		$last = substr($value, -1);
		if ($last !== '"' && $last !== "'") {
			$value .= $quote;
		}

		return $name . '=' . $value . $tail;
	}

	/**
	 * Remove non standard attributes and call the fix for quoted attributes.
	 *
	 * @param string $t Raw tag including "<" and ">".
	 *
	 * @return string Normalised tag.
	 */
	private function fixTag($t)
	{
		$t = preg_replace(
			array(
				'/borderColor=([^ >])*/i',
				'/border=([^ >])*/i',
			),
			array(
				'',
				'',
			),
			$t
		);

		$pieces = explode(" ", $t);
		foreach ($pieces as $k => $piece) {
			$piece = $this->fixStrToLower($piece);
			if (strpos($piece, "=") !== false) {
				$piece = $this->fixQuotes($piece); // thanks to emmanuel@evobilis.com
			}
			$pieces[$k] = $piece;
		}

		// every piece is followed by one space; trim() below removes the last one
		$nt = implode(" ", $pieces) . " ";
		$nt = preg_replace("/<( )*/i", "<", $nt);
		$nt = preg_replace("/( )*>/i", ">", $nt);

		return trim($nt);
	}

	/**
	 * Merge many style definitions in the same tag in just one attribute style.
	 *
	 * Attributes are located and removed by offset. This keeps an empty
	 * style="" intact and guarantees termination even when the match differs
	 * in letter case from the literal 'style="'.
	 *
	 * @param string $s Tag to process.
	 *
	 * @return string
	 */
	private function mergeStyleAttributes($s)
	{
		$marker = "style=\"";
		$merged = "";
		$found  = 0;

		while (($start = stripos($s, $marker)) !== false) {
			$valueStart = $start + strlen($marker);
			$valueEnd   = strpos($s, "\"", $valueStart);

			if ($valueEnd === false) {
				// missing closing quote! add missing quote.
				// Leave the loop (instead of returning) so that a placeholder
				// inserted by an earlier iteration is still substituted below.
				$s = preg_replace("/(\/)?>/i", "\"\\1>", $s);
				break;
			}

			$value = (string) substr($s, $valueStart, $valueEnd - $valueStart);

			// the first style attribute marks where the merged one will go, the others vanish
			$s = substr_replace($s, $found === 0 ? "##PUTITHERE##" : "", $start, $valueEnd + 1 - $start);

			if ($value !== "" && substr($value, -1) !== ";") {
				$value .= ";";
			}
			$merged .= $value;
			$found++;
		}

		if (is_array($this->allowed_styles) && count($this->allowed_styles) > 0) {
			// keep only allowed styles by Martin Vool 2010-04-19
			$declarations = explode(';', $merged);
			$merged       = "";
			foreach ($declarations as $declaration) {
				foreach ($this->allowed_styles as $allowed) {
					if (stripos($declaration, $allowed) !== false) {
						$merged .= $declaration . ';';
						break;
					}
				}
			}
		}

		if ($found > 0) {
			$s = str_replace("##PUTITHERE##", "style=\"" . $merged . "\"", $s);
		}

		return $s;
	}

	/**
	 * Turn the tags that have no closing tag (<br>, <img ...>) into their
	 * self-closing form (<br/>, <img ... />).
	 *
	 * @param string $tag  Tag to adjust.
	 * @param string $tipo Tag type as returned by getTypeOfTag().
	 *
	 * @return string
	 */
	private function fixAutoclosingTags($tag, $tipo = "")
	{
		if (in_array($tipo, self::$voidTags, true) && stripos($tag, '/>') === false) {
			$tag = str_replace('>', '/>', $tag);
		}

		return $tag;
	}

	/**
	 * Return the tag name: the first word once "<", ">" and "/" are removed.
	 *
	 * @param string $tag
	 *
	 * @return string
	 */
	private function getTypeOfTag($tag)
	{
		$tag   = trim(preg_replace("/[\>\<\/]/i", "", $tag));
		$words = explode(" ", $tag);

		return $words[0];
	}

	/**
	 * Undo nestings that are not allowed.
	 *
	 * When a node sits inside a parent it may not have, the parent tag is
	 * replaced by a comment and the node, together with every later node
	 * sharing that parent, is attached to the grandparent.
	 *
	 * @return int The number of errors found.
	 */
	private function checkTree()
	{
		$inline = array("p", "b", "i", "font", "u", "small", "strong", "em");
		$bold   = array("b", "strong");
		$italic = array("i", "em");

		// tag type => parent types it cannot stay inside
		$forbiddenParents = array(
			"div"    => $inline,
			"p"      => $inline,
			"b"      => $bold,
			"strong" => $bold,
			"i"      => $italic,
			"em"     => $italic,
			"table"  => array_merge($inline, array("tr", "table")),
		);

		$errorsCounter = 0;
		$total         = count($this->matrix);

		for ($i = 1; $i < $total; $i++) {
			$type = $this->matrix[$i]["tagType"];
			if (!isset($forbiddenParents[$type])) {
				continue;
			}

			$parentIndex = $this->matrix[$i]["parentTag"];
			$parentType  = $this->matrix[$parentIndex]["tagType"];
			if (!in_array($parentType, $forbiddenParents[$type], true)) {
				continue;
			}

			$errorsCounter++;
			if ($this->debug) {
				echo "<div style='color:#ff0000'>Found a <b>" . $type . "</b> tag inside a <b>" . htmlspecialchars($parentType) . "</b> tag at node $i: MOVED</div>";
			}

			$swap = $this->matrix[$parentIndex]["parentTag"];
			if ($this->debug) {
				echo "<div style='color:#ff0000'>Every node that has parent " . $parentIndex . " will have parent " . $swap . "</div>";
			}

			// The comment left in place of the parent is stripped later by removeSpacesAndBadTags().
			$this->matrix[$parentIndex]["tag"]     = "<!-- T A G \"" . $parentType . "\" R E M O V E D -->";
			$this->matrix[$parentIndex]["tagType"] = "";

			// Loose comparison on purpose: parent indexes are stored both as int and as "0".
			for ($j = $total - 1; $j >= $i; $j--) {
				if ($this->matrix[$j]["parentTag"] == $parentIndex) {
					$this->matrix[$j]["parentTag"] = $swap;
				}
			}
		}

		return $errorsCounter;
	}

	/**
	 * Build correct html recursively.
	 *
	 * @param int $parentTag Index of the node whose children are serialised.
	 *
	 * @return string
	 */
	private function findSonsOf($parentTag)
	{
		$out   = "";
		$total = count($this->matrix);

		for ($i = 1; $i < $total; $i++) {
			$node = $this->matrix[$i];

			// Loose comparison on purpose: the root parent is stored as the string "0".
			if ($node["parentTag"] != $parentTag) {
				continue;
			}

			if ($node["tag"] === "") {
				// node created for a closing tag: it only carries text
				$out .= $node["pre"] . $node["post"];
				continue;
			}

			$out .= $node["pre"] . $node["tag"] . $node["post"];
			$out .= $this->findSonsOf($i);

			//write the closing tag
			if ($node["tagType"] !== "" && !in_array($node["tagType"], self::$voidTags, true)) {
				$out .= "</" . $node["tagType"] . ">";
			}
		}

		return $out;
	}

	/**
	 * Same walk as findSonsOf(), but rendering an HTML view of the tree (used for debug).
	 *
	 * @param int $parentTag Index of the node whose children are rendered.
	 *
	 * @return string
	 */
	private function findSonsOfDisplayCode($parentTag)
	{
		$out   = "";
		$total = count($this->matrix);

		for ($i = 1; $i < $total; $i++) {
			$node = $this->matrix[$i];
			if ($node["parentTag"] != $parentTag) {
				continue;
			}

			$hasTag = $node["tag"] !== "";

			$out .= "<div style=\"padding-left:15\"><span style='float:left;background-color:#FFFF99;color:#000;'>{$i}:</span>";
			if ($node["pre"] !== "") {
				$out .= htmlspecialchars($node["pre"]) . "<br>";
			}
			if ($hasTag) {
				$out .= "" . htmlspecialchars($node["tag"]) . "<span style='background-color:red; color:white'>{$i} <em>" . $node["tagType"] . "</em></span>";
			}
			$out .= htmlspecialchars($node["post"]);

			if ($hasTag) {
				$out .= "<div>" . $this->findSonsOfDisplayCode($i) . "</div>\n";
				if ($node["tagType"] !== "" && !in_array($node["tagType"], self::$voidTags, true)) {
					$out .= "<div style='color:red'>" . htmlspecialchars("</" . $node["tagType"] . ">") . "{$i} <em>" . $node["tagType"] . "</em></div>";
				}
			}

			$out .= "</div>\n";
		}

		return $out;
	}

	/**
	 * Strip line breaks, empty formatting tags, Word leftovers, doctype,
	 * comments and processing instructions.
	 *
	 * The rules are applied repeatedly (at most ten rounds) because removing
	 * one empty element can leave its parent empty.
	 *
	 * @param string $s
	 *
	 * @return string
	 */
	private function removeSpacesAndBadTags($s)
	{
		// NOTE(review): the "( )*" groups and the '/ /i' rule are reproduced as
		// found. They look like they may once have targeted &nbsp; / double
		// spaces - confirm against an upstream copy of this file.
		$rules = array(
			'/[\r\n]/i'                                            => ' ',
			'/ /i'                                                 => ' ',
			'/<p([^>])*>( )*\s*<\/p>/i'                            => '',
			'/<span([^>])*>( )*\s*<\/span>/i'                      => '',
			'/<strong([^>])*>( )*\s*<\/strong>/i'                  => '',
			'/<em([^>])*>( )*\s*<\/em>/i'                          => '',
			'/<font([^>])*>( )*\s*<\/font>/i'                      => '',
			'/<small([^>])*>( )*\s*<\/small>/i'                    => '',
			'/<\?xml:namespace([^>])*><\/\?xml:namespace>/i'       => '',
			'/<\?xml:namespace([^>])*\/>/i'                        => '',
			'/class=\"MsoNormal\"/i'                               => '',
			'/<o:p><\/o:p>/i'                                      => ' ',
			'/<!DOCTYPE([^>])*>/i'                                 => '',
			// ".*?" with the s flag matches exactly what "(.|\s)*?" matched,
			// without one group iteration per character.
			'/<!--.*?-->/s'                                        => '',
			'/<\?.*?\?>/s'                                         => '',
		);
		$patterns     = array_keys($rules);
		$replacements = array_values($rules);

		$s = (string) $s;
		for ($round = 0; $round < 10; $round++) {
			$cleaned = preg_replace($patterns, $replacements, trim($s));
			if ($cleaned === null) {
				// PCRE error: keep the last good text instead of losing the document
				break;
			}
			if ($cleaned === $s) {
				// nothing left to remove, further rounds would be identical
				break;
			}
			$s = $cleaned;
		}

		return $s;
	}

	/**
	 * Create an empty node for the matrix.
	 *
	 * @param mixed $parentTag Index of the parent node.
	 *
	 * @return array
	 */
	private function blankNode($parentTag)
	{
		return array(
			"pre"       => "",
			"post"      => "",
			"parentTag" => $parentTag,
			"tag"       => "",
			"tagType"   => "",
		);
	}

	/**
	 * One fixing pass: parse $dirtyhtml into the node matrix, repair every
	 * tag, drop closing tags that were never opened, fix the nesting and
	 * store the rebuilt markup (still wrapped in <root>) in $fixedxhtml.
	 *
	 * @return int Number of nesting errors corrected by checkTree().
	 */
	private function charByCharJob()
	{
		$s = $this->removeSpacesAndBadTags($this->dirtyhtml);
		if ($s === "") {
			// nothing to parse: publish an empty result instead of keeping an old one
			$this->fixedxhtml = "";

			return 0;
		}

		$s      = "<root>" . $s . "</root>";
		$length = strlen($s);

		// Start from a clean matrix on every pass; node 0 is the virtual document root.
		$this->matrix = array(0 => $this->blankNode("0"));

		$padri = array("0"); // stack with the indexes of the tags currently open
		$tags  = array();    // tag type => number of tags of that type currently open
		$ns    = "";         // text collected since the last tag
		$j     = 0;          // index of the last node created

		for ($i = 0; $i < $length; $i++) {
			if ($s[$i] != "<") {
				/* content of the tag */
				$ns .= $s[$i];
				continue;
			}

			/* found a tag: get chars till the end of it */
			$end = strpos($s, ">", $i);
			if ($end === false) {
				// cannot happen while the text is wrapped in <root>...</root>
				break;
			}
			$tag = substr($s, $i, $end - $i + 1);
			$i   = $end;

			$contenuto = $ns;
			$ns        = "";

			/* $tag contains a tag <...chars...> let's clean it! */
			$tag     = $this->fixTag($tag);
			$tagType = $this->getTypeOfTag($tag);
			$tag     = $this->fixAutoclosingTags($tag, $tagType);
			$tag     = $this->mergeStyleAttributes($tag);

			if (!isset($tags[$tagType])) {
				$tags[$tagType] = 0;
			}

			$isClosing = stripos($tag, '/' . $tagType . '>') !== false;

			if ($isClosing && $tags[$tagType] == 0) {
				/* there is a close tag without any open tag, I delete it */
				if ($this->debug) {
					echo "<div style='color:#ff0000'>Found a closing tag <b>" . htmlspecialchars($tag) . "</b> at char $i without open tag: REMOVED</div>";
				}

				// only the tag is dropped: the text in front of it must survive
				$ns = $contenuto;
				continue;
			}

			$j++;
			$this->matrix[$j] = $this->blankNode("");

			if ($isClosing) {
				/* it's the closing tag: it becomes a text-only child of the tag it closes */
				$this->matrix[$j]["post"]      = $contenuto;
				$this->matrix[$j]["parentTag"] = array_pop($padri);
				$tags[$tagType]--;
				continue;
			}

			$this->matrix[$j]["tagType"]   = $tagType;
			$this->matrix[$j]["tag"]       = $tag;
			$this->matrix[$j]["parentTag"] = end($padri);
			$this->matrix[$j]["pre"]       = $contenuto;

			if (strpos($tag, '/>') === false) {
				/* it's a open tag: later nodes become its children */
				$tags[$tagType]++;
				array_push($padri, $j);
			}
			/* otherwise it's a autoclosing tag and the stack stays as it is */
		}

		// errorsCounter contains the number of removed tags
		$errorsCounter = $this->checkTree();

		// build html fixed
		$this->fixedxhtml = $this->findSonsOf(0);

		if ($this->debug) {
			$this->fixedxhtmlDisplayCode = $this->findSonsOfDisplayCode(0);
			echo "<table border=1 cellspacing=0 cellpadding=0>";
			echo "<tr><th>node id</th>";
			echo "<th>pre</th>";
			echo "<th>tag</th>";
			echo "<th>post</th>";
			echo "<th>parentTag</th>";
			echo "<th>tipo</th></tr>";
			for ($k = 0; $k <= $j; $k++) {
				echo "<tr><td>$k</td>";
				echo "<td> " . htmlspecialchars($this->matrix[$k]["pre"]) . "</td>";
				echo "<td> " . htmlspecialchars($this->matrix[$k]["tag"]) . "</td>";
				echo "<td> " . htmlspecialchars($this->matrix[$k]["post"]) . "</td>";
				echo "<td> " . $this->matrix[$k]["parentTag"] . "</td>";
				echo "<td> <i>" . $this->matrix[$k]["tagType"] . "</i></td></tr>";
			}
			echo "</table>";
			echo "<hr/>{$j}<hr/>\n\n\n\n" . $this->fixedxhtmlDisplayCode;
		}

		return $errorsCounter;
	}
}