* VERSION: 2006-10-31:15:56
* (a timestamp to see if you have the latest version)
********************************************************************
* LICENSING/COPYRIGHT
* CS Wagner is the original author of this function
* and has had to fend off the attacks of the HTML_TIDY crowd to make
* this function a reality. Any adaptation (or bug fix) is allowed
* as long as the new script is emailed to cs@kainaw.com.
* There is no licensing for this script. Everyone may use it for
* any purpose that they choose.
********************************************************************
* USAGE
* If you have html in a string that you want to clean up, use:
*   $clean_html = html_tidier($messy_html);
* If you have html in a file, use:
*   $clean_html = html_tidier_file($filename);
* Both functions return the clean HTML as a string.
********************************************************************
* WHY NOT HTML_TIDY?
* I tried to use HTML_TIDY from Sourceforge, but it lacked the main
* feature that I required: removing/merging repeated redundant tags.
* So, I wrote this, not to really clean up the HTML and make it
* perfectly HTML4.0 or XML compliant, but to remove all the extra
* garbage that WYSIWYG editors add.
********************************************************************
* SETTINGS
* You can customize a lot of how this works with the global arrays
* at the top of the file:
* $HTML_TIDIER_EMPTY_TAGS
*   These are tags that are expected to be empty - they do not have
*   more HTML between an open and close tag.
* $HTML_TIDIER_ALLOWED_EMPTY
*   These are tags that are allowed to be empty. Normally, tags
*   that do not contain anything (such as <b></b>) are simply
*   deleted from the clean HTML.
* $HTML_TIDIER_APPEND_ATTRIBUTES
*   When a tag is repeated, the last attribute is the one used. For
*   example, <font size=1><font size=3>some text</font></font> would
*   become <font size=3>some text</font> because the size of 3 will
*   override the size of 1.
* When you do not want the attribute to overwrite, such as with the
* style attribute, add the attribute name to the array (as a key) and
* the separating character as a value.
* $HTML_TIDIER_ALLOWED_REPEAT
* Normally, any tag that is repeated in a redundant fashion is
* deleted from the clean HTML. Tags in this array are allowed to
* be redundantly repeated.
* $HTML_TIDIER_RENAME
* Replace tag names with more common tag names (required to remove
* redundant repetitions such as <b><strong>text</strong></b>).
********************************************************************/

$HTML_TIDIER_EMPTY_TAGS = array("area", "base", "basefont", "br", "hr", "img", "input", "isindex", "link", "meta", "nextid", "param", "title");
// Items in $HTML_TIDIER_EMPTY_TAGS do not need to be repeated in $HTML_TIDIER_ALLOWED_EMPTY.
$HTML_TIDIER_ALLOWED_EMPTY = array("li", "option", "p", "style", "script", "td", "textarea");
$HTML_TIDIER_APPEND_ATTRIBUTES = array("style"=>";");
$HTML_TIDIER_ALLOWED_REPEAT = array("p");
$HTML_TIDIER_RENAME = array("strong"=>"b", "em"=>"i");

/**
 * This is the main class for all tags.
 * It is designed to be a container.
 * It has attributes and contents.
 *
 * Contents are a mix of plain strings (text data) and nested
 * html_tidier_tag objects. Two marker names get special treatment:
 * a name starting with "__" (the default "__NONAME__" container and
 * tags marked "__DELETED__") is never printed itself, only its contents.
 */
class html_tidier_tag
{
    var $name;       // Name of the tag (lower case, after renaming)
    var $attributes; // Attribute array for the tag (key => value)
    var $contents;   // Array of contents (strings and html_tidier_tag objects)
    var $closed;     // Has the tag been closed?

    /**
     * Constructor
     * @param name Name of the tag
     * @param attributes Array of attributes (or attribute string key1=val1 key2=val2...)
     * @param contents Array of contents (a single non-array item is wrapped in an array)
     */
    function __construct($name="__NONAME__", $attributes=null, $contents=null)
    {
        global $HTML_TIDIER_EMPTY_TAGS, $HTML_TIDIER_RENAME;

        // Set the name of the tag (replacing as required).
        // The rename table is keyed by lower-case names, so look up the
        // lower-cased name - exactly what close() does.
        $this->name = strtolower((string)$name);
        if(isset($HTML_TIDIER_RENAME[$this->name]))
            $this->name = $HTML_TIDIER_RENAME[$this->name];

        // Set the attributes for the tag.
        // The loose comparison is deliberate: null, "" and array() all mean "none".
        if($attributes == null)
            $this->attributes = array();
        elseif(is_array($attributes))
            $this->attributes = $attributes;
        else
            $this->attributes = html_tidier_parse_attributes($attributes);

        // Set the contents of the tag (same loose "nothing given" test).
        if($contents == null)
            $this->contents = array();
        elseif(is_array($contents))
            $this->contents = $contents;
        else
            $this->contents = array($contents);

        // Close it if it is an empty tag
        $this->closed = in_array($this->name, $HTML_TIDIER_EMPTY_TAGS, true);
    }

    /**
     * PHP 4 style constructor name, kept as a plain method so that code
     * calling it explicitly keeps working. It simply (re)initialises the tag.
     * @param name Name of the tag
     * @param attributes Array of attributes (or attribute string)
     * @param contents Array of contents
     */
    function html_tidier_tag($name="__NONAME__", $attributes=null, $contents=null)
    {
        $this->__construct($name, $attributes, $contents);
    }

    /**
     * Add an item to the tag.
     * This adds to the open tag inside this tag.
     * @param item Item to add (a string of data or an html_tidier_tag)
     * @return Result of array_push() on the tag that finally received the item
     */
    function add($item)
    {
        $this->repair(); // Make sure the tag is not broken

        // Try to add it to an open content item first.
        // Objects are handles, so no by-reference iteration is needed.
        foreach($this->contents as $content) {
            if(is_object($content) && $content->is_open())
                return $content->add($item);
        }

        // All contents were closed, add it to this tag.
        return array_push($this->contents, $item);
    }

    /**
     * Is the tag still open?
     * @return true if open, false if closed
     */
    function is_open()
    {
        return !$this->closed;
    }

    /**
     * Is the tag closed?
     * @return true if closed, false if open
     */
    function is_closed()
    {
        return $this->closed;
    }

    /**
     * Close the tag with the given name.
     * Open tags nested inside this one are tried first; this tag closes
     * itself only when no nested open tag accepted the name.
     * @param name Name of the tag to close
     * @return true if a tag was closed, false otherwise
     */
    function close($name=null)
    {
        global $HTML_TIDIER_RENAME;

        $name = strtolower((string)$name);
        if(isset($HTML_TIDIER_RENAME[$name]))
            $name = $HTML_TIDIER_RENAME[$name];

        $this->repair(); // Make sure the tag is not broken

        // Try to close a contained tag first
        foreach($this->contents as $content) {
            if(is_object($content) && $content->is_open() && $content->close($name))
                return true;
        }

        // Couldn't close a contained tag, is this tag the same name?
        if($name === $this->name) {
            $this->closed = true;
            return true;
        }

        // No tag to close
        return false;
    }

    /**
     * Tidy up the HTML.
     * Tidies the contents first, drops empty data and deleted tags, marks
     * this tag "__DELETED__" when it ends up empty (unless it may be empty),
     * and finally merges a redundant repetition of this tag found below it.
     */
    function tidy()
    {
        global $HTML_TIDIER_EMPTY_TAGS, $HTML_TIDIER_ALLOWED_EMPTY, $HTML_TIDIER_ALLOWED_REPEAT;

        $this->repair(); // Make sure the tag is not broken

        // Run tidy on all of the contents first.
        // Iterate over a snapshot of the keys because entries are removed on the way.
        foreach(array_keys($this->contents) as $i) {
            $content = $this->contents[$i];
            if(is_object($content)) {
                $content->tidy();
                // If the content item is named "__DELETED__", remove it.
                if($content->name == "__DELETED__")
                    unset($this->contents[$i]);
            }
            // Remove empty data strings.
            elseif(trim($content) == "")
                unset($this->contents[$i]);
        }

        // If this is an empty tag (and is not allowed to be empty) delete it.
        if((sizeof($this->contents) == 0)
            && !in_array($this->name, $HTML_TIDIER_EMPTY_TAGS, true)
            && !in_array($this->name, $HTML_TIDIER_ALLOWED_EMPTY, true))
            $this->name = "__DELETED__";

        // Tags listed in $HTML_TIDIER_ALLOWED_REPEAT keep their repetitions,
        // as described in the settings notes at the top of the file.
        $may_repeat = is_array($HTML_TIDIER_ALLOWED_REPEAT)
            && in_array($this->name, $HTML_TIDIER_ALLOWED_REPEAT, true);

        // If this has just one content, merge redundant tags.
        if((sizeof($this->contents) == 1) && !$may_repeat) {
            $content = end($this->contents); // the single remaining item
            if(is_object($content)) {
                $merger = $content->merge_up($this->name);
                if($merger !== null)
                    $this->merge_attributes($merger);
            }
        }
    }

    /**
     * Merge up redundant tags.
     * Follows the chain of single contents below this tag. Every tag on
     * that chain whose name matches is marked "__DELETED__" (html() then
     * prints only its contents) and its attributes are handed upwards.
     * @param name Name of tags to merge up
     * @return redundant tag object, null if no tag is found
     */
    function merge_up($name)
    {
        $this->repair(); // Make sure the tag is not broken

        // Can the contained tag be merged?
        $merger = null;
        if(sizeof($this->contents) == 1) {
            $content = end($this->contents);
            if(is_object($content))
                $merger = $content->merge_up($name);
        }

        // Can this tag be merged?
        if($this->name == $name) {
            if($merger !== null)
                $this->merge_attributes($merger);
            $this->name = "__DELETED__";
            return $this;
        }

        // This tag has a different name. A match found deeper down has
        // already been marked deleted, so it must still be passed upwards -
        // otherwise its attributes would be lost.
        return $merger;
    }

    /**
     * Copy the attributes of another tag into this tag.
     * Attributes named in $HTML_TIDIER_APPEND_ATTRIBUTES are appended using
     * the configured separator; every other attribute is overwritten.
     * @param tag Tag whose attributes are merged into this one
     */
    function merge_attributes($tag)
    {
        global $HTML_TIDIER_APPEND_ATTRIBUTES;

        $tag->repair(); // Its attributes must be an array
        foreach($tag->attributes as $key=>$val) {
            if(isset($this->attributes[$key]) && isset($HTML_TIDIER_APPEND_ATTRIBUTES[$key])) {
                $separator = $HTML_TIDIER_APPEND_ATTRIBUTES[$key];
                // Only add the separator when the current value does not already end with it.
                if(($this->attributes[$key] != "")
                    && (substr($this->attributes[$key], -strlen($separator)) != $separator))
                    $this->attributes[$key].= $separator;
                $this->attributes[$key].= $val;
            }
            else
                $this->attributes[$key] = $val;
        }
    }

    /**
     * Fix broken tags.
     * Attributes must be an array.
     * Contents must be an array.
     */
    function repair()
    {
        if(!is_array($this->attributes))
            $this->attributes = html_tidier_parse_attributes($this->attributes);
        if(!is_array($this->contents))
            $this->contents = array($this->contents);
    }

    /**
     * Get the HTML.
     * NOTE(review): attribute values are written out as stored, without
     * escaping - confirm that html_tidier_parse_attributes() keeps them safe.
     * @param indent Indention characters
     * @return Cleaned HTML (as a string)
     */
    function html($indent="")
    {
        global $HTML_TIDIER_EMPTY_TAGS;

        $this->repair(); // Make sure the tag is not broken.

        $html = "";
        if(substr($this->name, 0, 2) == "__") {
            // Container or deleted tag: print only the contents, at the same depth.
            foreach($this->contents as $content) {
                if(!is_object($content))
                    $html.= $indent.$content."\n";
                else
                    $html.= $content->html($indent);
            }
        }
        else {
            $html.= $indent."<".$this->name;
            foreach($this->attributes as $key=>$val) {
                // A "/" key is left over from self-closing syntax; it is not an attribute.
                if((string)$key !== "/")
                    $html.= " $key=\"$val\"";
            }
            if(in_array($this->name, $HTML_TIDIER_EMPTY_TAGS, true))
                $html.= " />\n";
            else {
                $html.= ">\n";
                foreach($this->contents as $content) {
                    if(!is_object($content))
                        $html.= $indent." ".$content."\n";
                    else
                        $html.= $content->html($indent." ");
                }
                $html.= $indent."</".$this->name.">\n";
            }
        }
        return $html;
    }
}

/**
 * Clean HTML.
 * NOTE(review): the loop header and the per-character fetch were restored
 * from a damaged copy of this file; the branch bodies are original. Please
 * compare with the upstream script.
 * @param html HTML to clean up
 * @return Cleaned HTML
 */
function html_tidier($html)
{
    $page = new html_tidier_tag(); // A container for all the HTML
    $tag = "";       // Current tag text
    $data = "";      // Current data text
    $intag = false;  // Am I in a tag?

    // Parse all of the text character by character...
    $html = (string)$html;
    $length = strlen($html);
    for($i=0; $i<$length; $i++) {
        $c = $html[$i];
        if($intag) {
            // If it is the end of the tag, turn the collected text into a tag
            if($c == ">") {
                // Get the attributes
                // __TAGNAME__ is appended to force the name to come through as an attribute
                $attributes = html_tidier_parse_attributes("__TAGNAME__=".$tag);
                // Pull the name out of the attributes
                $name = isset($attributes["__TAGNAME__"]) ? strtolower($attributes["__TAGNAME__"]) : "";
                unset($attributes["__TAGNAME__"]);
                if(substr($name, 0, 1) == "/")
                    $page->close(substr($name, 1));
                else
                    $page->add(new html_tidier_tag($name, $attributes));
                $intag = false;
                $tag = $data = "";
            }
            else {
                $tag.= $c;
            }
        }
        else {
            // If it is an opening tag, add any data we've found
            if($c == "<") {
                $intag = true;
                $data = html_tidier_remove_whitespace($data);
                if($data != "") $page->add($data);
                $tag = $data = "";
            }
            else {
                $data.= $c;
            }
        }
    }

    // If there is data left over, add it too
    $data = html_tidier_remove_whitespace($data);
    if($data != "") $page->add($data);

    // Clean up and return the HTML
    $page->tidy();
    return $page->html();
}

/**
 * A wrapper to send a file to html_tidier.
* @param file Filename
* @return Clean HTML (a file that cannot be opened is treated as empty input)
*/
function html_tidier_file($file)
{
    $html = "";

    // Best effort by design: a file that cannot be opened is tidied as an empty string.
    $fp = @fopen($file, "r");
    if($fp) {
        // Compare with false explicitly - a final chunk of "0" is falsy
        // and would otherwise end the loop early and be dropped.
        while(($b = fgets($fp, 1024)) !== false)
            $html.= $b;
        fclose($fp);
    }

    return html_tidier($html);
}

/**
 * Remove whitespace from HTML.
 * All whitespace is replaced with a single space character.
 * @param html HTML that may have whitespace
 * @return All whitespace cleaned to a single space
 */
function html_tidier_remove_whitespace($html)
{
    // Collapse every run of whitespace in one pass. Looping until no
    // space is left can never finish once the text contains a space.
    $html = preg_replace("/\\s+/", " ", (string)$html);
    return trim((string)$html);
}

/**
 * Parse the attributes of a tag.
 * @param str Attribute string
 * @return Array of attribute_keys=>attribute_values
 */
function html_tidier_parse_attributes($str)
{
    // Replace whitespace with regular spaces.
    $str = preg_replace("/\\s/", " ", $str);
    // Replace equal signs and spaces in values with temp strings
    $tstr = "";
    $inq = null;
    for($i=0; $i