Feb 14th, 2013
Here's a snippet that parses some HTML to find the elements that have a certain class, using PHP's DOM extension (http://php.net/manual/en/book.dom.php) and XPath (http://msdn.microsoft.com/en-us/library/ms256086.aspx).
<?php
// Find every element that carries a given CSS class and print its full class attribute.

// create a new DOMDocument
// @see http://www.php.net/manual/en/class.domdocument.php
$dom = new DOMDocument;

// Collect libxml errors internally so that HTML5 elements (or other markup libxml
// complains about) don't cause PHP notices. The previous setting is remembered so
// it can be put back once parsing is done.
$previousErrorSetting = libxml_use_internal_errors(true);

// load in some html as a string
$dom->loadHTML('<div class="stupid error face"></div>');
// uncomment the next line to load in an HTML file instead; can be a URI
// (DOMDocument::load() is the XML loader, loadHTMLFile() is the HTML one)
// $dom->loadHTMLFile($filePath);

// clear any errors generated from HTML5 elements or other bad markup,
// then restore whatever error handling was active before
libxml_clear_errors();
libxml_use_internal_errors($previousErrorSetting);

$finder = new DomXPath($dom);

// NOTE: $classname is interpolated into the XPath expression inside single quotes,
// so it must not contain a single quote itself.
$classname = "error";

// create new DOMNodeList
// @see http://www.php.net/manual/en/class.domnodelist.php
// http://stackoverflow.com/questions/6366351/getting-dom-elements-by-class-name#answer-6366390
//
// A plain contains(@class, 'error') is a substring test and would also match classes
// such as "errors" or "no-error". Padding the whitespace-normalized attribute value and
// the class name with spaces makes the test match whole class tokens only.
$nodes = $finder->query(
    "//*[contains(concat(' ', normalize-space(@class), ' '), ' $classname ')]"
);
// $nodes = $dom->getElementsByTagName('a'); // another example

foreach ($nodes as $domnode) {
    // each $domnode is a DOMElement
    // @see http://php.net/manual/en/class.domelement.php
    // The XPath predicate above only selects elements whose class attribute contains
    // the class name, so the attribute is guaranteed to exist and to be non-empty here.
    print $domnode->getAttribute('class');
}
?>
Another example, this time returning the first link found in a string of HTML:
<?php
/**
 * Parse HTML and return the first link found.
 *
 * @param string $html HTML markup to search.
 *
 * @return string The first <a> element, serialized with DOMDocument::saveXML(),
 *                or an empty string when $html is blank or contains no <a> element.
 */
function get_link_from_html($html)
{
    $html = (string) $html;

    // loadHTML() refuses an empty string (a warning on older PHP versions, a ValueError
    // on PHP 8), and blank input cannot contain a link anyway.
    if (trim($html) === '') {
        return '';
    }

    // Collect libxml errors internally so HTML5 elements or sloppy markup don't raise
    // PHP warnings; the caller's previous setting is restored right after parsing.
    $previousErrorSetting = libxml_use_internal_errors(true);

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorSetting);

    // item(0) yields null when the document holds no <a> element at all.
    $link = $dom->getElementsByTagName('a')->item(0);

    return $link !== null ? $dom->saveXML($link) : '';
}
?>
Sometimes it's nice to strip out the !DOCTYPE, html, and body wrapper tags that DOMDocument adds when it saves HTML, as can be seen at the bottom of this next code snippet (I'm not sure where I picked up this function, but the only relevant part is the last part; I just wanted to write this example down here in case I need to do something similar in the future):
/**
 * Parse the dom element for the page to add in the styles for the links
 * and images.
 *
 * Every <a> element whose class attribute is not exactly "formatted-email" gets that
 * class and an inline link style. Every <img> element has "https://" replaced by
 * "http://" in its src and data-enlarged attributes and gets an inline float style.
 * The DOCTYPE, <html> and <body> wrapper tags are stripped from the serialized result.
 *
 * @param string $html HTML markup to process.
 *
 * @return string|null The processed markup, or null when $html is empty.
 */
function _super_parse_links($html)
{
    $html = (string) $html;

    // Nothing to parse: empty input yields null, as it always has.
    if (strlen($html) === 0) {
        return null;
    }

    // NOTE(review): the 'UTF-8' constructor argument presumably does not influence how
    // loadHTML() decodes $html - confirm that non-ASCII input round-trips correctly.
    $dom = new DOMDocument('1.0', 'UTF-8');

    // mute errors from HTML5, then clear the collected errors and restore the previous
    // setting so this function does not leave global libxml state changed behind it
    $previousErrorSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorSetting);

    $classname = 'formatted-email';

    foreach ($dom->getElementsByTagName('a') as $link) {
        // getAttribute() returns '' for a missing attribute, so this single comparison
        // covers both "no class at all" and "some other class value".
        if ($link->getAttribute('class') !== $classname) {
            $link->setAttribute('style', 'color: #5396cd;text-decoration: none;');
            $link->setAttribute('class', $classname);
        }
    }

    foreach ($dom->getElementsByTagName('img') as $img) {
        // change https imgs to http
        foreach (['src', 'data-enlarged'] as $attribute) {
            if ($img->hasAttribute($attribute)) {
                $img->setAttribute(
                    $attribute,
                    str_replace('https://', 'http://', $img->getAttribute($attribute))
                );
            }
        }

        $img->setAttribute('style', 'display: block;float: left;margin: 4px 1em .85em 0;');
    }

    // The nodes were modified in place, so the document can be serialized as is.
    // Drop the <!DOCTYPE>, <html> and <body> tags (opening and closing) that
    // DOMDocument wraps around the markup, along with the whitespace following them.
    return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $dom->saveHTML());
}