
<?php

namespace CommonsMetadata;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMXPath;
/**
 * A very simple wrapper to DOMDocument to make it easy to traverse nodes which match
 * simple CSS selectors.
 */
class DomNavigator {
	/**
	 * XPath evaluator bound to the parsed document; every search goes through it.
	 * @var DOMXPath
	 */
	protected $domx;

	/**
	 * Parses an HTML fragment so that it can be searched.
	 *
	 * The fragment is wrapped in a minimal HTML document that declares UTF-8 as its charset
	 * before it is handed to DOMDocument::loadHTML(). While parsing, libxml errors are
	 * collected internally instead of being emitted, and on libxml older than 2.9.0 the
	 * external entity loader is disabled; both settings are restored afterwards.
	 *
	 * @param string $html
	 */
	public function __construct( $html ) {
		$oldLoaderState = false;
		if ( LIBXML_VERSION < 20900 ) {
			$oldLoaderState = libxml_disable_entity_loader( true );
		}
		$oldHandlerState = libxml_use_internal_errors( true );

		$dom = new DOMDocument();
		$dom->loadHTML( '<!doctype html><html><head><meta charset="UTF-8"/></head><body>' . $html . '</body></html>' );
		$this->domx = new DOMXPath( $dom );

		if ( LIBXML_VERSION < 20900 ) {
			libxml_disable_entity_loader( $oldLoaderState );
		}
		libxml_use_internal_errors( $oldHandlerState );
	}

	/**
	 * Returns a list of elements of the given type which have the given class.
	 * (In other words, this is equivalent to the CSS selector 'element.class'.)
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $class
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithClass( $element, $class, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		// Pad both the attribute and the needle with spaces so only whole class names match.
		$needle = $this->xpathLiteral( " $class " );
		$xpath = "./descendant-or-self::{$element}" .
			"[contains(concat(' ', normalize-space(@class), ' '), {$needle})]";
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns a list of elements of the given type which have a class starting with the given
	 * string.
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $classPrefix
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithClassPrefix( $element, $classPrefix, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		// Only a leading space is added: the match must begin at the start of a class name
		// but may end anywhere inside it.
		$needle = $this->xpathLiteral( " $classPrefix" );
		$xpath = "./descendant-or-self::{$element}" .
			"[contains(concat(' ', normalize-space(@class)), {$needle})]";
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns a list of elements of the given type which have the given class and any lang
	 * attribute. (In other words, this is equivalent to the CSS selector 'element.class[lang]'.)
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $class
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithClassAndLang( $element, $class, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		$needle = $this->xpathLiteral( " $class " );
		$xpath = "./descendant-or-self::{$element}" .
			"[@lang and contains(concat(' ', normalize-space(@class), ' '), {$needle})]";
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns a list of elements of the given type which have the given id.
	 * (In other words, this is equivalent to the CSS selector 'element#id'.)
	 * When there are multiple elements with this ID, all are returned.
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $id
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithId( $element, $id, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		$xpath = "./descendant-or-self::{$element}[@id=" . $this->xpathLiteral( $id ) . ']';
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns a list of elements of the given type which have an id starting with the given prefix.
	 * (In other words, this is equivalent to the CSS selector 'element[id^=prefix]'.)
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $idPrefix
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithIdPrefix( $element, $idPrefix, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		$xpath = "./descendant-or-self::{$element}" .
			'[starts-with(@id, ' . $this->xpathLiteral( $idPrefix ) . ')]';
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns a list of elements of the given type which have the given attribute with any value.
	 * (In other words, this is equivalent to the CSS selector 'element[attribute]'.)
	 * When there are multiple elements with this attribute, all are returned.
	 * @param string|array $element HTML tag name (* to accept all) or array of tag names
	 * @param string $attribute attribute name; it is inserted into the XPath expression
	 *   verbatim, so it must be a valid XPath name
	 * @param DOMNode|null $context if present, the method will only search inside this element
	 * @return DOMNodeList|DOMElement[]
	 */
	public function findElementsWithAttribute( $element, $attribute, DOMNode $context = null ) {
		$element = $this->handleElementOrList( $element );
		$xpath = "./descendant-or-self::{$element}[@{$attribute}]";
		return $this->findByXpath( $xpath, $context );
	}

	/**
	 * Returns true if the node has all the specified classes.
	 * Nodes which are not elements never have classes. An empty class list is trivially
	 * satisfied by any element.
	 * @param DOMNode $node
	 * @param string $classes one or more class names (separated with whitespace)
	 * @return bool
	 */
	public function hasClass( DOMNode $node, $classes ) {
		if ( !$node instanceof DOMElement ) {
			return false;
		}
		$nodeClasses = $this->splitClassNames( $node->getAttribute( 'class' ) );
		$testClasses = $this->splitClassNames( $classes );
		// Nothing left over means every requested class is present on the node.
		return !array_diff( $testClasses, $nodeClasses );
	}

	/**
	 * Returns the first class matching a prefix.
	 * @param DOMNode $node
	 * @param string $classPrefix
	 * @return string|null the first class name of the node starting with the prefix; null
	 *   when there is none or the node is not an element
	 */
	public function getFirstClassWithPrefix( DOMNode $node, $classPrefix ) {
		if ( !$node instanceof DOMElement ) {
			return null;
		}
		$length = strlen( $classPrefix );
		foreach ( $this->splitClassNames( $node->getAttribute( 'class' ) ) as $class ) {
			if ( strncmp( $class, $classPrefix, $length ) === 0 ) {
				return $class;
			}
		}
		return null;
	}

	/**
	 * Returns the closest ancestor of the given node, which is of the given type
	 * (like jQuery.closest()). The node itself is returned when it already matches.
	 * @param DOMNode $node
	 * @param string $element HTML tag name
	 * @return DOMElement|null
	 */
	public function closest( DOMNode $node, $element ) {
		while ( !$node instanceof DOMElement || $node->nodeName !== $element ) {
			if ( $node->parentNode instanceof DOMNode ) {
				$node = $node->parentNode;
			} else {
				// Ran out of ancestors without finding a match.
				return null;
			}
		}
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $node;
	}

	/**
	 * Returns the nodes matching an XPath expression.
	 * When the query fails, the failure is logged to the CommonsMetadata debug log and an
	 * empty node list is returned.
	 * @param string $xpath
	 * @param DOMNode|null $context
	 * @return DOMNodeList|DOMNode[]
	 */
	public function findByXpath( $xpath, DOMNode $context = null ) {
		$results = $this->domx->query( $xpath, $context );
		if ( $results === false ) {
			$error = libxml_get_last_error();
			if ( $error ) {
				$logMessage = sprintf( 'HTML parsing error: %s (%s) at line %s, column %s',
					$error->message, $error->code, $error->line, $error->column );
			} else {
				// libxml_get_last_error() returns false when nothing was recorded.
				$logMessage = sprintf( 'XPath query failed: %s', $xpath );
			}
			wfDebugLog( 'CommonsMetadata', $logMessage );
			return new DOMNodeList();
		}
		return $results;
	}

	/**
	 * Returns the first node matching an XPath expression, or null.
	 * @param string $xpath
	 * @param DOMNode|null $context
	 * @return DOMNode|null
	 */
	public function getByXpath( $xpath, DOMNode $context = null ) {
		$results = $this->findByXpath( $xpath, $context );
		foreach ( $results as $result ) {
			return $result;
		}
		return null;
	}

	/**
	 * Return next sibling element (or null)
	 * Siblings which are not elements are skipped.
	 * @param DOMElement $node
	 * @return DOMElement|null
	 */
	public function nextElementSibling( DOMElement $node ) {
		$nextSibling = $node->nextSibling;
		while ( $nextSibling && !$nextSibling instanceof DOMElement ) {
			$nextSibling = $nextSibling->nextSibling;
		}
		return $nextSibling;
	}

	/**
	 * Takes an element name or array of element names and returns an XPath expression which can
	 * be used as an element name, but matches all of the provided elements.
	 * An empty array yields an expression which matches no element.
	 * @param string|array $elmementOrList
	 * @return string
	 */
	protected function handleElementOrList( $elmementOrList ) {
		if ( !is_array( $elmementOrList ) ) {
			return $elmementOrList;
		}
		if ( !$elmementOrList ) {
			// '*[]' would not be a valid XPath expression.
			return '*[false()]';
		}
		return '*[' . implode( ' or ', array_map(
			static function ( $el ) {
				return 'self::' . $el;
			},
			$elmementOrList
		) ) . ']';
	}

	/**
	 * Splits the value of a class attribute (or a list of class names) into its non-empty
	 * tokens. Runs of space, tab, line feed, form feed and carriage return all count as a
	 * single separator.
	 * @param string $value
	 * @return string[]
	 */
	private function splitClassNames( $value ) {
		$names = preg_split( '/[ \t\n\f\r]+/', (string)$value, -1, PREG_SPLIT_NO_EMPTY );
		return $names ?: [];
	}

	/**
	 * Turns an arbitrary string into an XPath 1.0 string literal.
	 * XPath 1.0 has no escape sequences, so the quote character which does not occur in the
	 * value is used as delimiter; when both occur, the literal is assembled with concat().
	 * @param string $value
	 * @return string XPath expression evaluating to $value
	 */
	private function xpathLiteral( $value ) {
		$value = (string)$value;
		if ( strpos( $value, "'" ) === false ) {
			return "'" . $value . "'";
		}
		if ( strpos( $value, '"' ) === false ) {
			return '"' . $value . '"';
		}
		// Single-quoted chunks joined by a double-quoted apostrophe: a'b"c => concat('a',"'",'b"c')
		return "concat('" . str_replace( "'", "',\"'\",'", $value ) . "')";
	}
}