<?php

namespace PicoFeed\Parser;

use DomDocument;
use SimpleXmlElement;
use ZendXml\Security;

/**
 * XML parser class.
 *
 * Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
 *
 * @author Frederic Guillot
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * @static
     * @param  string $input XML content
     * @return mixed  Result of the security scan, or false when $input is empty
     * @throws XmlEntityException when the scanner rejects the document
     */
    public static function getSimpleXml($input)
    {
        // Nothing to parse: mirror getDomDocument() instead of handing an
        // empty string to the scanner.
        if (empty($input)) {
            return false;
        }

        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * @static
     * @param  string $input XML content
     * @return \DOMDocument|false false when $input is empty, when the scan
     *                            returns a falsy value, or when the resulting
     *                            document has no child nodes
     * @throws XmlEntityException when the scanner rejects the document
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into picoFeed
     * exceptions.
     *
     * @param  string             $input the xml to load
     * @param  \DOMDocument|null  $dom   pass in a dom document or use null/omit
     *                                   if simpleXml should be used
     * @return mixed whatever Security::scan() returns
     * @throws XmlEntityException wrapping the message of the ZendXml runtime exception
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch (\ZendXml\Exception\RuntimeException $e) {
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load a HTML document by using a DomDocument instance.
     *
     * A DomDocument is always returned; it is left empty when $input is empty.
     * libxml internal error handling is switched on and intentionally left on,
     * so that the collected errors stay readable through getErrors().
     *
     * @static
     * @param  string $input HTML content
     * @return \DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DomDocument();

        if (empty($input)) {
            return $dom;
        }

        // Collect parser errors in the libxml buffer instead of raising warnings.
        libxml_use_internal_errors(true);

        // The options argument of loadHTML() only exists since PHP 5.4.
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * The HTML is loaded with getHtmlDocument() and the first <body> element
     * is serialized with DomDocument::saveXML().
     *
     * @static
     * @param  string $html HTML document
     * @return string
     */
    public static function htmlToXml($html)
    {
        // NOTE(review): the XML prolog prefix presumably forces libxml to read
        // the markup as UTF-8 - confirm before changing it.
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * Formats every entry currently held in the libxml error buffer and joins
     * them with a comma.
     *
     * @static
     * @return string
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf(
                'XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * Only the first "<?xml ... ?>" declaration is inspected, so an
     * "encoding" attribute appearing later in the document is never mistaken
     * for the declared encoding. Single or double quotes and optional
     * whitespace around "=" are accepted.
     *
     * @static
     * @param  string $data Input data
     * @return string Lower-cased encoding name, or an empty string when the
     *                declaration is missing, unterminated or has no encoding
     */
    public static function getEncodingFromXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start === false) {
            return '';
        }

        $end = strpos($data, '?>', $start);

        if ($end === false) {
            return '';
        }

        $declaration = substr($data, $start, $end - $start);

        if (preg_match('/encoding\s*=\s*(["\'])([^"\']*)\1/', $declaration, $match) !== 1) {
            return '';
        }

        return strtolower($match[2]);
    }

    /**
     * Get the charset from a meta tag.
     *
     * @static
     * @param  string $data Input data
     * @return string Lower-cased charset, or an empty string when none is found
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * Every "prefix:name" whose prefix is a key of $ns becomes
     * *[namespace-uri()="<uri>" and local-name()="<name>"]. Names may contain
     * "-", "_" and "." so that e.g. "itunes:new-feed-url" is rewritten as a
     * whole. The reserved "xml" prefix and any prefix missing from $ns are
     * left untouched.
     *
     * @param  string $query XPath query
     * @param  array  $ns    Prefix to namespace URI mapping
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        $pattern = '/([A-Z0-9_][A-Z0-9_.\-]*):([A-Z0-9_][A-Z0-9_.\-]*)/iu';

        return preg_replace_callback($pattern, function ($matches) use ($ns) {
            $prefix = $matches[1];

            // don't try to map the special prefix XML
            if (strtolower($prefix) === 'xml') {
                return $matches[0];
            }

            // Unknown prefix: most likely not a QName at all (a time or URL
            // inside a string literal), so keep the text as written.
            if (!isset($ns[$prefix])) {
                return $matches[0];
            }

            return '*[namespace-uri()="'.$ns[$prefix].'" and local-name()="'.$matches[2].'"]';
        }, $query);
    }

    /**
     * Get the result elements of a XPath query.
     *
     * When a prefix mapping is given, the query is first rewritten with
     * replaceXPathPrefixWithNamespaceURI().
     *
     * @param  \SimpleXMLElement $xml   XML element
     * @param  string            $query XPath query
     * @param  array             $ns    Prefix to namespace URI mapping
     * @return \SimpleXMLElement[]
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        return $xml->xpath($query);
    }

    /**
     * Get the first Xpath result or SimpleXMLElement value.
     *
     * A non-empty array yields its element 0 cast to string and trimmed; a
     * SimpleXMLElement yields its string value as-is; anything else yields an
     * empty string.
     *
     * @static
     * @access public
     * @param  mixed $value
     * @return string
     */
    public static function getValue($value)
    {
        $result = '';

        if (is_array($value) && count($value) > 0) {
            $result = (string) $value[0];
        } elseif (is_a($value, 'SimpleXMLElement')) {
            // NOTE(review): this branch has always returned the untrimmed
            // value; kept that way for compatibility - confirm it is intended.
            return (string) $value;
        }

        return trim($result);
    }
}