This commit is contained in:
Marcel Kapfer (mmk2410) 2016-04-18 09:56:52 +02:00
parent cb491341df
commit 9947a5f033
391 changed files with 0 additions and 15712 deletions

View file

@ -1,366 +0,0 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
* Atom parser.
*
* @author Frederic Guillot
*/
class Atom extends Parser
{
    /**
     * Supported namespaces.
     *
     * @var array
     */
    protected $namespaces = array(
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Get the feed entries.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement[] Matching entries (empty array when there is none)
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        return $this->queryElement($xml, 'entry');
    }

    /**
     * Find the feed url (link with rel="self").
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->feed_url = $this->getUrl($xml, 'self');
    }

    /**
     * Find the site url (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->site_url = $this->getUrl($xml, 'alternate', true);
    }

    /**
     * Find the feed description (subtitle element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->description = $this->firstValue($this->queryElement($xml, 'subtitle'));
    }

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->logo = $this->firstValue($this->queryElement($xml, 'logo'));
    }

    /**
     * Find the feed icon.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->icon = $this->firstValue($this->queryElement($xml, 'icon'));
    }

    /**
     * Find the feed title, falling back to the site url when the title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $title = Filter::stripWhiteSpace($this->firstValue($this->queryElement($xml, 'title')));
        $feed->title = $title ?: $feed->getSiteUrl();
    }

    /**
     * Find the feed language (xml:lang attribute).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        // Look at the children that are not entries first, then at the root element itself
        $language = $this->queryNodes($xml, '*[not(self::atom:entry)]/@xml:lang', '@xml:lang');
        $feed->language = $this->firstValue($language);
    }

    /**
     * Find the feed id.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->id = $this->firstValue($this->queryElement($xml, 'id'));
    }

    /**
     * Find the feed date (updated element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->date = $this->date->getDateTime($this->firstValue($this->queryElement($xml, 'updated')));
    }

    /**
     * Find the item date.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $published = $this->queryElement($entry, 'published');
        $updated = $this->queryElement($entry, 'updated');

        $published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null;
        $updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null;

        if ($published === null && $updated === null) {
            $item->date = $feed->getDate(); // We use the feed date if there is no date for the item
        } elseif ($published !== null && $updated !== null) {
            $item->date = max($published, $updated); // We use the most recent date between published and updated
        } else {
            $item->date = $updated ?: $published;
        }
    }

    /**
     * Find the item title, falling back to the item url when the title is empty.
     *
     * @param SimpleXMLElement $entry Feed item
     * @param Item             $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $title = Filter::stripWhiteSpace($this->firstValue($this->queryElement($entry, 'title')));
        $item->title = $title ?: $item->url;
    }

    /**
     * Find the item author (entry author first, then feed author).
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $author = $this->queryNodes($entry, 'atom:author/atom:name', 'author/name')
            ?: $this->queryNodes($xml, 'atom:author/atom:name', 'author/name');

        $item->author = $this->firstValue($author);
    }

    /**
     * Find the item content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $item->content = $this->getContent($entry);
    }

    /**
     * Find the item URL (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $item->url = $this->getUrl($entry, 'alternate', true);
    }

    /**
     * Generate the item id.
     *
     * The id element is hashed when present, otherwise the hash is built
     * from the item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $id = $this->queryElement($entry, 'id');

        if (!empty($id)) {
            $item->id = $this->generateId((string) current($id));
        } else {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );
        }
    }

    /**
     * Find the item enclosure (link with rel="enclosure").
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $enclosure = $this->findLink($entry, 'enclosure');

        if ($enclosure) {
            $item->enclosure_url = Url::resolve((string) $enclosure['href'], $feed->getSiteUrl());
            $item->enclosure_type = (string) $enclosure['type'];
        }
    }

    /**
     * Find the item language, falling back to the feed language.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $language = XmlParser::getXPathResult($entry, './/@xml:lang');

        // The XPath helper may not return an array on error: treat that as "no language"
        $value = is_array($language) ? (string) current($language) : '';

        $item->language = $value ?: $feed->language;
    }

    /**
     * Get the URL from a link tag.
     *
     * @param SimpleXMLElement $xml      XML tag
     * @param string           $rel      Link relationship: alternate, enclosure, related, self, via
     * @param bool             $fallback When true, use a link without rel attribute if no link matches $rel
     *
     * @return string Value of the href attribute, or an empty string when no link is found
     */
    private function getUrl(SimpleXMLElement $xml, $rel, $fallback = false)
    {
        $link = $this->findLink($xml, $rel);

        if ($link) {
            return (string) $link['href'];
        }

        if ($fallback) {
            $link = $this->findLink($xml, '');

            return $link ? (string) $link['href'] : '';
        }

        return '';
    }

    /**
     * Get the first link tag that matches a relationship.
     *
     * @param SimpleXMLElement $xml XML tag
     * @param string           $rel Link relationship: alternate, enclosure, related, self, via
     *
     * @return SimpleXMLElement|null
     */
    private function findLink(SimpleXMLElement $xml, $rel)
    {
        foreach ($this->queryElement($xml, 'link') as $link) {
            if ($rel === (string) $link['rel']) {
                return $link;
            }
        }

        return null;
    }

    /**
     * Get the entry content.
     *
     * Child elements of the content tag are serialized back to XML, a plain
     * text content is returned as is, and the summary is used as last resort.
     *
     * @param SimpleXMLElement $entry XML Entry
     *
     * @return string
     */
    private function getContent(SimpleXMLElement $entry)
    {
        $content = current($this->queryElement($entry, 'content'));

        if (!empty($content) && count($content->children())) {
            $xml_string = '';

            foreach ($content->children() as $child) {
                $xml_string .= $child->asXML();
            }

            return $xml_string;
        } elseif (trim((string) $content) !== '') {
            return (string) $content;
        }

        return $this->firstValue($this->queryElement($entry, 'summary'));
    }

    /**
     * Query a direct child element, with the atom prefix first and without prefix as fallback.
     *
     * @param SimpleXMLElement $xml  Context node
     * @param string           $name Element name without prefix (example: title)
     *
     * @return SimpleXMLElement[]
     */
    private function queryElement(SimpleXMLElement $xml, $name)
    {
        return $this->queryNodes($xml, 'atom:'.$name, $name);
    }

    /**
     * Run the namespaced XPath query and fall back to the plain one when nothing matches.
     *
     * The result is always an array, so that callers can safely use
     * current() and foreach even when the XPath helper reports an error.
     *
     * @param SimpleXMLElement $xml              Context node
     * @param string           $namespaced_query Query evaluated with the supported namespaces
     * @param string           $plain_query      Query evaluated without namespaces
     *
     * @return SimpleXMLElement[]
     */
    private function queryNodes(SimpleXMLElement $xml, $namespaced_query, $plain_query)
    {
        $nodes = XmlParser::getXPathResult($xml, $namespaced_query, $this->namespaces)
            ?: XmlParser::getXPathResult($xml, $plain_query);

        return is_array($nodes) ? $nodes : array();
    }

    /**
     * Get the string value of the first node of a result set.
     *
     * @param array $nodes XPath result
     *
     * @return string Empty string when the result set is empty
     */
    private function firstValue(array $nodes)
    {
        return (string) current($nodes);
    }
}

View file

@ -1,113 +0,0 @@
<?php
namespace PicoFeed\Parser;
use DateTime;
use DateTimeZone;
/**
* Date Parser.
*
* @author Frederic Guillot
*/
class DateParser
{
    /**
     * Timezone used to parse feed dates.
     *
     * @var string
     */
    public $timezone = 'UTC';

    /**
     * Supported formats [ 'format' => length ].
     *
     * When a length is defined, the value is truncated to that number of
     * characters before being parsed; null means the whole value is parsed.
     * Formats are tried in the order of this list.
     *
     * @var array
     */
    public $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'j M Y H:i:s' => 20,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    /**
     * Try to parse all date format for broken feeds.
     *
     * @param string $value Original date format
     *
     * @return DateTime Parsed date, or the current date when no format matches
     */
    public function getDateTime($value)
    {
        $value = trim($value);

        foreach ($this->formats as $format => $length) {
            $truncated_value = $value;

            if ($length !== null) {
                $truncated_value = substr($truncated_value, 0, $length);
            }

            $date = $this->getValidDate($format, $truncated_value);

            if ($date !== false) {
                return $date;
            }
        }

        return $this->getCurrentDateTime();
    }

    /**
     * Get a valid date from a given format.
     *
     * The date is accepted only if it was parsed without any error or warning.
     *
     * @param string $format Date format
     * @param string $value  Original date value
     *
     * @return DateTime|bool DateTime on success, false otherwise
     */
    public function getValidDate($format, $value)
    {
        $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));

        if ($date === false) {
            return false;
        }

        $errors = DateTime::getLastErrors();

        // Since PHP 8.2 getLastErrors() returns false (instead of an array of
        // zero counters) when the last parsing did not raise anything
        if ($errors === false) {
            return $date;
        }

        if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
            return $date;
        }

        return false;
    }

    /**
     * Get the current datetime.
     *
     * @return DateTime Current time in the configured timezone
     */
    public function getCurrentDateTime()
    {
        return new DateTime('now', new DateTimeZone($this->timezone));
    }
}

View file

@ -1,194 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* Feed.
*
* @author Frederic Guillot
*/
class Feed
{
    /**
     * Feed items.
     *
     * @var array
     */
    public $items = array();

    /**
     * Feed id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Feed title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Feed description.
     *
     * @var string
     */
    public $description = '';

    /**
     * Feed url.
     *
     * @var string
     */
    public $feed_url = '';

    /**
     * Site url.
     *
     * @var string
     */
    public $site_url = '';

    /**
     * Feed date (null until a date is assigned).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Feed language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Feed logo URL.
     *
     * @var string
     */
    public $logo = '';

    /**
     * Feed icon URL.
     *
     * @var string
     */
    public $icon = '';

    /**
     * Return feed information.
     *
     * @return string One "Feed::property = value" line per property, followed by each item
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'feed_url', 'site_url', 'language', 'description', 'logo') as $property) {
            $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date is null until it is assigned: do not call format() on null
        $date = $this->date === null ? '' : $this->date->format(DATE_RFC822);

        $output .= 'Feed::date = '.$date.PHP_EOL;
        $output .= 'Feed::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Feed::items = '.count($this->items).' items'.PHP_EOL;

        foreach ($this->items as $item) {
            $output .= '----'.PHP_EOL;
            $output .= $item;
        }

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get description.
     *
     * @return string
     */
    public function getDescription()
    {
        return $this->description;
    }

    /**
     * Get the logo url.
     *
     * @return string
     */
    public function getLogo()
    {
        return $this->logo;
    }

    /**
     * Get the icon url.
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->icon;
    }

    /**
     * Get feed url.
     *
     * @return string
     */
    public function getFeedUrl()
    {
        return $this->feed_url;
    }

    /**
     * Get site url.
     *
     * @return string
     */
    public function getSiteUrl()
    {
        return $this->site_url;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items.
     *
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }

    /**
     * Return true if the feed is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}

View file

@ -1,230 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* Feed Item.
*
* @author Frederic Guillot
*/
class Item
{
    /**
     * List of known RTL languages.
     *
     * @var array
     */
    public $rtl = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    /**
     * Item id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Item title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Item url.
     *
     * @var string
     */
    public $url = '';

    /**
     * Item author.
     *
     * @var string
     */
    public $author = '';

    /**
     * Item date (null until a date is assigned).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Item content.
     *
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url.
     *
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type.
     *
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Raw XML.
     *
     * @var \SimpleXMLElement
     */
    public $xml;

    /**
     * List of namespaces.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * Get specific XML tag or attribute value.
     *
     * The query is evaluated on the raw item XML, at any depth.
     *
     * @param string $tag       Tag name (examples: guid, media:content)
     * @param string $attribute Tag attribute
     *
     * @return array|false Tag values or error
     */
    public function getTag($tag, $attribute = '')
    {
        // convert to xPath attribute query
        if ($attribute !== '') {
            $attribute = '/@'.$attribute;
        }

        // construct query
        $query = './/'.$tag.$attribute;
        $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);

        if ($elements === false) { // xPath error
            return false;
        }

        return array_map(function ($element) { return (string) $element; }, $elements);
    }

    /**
     * Return item information.
     *
     * @return string One "Item::property = value" line per property
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'url', 'language', 'author', 'enclosure_url', 'enclosure_type') as $property) {
            $output .= 'Item::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date is null until it is assigned: do not call format() on null
        $date = $this->date === null ? '' : $this->date->format(DATE_RFC822);

        $output .= 'Item::date = '.$date.PHP_EOL;
        $output .= 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL;

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url.
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content.
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url.
     *
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type.
     *
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author.
     *
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Return true if the item is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}

View file

@ -1,12 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* MalformedXmlException Exception.
*
* Thrown by Parser::execute() when the feed content still cannot be
* loaded as XML after the XML workarounds have been applied.
*
* @author Frederic Guillot
*/
class MalformedXmlException extends ParserException
{
}

View file

@ -1,576 +0,0 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Scraper\Scraper;
/**
* Base parser class.
*
* @author Frederic Guillot
*/
abstract class Parser
{
    /**
     * Config object.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * DateParser object.
     *
     * @var \PicoFeed\Parser\DateParser
     */
    protected $date;

    /**
     * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
     *
     * @var string
     */
    private $hash_algo = 'sha256';

    /**
     * Feed content (XML data).
     *
     * @var string
     */
    protected $content = '';

    /**
     * Fallback url.
     *
     * @var string
     */
    protected $fallback_url = '';

    /**
     * XML namespaces supported by parser.
     *
     * @var array
     */
    protected $namespaces = array();

    /**
     * XML namespaces used in document.
     *
     * @var array
     */
    protected $used_namespaces = array();

    /**
     * Enable the content filtering.
     *
     * @var bool
     */
    private $enable_filter = true;

    /**
     * Enable the content grabber.
     *
     * @var bool
     */
    private $enable_grabber = false;

    /**
     * When true, the candidate parser of the content grabber is disabled
     * (only pages with rule files should be scraped).
     *
     * @var bool
     */
    private $grabber_needs_rule_file = false;

    /**
     * Ignore those urls for the content scraper.
     *
     * @var array
     */
    private $grabber_ignore_urls = array();

    /**
     * Constructor.
     *
     * @param string $content       Feed content
     * @param string $http_encoding HTTP encoding (headers)
     * @param string $fallback_url  Fallback url when the feed provide relative or broken url
     */
    public function __construct($content, $http_encoding = '', $fallback_url = '')
    {
        $this->date = new DateParser();
        $this->fallback_url = $fallback_url;
        $xml_encoding = XmlParser::getEncodingFromXmlTag($content);

        // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
        $this->content = Filter::stripXmlTag($content);

        // Encode everything in UTF-8 (the encoding of the XML tag wins over the HTTP one)
        Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
        $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
    }

    /**
     * Parse the document.
     *
     * @throws MalformedXmlException When the content cannot be loaded as XML, even after the workarounds
     *
     * @return \PicoFeed\Parser\Feed
     */
    public function execute()
    {
        Logger::setMessage(get_called_class().': begin parsing');

        $xml = XmlParser::getSimpleXml($this->content);

        if ($xml === false) {
            // Second chance: normalize the data and try again
            Logger::setMessage(get_called_class().': Applying XML workarounds');
            $this->content = Filter::normalizeData($this->content);
            $xml = XmlParser::getSimpleXml($this->content);

            if ($xml === false) {
                Logger::setMessage(get_called_class().': XML parsing error');
                Logger::setMessage(XmlParser::getErrors());
                throw new MalformedXmlException('XML parsing error');
            }
        }

        $this->used_namespaces = $xml->getNamespaces(true);
        $xml = $this->registerSupportedNamespaces($xml);

        $feed = new Feed();

        // The site url check relies on the feed url and the title falls back
        // to the site url, so the urls are resolved first
        $this->findFeedUrl($xml, $feed);
        $this->checkFeedUrl($feed);

        $this->findSiteUrl($xml, $feed);
        $this->checkSiteUrl($feed);

        $this->findFeedTitle($xml, $feed);
        $this->findFeedDescription($xml, $feed);
        $this->findFeedLanguage($xml, $feed);
        $this->findFeedId($xml, $feed);
        $this->findFeedDate($xml, $feed);
        $this->findFeedLogo($xml, $feed);
        $this->findFeedIcon($xml, $feed);

        foreach ($this->getItemsTree($xml) as $entry) {
            $entry = $this->registerSupportedNamespaces($entry);

            $item = new Item();
            $item->xml = $entry;
            $item->namespaces = $this->used_namespaces;

            $this->findItemAuthor($xml, $entry, $item);

            $this->findItemUrl($entry, $item);
            $this->checkItemUrl($feed, $item);

            $this->findItemTitle($entry, $item);
            $this->findItemContent($entry, $item);

            // Id generation can use the item url/title/content (order is important)
            $this->findItemId($entry, $item, $feed);

            $this->findItemDate($entry, $item, $feed);
            $this->findItemEnclosure($entry, $item, $feed);
            $this->findItemLanguage($entry, $item, $feed);

            // Order is important (avoid double filtering)
            $this->filterItemContent($feed, $item);
            $this->scrapWebsite($item);

            $feed->items[] = $item;
        }

        Logger::setMessage(get_called_class().PHP_EOL.$feed);

        return $feed;
    }

    /**
     * Check if the feed url is correct.
     *
     * An empty feed url is replaced by the fallback url, otherwise the
     * feed url is resolved against the fallback url.
     *
     * @param Feed $feed Feed object
     */
    public function checkFeedUrl(Feed $feed)
    {
        if ($feed->getFeedUrl() === '') {
            $feed->feed_url = $this->fallback_url;
        } else {
            $feed->feed_url = Url::resolve($feed->getFeedUrl(), $this->fallback_url);
        }
    }

    /**
     * Check if the site url is correct.
     *
     * An empty site url is replaced by the base of the feed url, otherwise
     * the site url is resolved against the fallback url.
     *
     * @param Feed $feed Feed object
     */
    public function checkSiteUrl(Feed $feed)
    {
        if ($feed->getSiteUrl() === '') {
            $feed->site_url = Url::base($feed->getFeedUrl());
        } else {
            $feed->site_url = Url::resolve($feed->getSiteUrl(), $this->fallback_url);
        }
    }

    /**
     * Check if the item url is correct (resolved against the site url).
     *
     * @param Feed $feed Feed object
     * @param Item $item Item object
     */
    public function checkItemUrl(Feed $feed, Item $item)
    {
        $item->url = Url::resolve($item->getUrl(), $feed->getSiteUrl());
    }

    /**
     * Fetch item content with the content grabber.
     *
     * Nothing is done when the grabber is disabled or when the item url is
     * in the list of ignored urls. The item content is replaced only when
     * the grabber found relevant content.
     *
     * @param Item $item Item object
     */
    public function scrapWebsite(Item $item)
    {
        if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
            $grabber = new Scraper($this->config);
            $grabber->setUrl($item->getUrl());

            if ($this->grabber_needs_rule_file) {
                $grabber->disableCandidateParser();
            }

            $grabber->execute();

            if ($grabber->hasRelevantContent()) {
                $item->content = $grabber->getFilteredContent();
            }
        }
    }

    /**
     * Filter HTML for entry content.
     *
     * @param Feed $feed Feed object
     * @param Item $item Item object
     */
    public function filterItemContent(Feed $feed, Item $item)
    {
        if ($this->isFilteringEnabled()) {
            $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
            $filter->setConfig($this->config);
            $item->content = $filter->execute();
        } else {
            Logger::setMessage(get_called_class().': Content filtering disabled');
        }
    }

    /**
     * Generate a unique id for an entry (hash all arguments).
     *
     * The arguments are concatenated and hashed with the configured algorithm.
     *
     * @return string
     */
    public function generateId()
    {
        return hash($this->hash_algo, implode(func_get_args()));
    }

    /**
     * Return true if the given language is "Right to Left".
     *
     * The comparison is case insensitive and only looks at the beginning
     * of the language code.
     *
     * @static
     *
     * @param string $language Language: fr-FR, en-US
     *
     * @return bool
     */
    public static function isLanguageRTL($language)
    {
        $language = strtolower($language);

        $rtl_languages = array(
            'ar',  // Arabic (ar-**)
            'fa',  // Farsi (fa-**)
            'ur',  // Urdu (ur-**)
            'ps',  // Pashtu (ps-**)
            'syr', // Syriac (syr-**)
            'dv',  // Divehi (dv-**)
            'he',  // Hebrew (he-**)
            'yi',  // Yiddish (yi-**)
        );

        foreach ($rtl_languages as $prefix) {
            if (strpos($language, $prefix) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Set Hash algorithm used for id generation.
     *
     * An empty value keeps the current algorithm.
     *
     * @param string $algo Algorithm name
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setHashAlgo($algo)
    {
        $this->hash_algo = $algo ?: $this->hash_algo;

        return $this;
    }

    /**
     * Set a different timezone.
     *
     * An empty value keeps the current timezone.
     *
     * @see http://php.net/manual/en/timezones.php
     *
     * @param string $timezone Timezone
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setTimezone($timezone)
    {
        if ($timezone) {
            $this->date->timezone = $timezone;
        }

        return $this;
    }

    /**
     * Set config object.
     *
     * @param \PicoFeed\Config\Config $config Config instance
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setConfig($config)
    {
        $this->config = $config;

        return $this;
    }

    /**
     * Disable the content filtering.
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function disableContentFiltering()
    {
        $this->enable_filter = false;

        return $this;
    }

    /**
     * Return true if the content filtering is enabled.
     *
     * Without config object the local flag is used, otherwise the config
     * object decides and receives the local flag as argument.
     *
     * @return bool
     */
    public function isFilteringEnabled()
    {
        if ($this->config === null) {
            return $this->enable_filter;
        }

        return $this->config->getContentFiltering($this->enable_filter);
    }

    /**
     * Enable the content grabber.
     *
     * @param bool $needs_rule_file true if only pages with rule files should be
     *                              scraped
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function enableContentGrabber($needs_rule_file = false)
    {
        $this->enable_grabber = true;
        $this->grabber_needs_rule_file = $needs_rule_file;

        return $this;
    }

    /**
     * Set ignored URLs for the content grabber.
     *
     * @param array $urls URLs
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setGrabberIgnoreUrls(array $urls)
    {
        $this->grabber_ignore_urls = $urls;

        return $this;
    }

    /**
     * Register all supported namespaces to be used within an xpath query.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement The same element, with the namespaces registered
     */
    public function registerSupportedNamespaces(SimpleXMLElement $xml)
    {
        foreach ($this->namespaces as $prefix => $ns) {
            $xml->registerXPathNamespace($prefix, $ns);
        }

        return $xml;
    }

    /**
     * Find the feed url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the site url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed title.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed description.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed language.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed id.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedId(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed date.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedDate(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed icon.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);

    /**
     * Get the path to the items XML tree.
     *
     * The returned value is iterated by execute(), one entry per feed item.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    abstract public function getItemsTree(SimpleXMLElement $xml);

    /**
     * Find the item author.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);

    /**
     * Find the item URL.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemUrl(SimpleXMLElement $entry, Item $item);

    /**
     * Find the item title.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemTitle(SimpleXMLElement $entry, Item $item);

    /**
     * Generate the item id.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item date.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemContent(SimpleXMLElement $entry, Item $item);

    /**
     * Find the item enclosure.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item language.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
}

View file

@ -1,14 +0,0 @@
<?php
namespace PicoFeed\Parser;
use PicoFeed\PicoFeedException;
/**
* ParserException Exception.
*
* Abstract base class of the exceptions of the parser namespace
* (MalformedXmlException extends it).
*
* @author Frederic Guillot
*/
abstract class ParserException extends PicoFeedException
{
}

View file

@ -1,270 +0,0 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
/**
* RSS 1.0 parser.
*
* @author Frederic Guillot
*/
class Rss10 extends Parser
{
/**
* Supported namespaces.
*/
protected $namespaces = array(
'rss' => 'http://purl.org/rss/1.0/',
'dc' => 'http://purl.org/dc/elements/1.1/',
'content' => 'http://purl.org/rss/1.0/modules/content/',
'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
);
/**
 * Get the feed items.
 *
 * The namespaced query (rss:item) is tried first, the plain one (item)
 * is used when it gives nothing.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement
 */
public function getItemsTree(SimpleXMLElement $xml)
{
    $items = XmlParser::getXPathResult($xml, 'rss:item', $this->namespaces);

    if (!$items) {
        $items = XmlParser::getXPathResult($xml, 'item');
    }

    return $items;
}
/**
 * Find the feed url.
 *
 * Nothing is read from the document: the feed url is always set to an
 * empty string by this parser.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
    $feed->feed_url = '';
}
/**
 * Find the site url (link element of the channel).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
{
    $links = XmlParser::getXPathResult($xml, 'rss:channel/rss:link', $this->namespaces);

    if (!$links) {
        // Nothing with the rss prefix: retry without namespace
        $links = XmlParser::getXPathResult($xml, 'channel/link');
    }

    $feed->site_url = (string) current($links);
}
/**
 * Find the feed description (description element of the channel).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
{
    $descriptions = XmlParser::getXPathResult($xml, 'rss:channel/rss:description', $this->namespaces);

    if (!$descriptions) {
        // Nothing with the rss prefix: retry without namespace
        $descriptions = XmlParser::getXPathResult($xml, 'channel/description');
    }

    $feed->description = (string) current($descriptions);
}
/**
 * Find the feed logo url (url element of the image).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
{
    $urls = XmlParser::getXPathResult($xml, 'rss:image/rss:url', $this->namespaces);

    if (!$urls) {
        // Nothing with the rss prefix: retry without namespace
        $urls = XmlParser::getXPathResult($xml, 'image/url');
    }

    $feed->logo = (string) current($urls);
}
/**
 * Find the feed icon.
 *
 * Nothing is read from the document: the icon is always set to an empty
 * string by this parser.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
{
    $feed->icon = '';
}
/**
 * Find the feed title, the site url is used when the title is empty.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
    $titles = XmlParser::getXPathResult($xml, 'rss:channel/rss:title', $this->namespaces);

    if (!$titles) {
        // Nothing with the rss prefix: retry without namespace
        $titles = XmlParser::getXPathResult($xml, 'channel/title');
    }

    $title = Filter::stripWhiteSpace((string) current($titles));

    $feed->title = $title ?: $feed->getSiteUrl();
}
/**
 * Find the feed language (dc:language element of the channel).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $languages = XmlParser::getXPathResult($xml, 'rss:channel/dc:language', $this->namespaces);

    if (!$languages) {
        // The channel may not use the rss prefix, dc:language still needs the namespaces
        $languages = XmlParser::getXPathResult($xml, 'channel/dc:language', $this->namespaces);
    }

    $feed->language = (string) current($languages);
}
/**
 * Derive the feed id from values already stored on the feed object.
 *
 * The feed URL is preferred; the site URL is used when the feed URL is
 * falsy. $xml is not read.
 *
 * @param SimpleXMLElement      $xml  Feed xml (unused)
 * @param \PicoFeed\Parser\Feed $feed Feed object receiving id
 */
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
    $identifier = $feed->getFeedUrl();

    if (!$identifier) {
        $identifier = $feed->getSiteUrl();
    }

    $feed->id = $identifier;
}
/**
 * Resolve the feed date from the channel's dc:date element.
 *
 * The text of the first match (or an empty string when nothing matches)
 * is handed to $this->date->getDateTime() and the result stored as-is.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object receiving date
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    $dates = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces);

    if (!$dates) {
        $dates = XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
    }

    $raw = (string) current($dates);
    $feed->date = $this->date->getDateTime($raw);
}
/**
 * Resolve the item date from the entry's dc:date element.
 *
 * When the entry has no dc:date, the date already stored on the feed
 * is reused for the item.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param Item                  $item  Item object receiving date
 * @param \PicoFeed\Parser\Feed $feed  Feed object supplying the fallback date
 */
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $dates = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);

    if (empty($dates)) {
        $item->date = $feed->getDate();
    } else {
        $item->date = $this->date->getDateTime((string) current($dates));
    }
}
/**
 * Resolve the item title from the entry's title element.
 *
 * The text is passed through Filter::stripWhiteSpace(); a falsy result
 * is replaced by the item's url property.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object receiving title
 */
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    $titles = XmlParser::getXPathResult($entry, 'rss:title', $this->namespaces);

    if (!$titles) {
        $titles = XmlParser::getXPathResult($entry, 'title');
    }

    $cleaned = Filter::stripWhiteSpace((string) current($titles));

    if ($cleaned) {
        $item->title = $cleaned;
    } else {
        $item->title = $item->url;
    }
}
/**
 * Resolve the item author from dc:creator.
 *
 * Lookup order: the entry's own dc:creator, then the channel's dc:creator
 * (namespace-qualified channel first, unprefixed channel last).
 *
 * @param SimpleXMLElement      $xml   Feed
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object receiving author
 */
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
    $creators = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

    if (!$creators) {
        $creators = XmlParser::getXPathResult($xml, 'rss:channel/dc:creator', $this->namespaces);
    }

    if (!$creators) {
        $creators = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
    }

    $item->author = (string) current($creators);
}
/**
 * Resolve the item content.
 *
 * content:encoded is preferred. When its trimmed text is empty, the
 * entry's description element is used instead (namespace-qualified
 * first, then unprefixed).
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object receiving content
 */
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $body = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);
    $encoded = trim((string) current($body));

    if ($encoded === '') {
        $body = XmlParser::getXPathResult($entry, 'rss:description', $this->namespaces);

        if (!$body) {
            $body = XmlParser::getXPathResult($entry, 'description');
        }
    }

    // The stored value is the untrimmed text of whichever node was selected
    $item->content = (string) current($body);
}
/**
 * Resolve the item URL.
 *
 * Lookup order: feedburner:origLink, then the namespace-qualified link
 * element, then the unprefixed link element. The text is trimmed before
 * being stored.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object receiving url
 */
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    $links = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

    if (!$links) {
        $links = XmlParser::getXPathResult($entry, 'rss:link', $this->namespaces);
    }

    if (!$links) {
        $links = XmlParser::getXPathResult($entry, 'link');
    }

    $item->url = trim((string) current($links));
}
/**
 * Generate the item id.
 *
 * The id is produced by generateId() from the item's title, url and
 * content, in that order. Neither $entry nor $feed is read.
 *
 * @param SimpleXMLElement      $entry Feed item (unused)
 * @param \PicoFeed\Parser\Item $item  Item object receiving id
 * @param \PicoFeed\Parser\Feed $feed  Feed object (unused)
 */
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $title = $item->getTitle();
    $url = $item->getUrl();
    $content = $item->getContent();

    $item->id = $this->generateId($title, $url, $content);
}
/**
* Find the item enclosure.
*
* Intentionally a no-op in this parser: the body is empty, so no argument is
* read and the item's enclosure properties are left untouched.
*
* @param SimpleXMLElement $entry Feed item (unused)
* @param \PicoFeed\Parser\Item $item Item object (not modified)
* @param \PicoFeed\Parser\Feed $feed Feed object (unused)
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
}
/**
 * Resolve the item language from the entry's dc:language element.
 *
 * When the entry carries no (or an empty) dc:language, the language
 * already stored on the feed is used.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object receiving language
 * @param \PicoFeed\Parser\Feed $feed  Feed object supplying the fallback language
 */
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $languages = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
    $own = (string) current($languages);

    if ($own) {
        $item->language = $own;
    } else {
        $item->language = $feed->language;
    }
}
}

View file

@ -1,291 +0,0 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
 * RSS 2.0 Parser.
 *
 * Reads feed- and item-level fields out of a parsed document through XPath
 * queries issued via XmlParser::getXPathResult().
 *
 * @author Frederic Guillot
 */
class Rss20 extends Parser
{
    /**
     * Prefix => namespace URI map handed to XmlParser for prefixed XPath queries.
     *
     * @var array
     */
    protected $namespaces = array(
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Get the feed items.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return mixed Result of the `channel/item` XPath query, exactly as
     *               XmlParser::getXPathResult() returns it
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'channel/item');

        return $items;
    }

    /**
     * Find the feed url.
     *
     * Nothing is read from the document: the feed URL is always reset to
     * an empty string.
     *
     * @param SimpleXMLElement      $xml  Feed xml (unused)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->feed_url = '';
    }

    /**
     * Find the site url (text of `channel/link`).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $links = XmlParser::getXPathResult($xml, 'channel/link');

        // current() yields false on an empty result, which casts to ''
        $feed->site_url = (string) current($links);
    }

    /**
     * Find the feed description (text of `channel/description`).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/description');

        $feed->description = (string) current($nodes);
    }

    /**
     * Find the feed logo url (text of `channel/image/url`).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $urls = XmlParser::getXPathResult($xml, 'channel/image/url');

        $feed->logo = (string) current($urls);
    }

    /**
     * Find the feed icon.
     *
     * Nothing is read from the document: the icon is always reset to an
     * empty string.
     *
     * @param SimpleXMLElement      $xml  Feed xml (unused)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->icon = '';
    }

    /**
     * Find the feed title.
     *
     * The text of `channel/title` is passed through Filter::stripWhiteSpace();
     * a falsy result is replaced by the feed's site URL.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $titles = XmlParser::getXPathResult($xml, 'channel/title');
        $cleaned = Filter::stripWhiteSpace((string) current($titles));

        if ($cleaned) {
            $feed->title = $cleaned;
        } else {
            $feed->title = $feed->getSiteUrl();
        }
    }

    /**
     * Find the feed language (text of `channel/language`).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($xml, 'channel/language');

        $feed->language = (string) current($languages);
    }

    /**
     * Find the feed id.
     *
     * Uses the feed URL stored on the feed object, or its site URL when
     * the feed URL is falsy. $xml is not read.
     *
     * @param SimpleXMLElement      $xml  Feed xml (unused)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $identifier = $feed->getFeedUrl();

        if (!$identifier) {
            $identifier = $feed->getSiteUrl();
        }

        $feed->id = $identifier;
    }

    /**
     * Find the feed date.
     *
     * Considers `channel/pubDate` and `channel/lastBuildDate`:
     * neither present -> current date/time; both present -> the greater of
     * the two; only one present -> that one.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $publish_nodes = XmlParser::getXPathResult($xml, 'channel/pubDate');
        $update_nodes = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');

        $published = null;
        if (!empty($publish_nodes)) {
            $published = $this->date->getDateTime((string) current($publish_nodes));
        }

        $updated = null;
        if (!empty($update_nodes)) {
            $updated = $this->date->getDateTime((string) current($update_nodes));
        }

        if ($published === null && $updated === null) {
            // We use the current date if there is no date for the feed
            $feed->date = $this->date->getCurrentDateTime();

            return;
        }

        if ($published !== null && $updated !== null) {
            // We use the most recent date between published and updated
            $feed->date = max($published, $updated);

            return;
        }

        $feed->date = $updated ? $updated : $published;
    }

    /**
     * Find the item date.
     *
     * Uses the entry's `pubDate`; when it is absent the date stored on the
     * feed is reused.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $dates = XmlParser::getXPathResult($entry, 'pubDate');

        if (empty($dates)) {
            $item->date = $feed->getDate();
        } else {
            $item->date = $this->date->getDateTime((string) current($dates));
        }
    }

    /**
     * Find the item title.
     *
     * The text of `title` is passed through Filter::stripWhiteSpace(); a
     * falsy result is replaced by the item's url property.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $titles = XmlParser::getXPathResult($entry, 'title');
        $cleaned = Filter::stripWhiteSpace((string) current($titles));

        if ($cleaned) {
            $item->title = $cleaned;
        } else {
            $item->title = $item->url;
        }
    }

    /**
     * Find the item author.
     *
     * Lookup order: entry dc:creator, entry author, channel dc:creator,
     * channel managingEditor.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $authors = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

        if (!$authors) {
            $authors = XmlParser::getXPathResult($entry, 'author');
        }

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'channel/managingEditor');
        }

        $item->author = (string) current($authors);
    }

    /**
     * Find the item content.
     *
     * content:encoded is preferred; when its trimmed text is empty the
     * entry's `description` is used instead.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $body = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);
        $encoded = trim((string) current($body));

        if ($encoded === '') {
            $body = XmlParser::getXPathResult($entry, 'description');
        }

        // The stored value is the untrimmed text of whichever node was selected
        $item->content = (string) current($body);
    }

    /**
     * Find the item URL.
     *
     * Lookup order: feedburner:origLink, link, atom:link/@href. When none of
     * them matches, the guid is used, but only if it validates as a URL;
     * otherwise the item's url is left unchanged.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $links = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'link');
        }

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'atom:link/@href', $this->namespaces);
        }

        if (!empty($links)) {
            $item->url = trim((string) current($links));

            return;
        }

        $guid_nodes = XmlParser::getXPathResult($entry, 'guid');
        $guid = trim((string) current($guid_nodes));

        if (filter_var($guid, FILTER_VALIDATE_URL) !== false) {
            $item->url = $guid;
        }
    }

    /**
     * Generate the item id.
     *
     * A non-empty guid is fed to generateId() on its own; otherwise the id is
     * generated from the item's title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object (unused)
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $guid = (string) current(XmlParser::getXPathResult($entry, 'guid'));

        if (!$guid) {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );

            return;
        }

        $item->id = $this->generateId($guid);
    }

    /**
     * Find the item enclosure.
     *
     * Does nothing unless the entry has an `enclosure` child. The URL comes
     * from feedburner:origEnclosureLink or, failing that, enclosure/@url, and
     * is resolved against the feed's site URL via Url::resolve().
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        if (!isset($entry->enclosure)) {
            return;
        }

        $url_nodes = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

        if (!$url_nodes) {
            $url_nodes = XmlParser::getXPathResult($entry, 'enclosure/@url');
        }

        $type_nodes = XmlParser::getXPathResult($entry, 'enclosure/@type');

        $item->enclosure_url = Url::resolve((string) current($url_nodes), $feed->getSiteUrl());
        $item->enclosure_type = (string) current($type_nodes);
    }

    /**
     * Find the item language.
     *
     * Uses the entry's dc:language, or the language stored on the feed when
     * the entry has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $own = (string) current($languages);

        if ($own) {
            $item->language = $own;
        } else {
            $item->language = $feed->language;
        }
    }
}

View file

@ -1,12 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.91 Parser.
*
* Declares no members of its own: every parsing method and the namespace
* map are inherited unchanged from Rss20.
*
* @author Frederic Guillot
*/
class Rss91 extends Rss20
{
}

View file

@ -1,12 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.92 Parser.
*
* Declares no members of its own: every parsing method and the namespace
* map are inherited unchanged from Rss20.
*
* @author Frederic Guillot
*/
class Rss92 extends Rss20
{
}

View file

@ -1,12 +0,0 @@
<?php
namespace PicoFeed\Parser;
/**
* XmlEntityException Exception.
*
* Empty subclass of MalformedXmlException. XmlParser::scan() throws it,
* carrying the original message, when ZendXml raises a RuntimeException
* while scanning a document.
*
* @author Bernhard Posselt
*/
class XmlEntityException extends MalformedXmlException
{
}

View file

@ -1,229 +0,0 @@
<?php
namespace PicoFeed\Parser;
use DomDocument;
use SimpleXmlElement;
use Exception;
use ZendXml\Security;
/**
 * XML parser class.
 *
 * Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
 * by loading them through ZendXml\Security::scan(), and offers helpers for HTML loading,
 * encoding detection and namespace-aware XPath queries.
 *
 * @author Frederic Guillot
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @return mixed Whatever ZendXml\Security::scan() returns for the input
     *
     * @throws XmlEntityException When ZendXml raises a RuntimeException
     */
    public static function getSimpleXml($input)
    {
        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * False is returned for empty input and for documents that end up
     * without any child node after loading.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @return \DOMDocument|false
     *
     * @throws XmlEntityException When ZendXml raises a RuntimeException
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into picoFeed
     * exceptions.
     *
     * @param string            $input The xml to load
     * @param \DOMDocument|null $dom   Pass in a dom document or use null/omit if simpleXml should
     *                                 be used
     *
     * @return mixed Result of ZendXml\Security::scan()
     *
     * @throws XmlEntityException When ZendXml raises a RuntimeException
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch (\ZendXml\Exception\RuntimeException $e) {
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load HTML document by using a DomDocument instance.
     *
     * An empty DomDocument is returned for empty input. libxml's internal
     * error collection is switched on (and left on) so that parse problems
     * can be read back through getErrors() instead of being emitted as warnings.
     *
     * @static
     *
     * @param string $input HTML content
     *
     * @return \DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DomDocument();

        if (empty($input)) {
            return $dom;
        }

        libxml_use_internal_errors(true);

        // The options argument of loadHTML() is only passed on PHP >= 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * The markup is loaded with an XML declaration prepended (presumably to make
     * libxml read it as UTF-8 - NOTE(review): confirm), then the first body
     * element is serialized.
     *
     * @static
     *
     * @param string $html HTML document
     *
     * @return string
     */
    public static function htmlToXml($html)
    {
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * @static
     *
     * @return string All pending libxml errors, joined with ", " (empty string when there are none)
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * Only the declaration itself is inspected: the text from the first
     * "<?xml" up to the first "?>" that follows it. Single or double quotes
     * and optional whitespace around "=" are accepted.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string Lower-cased encoding name, or an empty string when none is declared
     */
    public static function getEncodingFromXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start === false) {
            return '';
        }

        // The declaration ends at the first "?>" after its opening marker;
        // anything beyond that belongs to the document body.
        $end = strpos($data, '?>', $start);

        if ($end === false) {
            return '';
        }

        $declaration = substr($data, $start, $end - $start);

        if (preg_match('/\bencoding\s*=\s*(["\'])([^"\']*)\1/', $declaration, $match) === 1) {
            return strtolower($match[2]);
        }

        return '';
    }

    /**
     * Get the charset from a meta tag.
     *
     * The charset must appear inside the meta tag itself: the search never
     * runs past the tag's closing ">", but it may span line breaks within the tag.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string Lower-cased charset, or an empty string when none is found
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta[^>]*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * Every "prefix:name" step becomes a wildcard step filtered on namespace URI
     * and local name. The special prefix "xml" is left untouched. A prefix that
     * is missing from $ns is mapped to an empty namespace URI.
     *
     * @param string $query XPath query
     * @param array  $ns    Prefix to namespace URI mapping
     *
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        return preg_replace_callback('/([A-Z0-9]+):([A-Z0-9]+)/iu', function ($matches) use ($ns) {
            // don't try to map the special prefix XML
            if (strtolower($matches[1]) === 'xml') {
                return $matches[0];
            }

            // Guard the lookup so an unknown prefix does not raise an undefined-index notice
            $uri = isset($ns[$matches[1]]) ? $ns[$matches[1]] : '';

            return '*[namespace-uri()="'.$uri.'" and local-name()="'.$matches[2].'"]';
        },
        $query);
    }

    /**
     * Get the result elements of a XPath query.
     *
     * When $ns is not empty, prefixed steps of the query are first rewritten
     * with replaceXPathPrefixWithNamespaceURI().
     *
     * @param \SimpleXMLElement $xml   XML element
     * @param string            $query XPath query
     * @param array             $ns    Prefix to namespace URI mapping
     *
     * @return \SimpleXMLElement[] Matching nodes; an empty array when nothing matches
     *                             or when the query could not be evaluated
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        $result = $xml->xpath($query);

        // xpath() reports failures with a non-array value; callers feed the
        // result straight into current()/foreach, so hand them an empty list.
        return is_array($result) ? $result : array();
    }
}