Cleanup
This commit is contained in:
parent
cb491341df
commit
9947a5f033
391 changed files with 0 additions and 15712 deletions
|
@ -1,366 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use SimpleXMLElement;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Client\Url;
|
||||
|
||||
/**
 * Atom parser.
 *
 * Reads feed-level and entry-level values from an Atom document. Most lookups
 * are tried with the "atom:" namespace prefix first and then again without any
 * prefix.
 *
 * @author Frederic Guillot
 */
class Atom extends Parser
{
    /**
     * Supported namespaces.
     */
    protected $namespaces = array(
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Get the path to the items XML tree.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        return $this->queryWithFallback($xml, 'atom:entry', 'entry');
    }

    /**
     * Find the feed url (link with rel="self").
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $self_url = $this->getUrl($xml, 'self');
        $feed->feed_url = $self_url;
    }

    /**
     * Find the site url (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $site_url = $this->getUrl($xml, 'alternate', true);
        $feed->site_url = $site_url;
    }

    /**
     * Find the feed description (the "subtitle" element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:subtitle', 'subtitle');

        $feed->description = (string) current($nodes);
    }

    /**
     * Find the feed logo url (the "logo" element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:logo', 'logo');

        $feed->logo = (string) current($nodes);
    }

    /**
     * Find the feed icon (the "icon" element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:icon', 'icon');

        $feed->icon = (string) current($nodes);
    }

    /**
     * Find the feed title; the site url is used when the title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:title', 'title');
        $title = Filter::stripWhiteSpace((string) current($nodes));

        $feed->title = $title ?: $feed->getSiteUrl();
    }

    /**
     * Find the feed language.
     *
     * The xml:lang attribute of child elements other than entries is looked up
     * first, then the xml:lang attribute of the given element itself.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, '*[not(self::atom:entry)]/@xml:lang', '@xml:lang');

        $feed->language = (string) current($nodes);
    }

    /**
     * Find the feed id.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:id', 'id');

        $feed->id = (string) current($nodes);
    }

    /**
     * Find the feed date (the "updated" element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryWithFallback($xml, 'atom:updated', 'updated');

        $feed->date = $this->date->getDateTime((string) current($nodes));
    }

    /**
     * Find the item date.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $published_nodes = $this->queryWithFallback($entry, 'atom:published', 'published');
        $updated_nodes = $this->queryWithFallback($entry, 'atom:updated', 'updated');

        $published = null;
        $updated = null;

        if (!empty($published_nodes)) {
            $published = $this->date->getDateTime((string) current($published_nodes));
        }

        if (!empty($updated_nodes)) {
            $updated = $this->date->getDateTime((string) current($updated_nodes));
        }

        if ($published !== null && $updated !== null) {
            // Both dates are present: keep the most recent one
            $item->date = max($published, $updated);
        } elseif ($published === null && $updated === null) {
            // No date on the item: use the feed date
            $item->date = $feed->getDate();
        } else {
            // Exactly one of the two dates is present
            $item->date = $updated !== null ? $updated : $published;
        }
    }

    /**
     * Find the item title; the item url is used when the title is empty.
     *
     * @param SimpleXMLElement $entry Feed item
     * @param Item             $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $nodes = $this->queryWithFallback($entry, 'atom:title', 'title');
        $title = Filter::stripWhiteSpace((string) current($nodes));

        $item->title = $title ?: $item->url;
    }

    /**
     * Find the item author.
     *
     * The author name of the entry is looked up first, then the author name
     * of the feed.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $nodes = $this->queryWithFallback($entry, 'atom:author/atom:name', 'author/name');

        if (!$nodes) {
            $nodes = $this->queryWithFallback($xml, 'atom:author/atom:name', 'author/name');
        }

        $item->author = (string) current($nodes);
    }

    /**
     * Find the item content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $content = $this->getContent($entry);
        $item->content = $content;
    }

    /**
     * Find the item URL (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $url = $this->getUrl($entry, 'alternate', true);
        $item->url = $url;
    }

    /**
     * Generate the item id.
     *
     * The "id" element is hashed when present; otherwise the id is derived
     * from the item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = $this->queryWithFallback($entry, 'atom:id', 'id');

        if (empty($nodes)) {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );

            return;
        }

        $item->id = $this->generateId((string) current($nodes));
    }

    /**
     * Find the item enclosure (link with rel="enclosure").
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $link = $this->findLink($entry, 'enclosure');

        if (!$link) {
            return;
        }

        $item->enclosure_url = Url::resolve((string) $link['href'], $feed->getSiteUrl());
        $item->enclosure_type = (string) $link['type'];
    }

    /**
     * Find the item language; the feed language is used when none is found.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, './/@xml:lang');
        $language = (string) current($nodes);

        $item->language = $language ?: $feed->language;
    }

    /**
     * Get the URL from a link tag.
     *
     * @param SimpleXMLElement $xml      XML tag
     * @param string           $rel      Link relationship: alternate, enclosure, related, self, via
     * @param bool             $fallback When true, a link with an empty rel is used if no link matches $rel
     *
     * @return string
     */
    private function getUrl(SimpleXMLElement $xml, $rel, $fallback = false)
    {
        $link = $this->findLink($xml, $rel);

        if (!$link && $fallback) {
            $link = $this->findLink($xml, '');
        }

        return $link ? (string) $link['href'] : '';
    }

    /**
     * Get a link tag that matches a relationship.
     *
     * @param SimpleXMLElement $xml XML tag
     * @param string           $rel Link relationship: alternate, enclosure, related, self, via
     *
     * @return SimpleXMLElement|null
     */
    private function findLink(SimpleXMLElement $xml, $rel)
    {
        $candidates = $this->queryWithFallback($xml, 'atom:link', 'link');

        foreach ($candidates as $candidate) {
            if ((string) $candidate['rel'] === $rel) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Get the entry content.
     *
     * Child elements of the content tag are serialized when there are any,
     * otherwise the text of the content tag is used, and finally the summary.
     *
     * @param SimpleXMLElement $entry XML Entry
     *
     * @return string
     */
    private function getContent(SimpleXMLElement $entry)
    {
        $node = current($this->queryWithFallback($entry, 'atom:content', 'content'));

        if (!empty($node) && count($node->children())) {
            $markup = '';

            foreach ($node->children() as $child) {
                $markup .= $child->asXML();
            }

            return $markup;
        }

        if (trim((string) $node) !== '') {
            return (string) $node;
        }

        $summary = $this->queryWithFallback($entry, 'atom:summary', 'summary');

        return (string) current($summary);
    }

    /**
     * Run a namespaced XPath query and, when it yields nothing, a plain one.
     *
     * @param SimpleXMLElement $node     Element to query
     * @param string           $prefixed Query using the supported namespaces
     * @param string           $plain    Query used when the first one has no result
     *
     * @return mixed Result of XmlParser::getXPathResult()
     */
    private function queryWithFallback(SimpleXMLElement $node, $prefixed, $plain)
    {
        $result = XmlParser::getXPathResult($node, $prefixed, $this->namespaces);

        if ($result) {
            return $result;
        }

        return XmlParser::getXPathResult($node, $plain);
    }
}
|
|
@ -1,113 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use DateTime;
|
||||
use DateTimeZone;
|
||||
|
||||
/**
 * Date Parser.
 *
 * Tries a list of known date formats against a value and falls back to the
 * current date and time when none of them matches.
 *
 * @author Frederic Guillot
 */
class DateParser
{
    /**
     * Timezone used to parse feed dates.
     *
     * @var string
     */
    public $timezone = 'UTC';

    /**
     * Supported formats [ 'format' => length ].
     *
     * When the length is not null the value is truncated to that many
     * characters before the format is tried.
     *
     * @var array
     */
    public $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'j M Y H:i:s' => 20,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    /**
     * Try to parse all date format for broken feeds.
     *
     * @param string $value Original date format
     *
     * @return DateTime Parsed date, or the current date and time when no format matches
     */
    public function getDateTime($value)
    {
        // Cast first so that a null value does not reach trim()
        $value = trim((string) $value);

        foreach ($this->formats as $format => $length) {
            $truncated_value = $value;

            if ($length !== null) {
                $truncated_value = substr($truncated_value, 0, $length);
            }

            $date = $this->getValidDate($format, $truncated_value);

            if ($date !== false) {
                return $date;
            }
        }

        return $this->getCurrentDateTime();
    }

    /**
     * Get a valid date from a given format.
     *
     * A date is only accepted when it was parsed without any error or warning.
     *
     * @param string $format Date format
     * @param string $value  Original date value
     *
     * @return DateTime|bool The parsed date, or false when the value does not match the format
     */
    public function getValidDate($format, $value)
    {
        $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));

        if ($date === false) {
            return false;
        }

        $errors = DateTime::getLastErrors();

        // As of PHP 8.2 getLastErrors() returns false instead of an array
        // when the last parsing produced no warning and no error.
        if ($errors === false) {
            return $date;
        }

        if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
            return $date;
        }

        return false;
    }

    /**
     * Get the current datetime.
     *
     * @return DateTime
     */
    public function getCurrentDateTime()
    {
        return new DateTime('now', new DateTimeZone($this->timezone));
    }
}
|
|
@ -1,194 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * Feed.
 *
 * Plain container for the values extracted from a feed document.
 *
 * @author Frederic Guillot
 */
class Feed
{
    /**
     * Feed items.
     *
     * @var array
     */
    public $items = array();

    /**
     * Feed id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Feed title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Feed description.
     *
     * @var string
     */
    public $description = '';

    /**
     * Feed url.
     *
     * @var string
     */
    public $feed_url = '';

    /**
     * Site url.
     *
     * @var string
     */
    public $site_url = '';

    /**
     * Feed date.
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Feed language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Feed logo URL.
     *
     * @var string
     */
    public $logo = '';

    /**
     * Feed icon URL.
     *
     * @var string
     */
    public $icon = '';

    /**
     * Return feed information.
     *
     * @return string One "Feed::property = value" line per property, followed by every item
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'feed_url', 'site_url', 'language', 'description', 'logo') as $property) {
            $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date defaults to null, so it cannot be formatted unconditionally
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Feed::date = '.$date.PHP_EOL;
        $output .= 'Feed::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Feed::items = '.count($this->items).' items'.PHP_EOL;

        foreach ($this->items as $item) {
            $output .= '----'.PHP_EOL;
            $output .= $item;
        }

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get description.
     *
     * @return string
     */
    public function getDescription()
    {
        return $this->description;
    }

    /**
     * Get the logo url.
     *
     * @return string
     */
    public function getLogo()
    {
        return $this->logo;
    }

    /**
     * Get the icon url.
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->icon;
    }

    /**
     * Get feed url.
     *
     * @return string
     */
    public function getFeedUrl()
    {
        return $this->feed_url;
    }

    /**
     * Get site url.
     *
     * @return string
     */
    public function getSiteUrl()
    {
        return $this->site_url;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items.
     *
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }

    /**
     * Return true if the feed is "Right to Left".
     *
     * The decision is delegated to Parser::isLanguageRTL() with the feed language.
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}
|
|
@ -1,230 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * Feed Item.
 *
 * Plain container for the values extracted from one feed entry.
 *
 * @author Frederic Guillot
 */
class Item
{
    /**
     * List of known RTL languages.
     *
     * @var array
     */
    public $rtl = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    /**
     * Item id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Item title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Item url.
     *
     * @var string
     */
    public $url = '';

    /**
     * Item author.
     *
     * @var string
     */
    public $author = '';

    /**
     * Item date.
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Item content.
     *
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url.
     *
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type.
     *
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Raw XML.
     *
     * @var \SimpleXMLElement
     */
    public $xml;

    /**
     * List of namespaces.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * Get specific XML tag or attribute value.
     *
     * The tag is searched anywhere below the raw XML of the item.
     *
     * @param string $tag       Tag name (examples: guid, media:content)
     * @param string $attribute Tag attribute
     *
     * @return array|false Tag values or error
     */
    public function getTag($tag, $attribute = '')
    {
        // convert to xPath attribute query
        if ($attribute !== '') {
            $attribute = '/@'.$attribute;
        }

        // construct query
        $query = './/'.$tag.$attribute;
        $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);

        if ($elements === false) { // xPath error
            return false;
        }

        return array_map(function ($element) { return (string) $element; }, $elements);
    }

    /**
     * Return item information.
     *
     * @return string One "Item::property = value" line per property
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'url', 'language', 'author', 'enclosure_url', 'enclosure_type') as $property) {
            $output .= 'Item::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date defaults to null, so it cannot be formatted unconditionally
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Item::date = '.$date.PHP_EOL;
        $output .= 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL;

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url.
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content.
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url.
     *
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type.
     *
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author.
     *
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Return true if the item is "Right to Left".
     *
     * The decision is delegated to Parser::isLanguageRTL() with the item language.
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}
|
|
@ -1,12 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * MalformedXmlException Exception.
 *
 * Marker exception without any behaviour of its own; it only narrows
 * ParserException.
 *
 * @author Frederic Guillot
 */
class MalformedXmlException extends ParserException
{
}
|
|
@ -1,576 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use SimpleXMLElement;
|
||||
use PicoFeed\Client\Url;
|
||||
use PicoFeed\Encoding\Encoding;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Logging\Logger;
|
||||
use PicoFeed\Scraper\Scraper;
|
||||
|
||||
/**
|
||||
* Base parser class.
|
||||
*
|
||||
* @author Frederic Guillot
|
||||
*/
|
||||
abstract class Parser
|
||||
{
|
||||
/**
|
||||
* Config object.
|
||||
*
|
||||
* @var \PicoFeed\Config\Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* DateParser object.
|
||||
*
|
||||
* @var \PicoFeed\Parser\DateParser
|
||||
*/
|
||||
protected $date;
|
||||
|
||||
/**
|
||||
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
private $hash_algo = 'sha256';
|
||||
|
||||
/**
|
||||
* Feed content (XML data).
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
protected $content = '';
|
||||
|
||||
/**
|
||||
* Fallback url.
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
protected $fallback_url = '';
|
||||
|
||||
/**
|
||||
* XML namespaces supported by parser.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $namespaces = array();
|
||||
|
||||
/**
|
||||
* XML namespaces used in document.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $used_namespaces = array();
|
||||
|
||||
/**
|
||||
* Enable the content filtering.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $enable_filter = true;
|
||||
|
||||
/**
|
||||
* Enable the content grabber.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $enable_grabber = false;
|
||||
|
||||
/**
|
||||
* When true, only pages that have a rule file are scraped by the content grabber.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $grabber_needs_rule_file = false;
|
||||
|
||||
/**
|
||||
* Ignore those urls for the content scraper.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $grabber_ignore_urls = array();
|
||||
|
||||
/**
 * Constructor.
 *
 * Stores the feed content without its XML declaration and converts it using
 * the encoding declared in the XML tag, or the HTTP encoding when the XML tag
 * declares none.
 *
 * @param string $content       Feed content
 * @param string $http_encoding HTTP encoding (headers)
 * @param string $fallback_url  Fallback url when the feed provide relative or broken url
 */
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->fallback_url = $fallback_url;
    $this->date = new DateParser();

    // The declared encoding has to be read before the XML tag is removed
    $xml_encoding = XmlParser::getEncodingFromXmlTag($content);

    // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
    $this->content = Filter::stripXmlTag($content);

    Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');

    // Encode everything in UTF-8
    $source_encoding = $xml_encoding ?: $http_encoding;
    $this->content = Encoding::convert($this->content, $source_encoding);
}
|
||||
|
||||
/**
 * Parse the document.
 *
 * The content is parsed as XML, with a second attempt on normalized data when
 * the first one fails. Feed-level values are then extracted, followed by the
 * values of each item.
 *
 * @throws MalformedXmlException When the content cannot be parsed even after normalization
 *
 * @return \PicoFeed\Parser\Feed
 */
public function execute()
{
    $parser_name = get_called_class();

    Logger::setMessage($parser_name.': begin parsing');

    $xml = XmlParser::getSimpleXml($this->content);

    if ($xml === false) {
        // First attempt failed: normalize the data and try once more
        Logger::setMessage($parser_name.': Applying XML workarounds');
        $this->content = Filter::normalizeData($this->content);
        $xml = XmlParser::getSimpleXml($this->content);
    }

    if ($xml === false) {
        Logger::setMessage($parser_name.': XML parsing error');
        Logger::setMessage(XmlParser::getErrors());
        throw new MalformedXmlException('XML parsing error');
    }

    $this->used_namespaces = $xml->getNamespaces(true);
    $xml = $this->registerSupportedNamespaces($xml);

    $feed = new Feed();

    // The feed url is needed by checkSiteUrl() when the feed has no site url
    $this->findFeedUrl($xml, $feed);
    $this->checkFeedUrl($feed);

    $this->findSiteUrl($xml, $feed);
    $this->checkSiteUrl($feed);

    $this->findFeedTitle($xml, $feed);
    $this->findFeedDescription($xml, $feed);
    $this->findFeedLanguage($xml, $feed);
    $this->findFeedId($xml, $feed);
    $this->findFeedDate($xml, $feed);
    $this->findFeedLogo($xml, $feed);
    $this->findFeedIcon($xml, $feed);

    foreach ($this->getItemsTree($xml) as $entry) {
        $entry = $this->registerSupportedNamespaces($entry);

        $item = new Item();
        $item->xml = $entry;
        $item->namespaces = $this->used_namespaces;

        $this->findItemAuthor($xml, $entry, $item);

        $this->findItemUrl($entry, $item);
        $this->checkItemUrl($feed, $item);

        $this->findItemTitle($entry, $item);
        $this->findItemContent($entry, $item);

        // Id generation can use the item url/title/content (order is important)
        $this->findItemId($entry, $item, $feed);

        $this->findItemDate($entry, $item, $feed);
        $this->findItemEnclosure($entry, $item, $feed);
        $this->findItemLanguage($entry, $item, $feed);

        // Order is important (avoid double filtering)
        $this->filterItemContent($feed, $item);
        $this->scrapWebsite($item);

        $feed->items[] = $item;
    }

    Logger::setMessage($parser_name.PHP_EOL.$feed);

    return $feed;
}
|
||||
|
||||
/**
 * Check if the feed url is correct.
 *
 * An empty feed url is replaced by the fallback url; any other value is
 * resolved against the fallback url.
 *
 * @param Feed $feed Feed object
 */
public function checkFeedUrl(Feed $feed)
{
    $declared_url = $feed->getFeedUrl();

    $feed->feed_url = $declared_url === ''
        ? $this->fallback_url
        : Url::resolve($declared_url, $this->fallback_url);
}
|
||||
|
||||
/**
 * Check if the site url is correct.
 *
 * An empty site url is replaced by the base of the feed url; any other value
 * is resolved against the fallback url.
 *
 * @param Feed $feed Feed object
 */
public function checkSiteUrl(Feed $feed)
{
    $declared_url = $feed->getSiteUrl();

    if ($declared_url === '') {
        $feed->site_url = Url::base($feed->getFeedUrl());

        return;
    }

    $feed->site_url = Url::resolve($declared_url, $this->fallback_url);
}
|
||||
|
||||
/**
 * Check if the item url is correct.
 *
 * The item url is resolved against the site url of the feed.
 *
 * @param Feed $feed Feed object
 * @param Item $item Item object
 */
public function checkItemUrl(Feed $feed, Item $item)
{
    $item_url = $item->getUrl();
    $site_url = $feed->getSiteUrl();

    $item->url = Url::resolve($item_url, $site_url);
}
|
||||
|
||||
/**
 * Fetch item content with the content grabber.
 *
 * Nothing happens when the grabber is disabled or when the item url is part
 * of the ignored urls. The item content is only replaced when the grabber
 * reports relevant content.
 *
 * @param Item $item Item object
 */
public function scrapWebsite(Item $item)
{
    if (!$this->enable_grabber) {
        return;
    }

    // Strict comparison: an ignored url must match exactly, without type juggling
    if (in_array($item->getUrl(), $this->grabber_ignore_urls, true)) {
        return;
    }

    $grabber = new Scraper($this->config);
    $grabber->setUrl($item->getUrl());

    if ($this->grabber_needs_rule_file) {
        $grabber->disableCandidateParser();
    }

    $grabber->execute();

    if ($grabber->hasRelevantContent()) {
        $item->content = $grabber->getFilteredContent();
    }
}
|
||||
|
||||
/**
 * Filter HTML for entry content.
 *
 * When filtering is disabled the content is left untouched and a message is
 * logged instead.
 *
 * @param Feed $feed Feed object
 * @param Item $item Item object
 */
public function filterItemContent(Feed $feed, Item $item)
{
    if (!$this->isFilteringEnabled()) {
        Logger::setMessage(get_called_class().': Content filtering disabled');

        return;
    }

    $html_filter = Filter::html($item->getContent(), $feed->getSiteUrl());
    $html_filter->setConfig($this->config);

    $item->content = $html_filter->execute();
}
|
||||
|
||||
/**
 * Generate a unique id for an entry (hash all arguments).
 *
 * All arguments are concatenated without separator and hashed with the
 * configured hash algorithm.
 *
 * @return string
 */
public function generateId()
{
    $parts = func_get_args();

    return hash($this->hash_algo, implode('', $parts));
}
|
||||
|
||||
/**
 * Return true if the given language is "Right to Left".
 *
 * The language is lower-cased and compared against a list of known
 * right-to-left language codes; it matches when it starts with one of them.
 *
 * @static
 *
 * @param string $language Language: fr-FR, en-US
 *
 * @return bool
 */
public static function isLanguageRTL($language)
{
    $normalized = strtolower($language);

    $rtl_codes = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    foreach ($rtl_codes as $code) {
        if (strncmp($normalized, $code, strlen($code)) === 0) {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Set Hash algorithm used for id generation.
 *
 * An empty value leaves the current algorithm unchanged.
 *
 * @param string $algo Algorithm name
 *
 * @return \PicoFeed\Parser\Parser
 */
public function setHashAlgo($algo)
{
    if ($algo) {
        $this->hash_algo = $algo;
    }

    return $this;
}
|
||||
|
||||
/**
 * Set a different timezone.
 *
 * An empty value leaves the current timezone of the date parser unchanged.
 *
 * @see http://php.net/manual/en/timezones.php
 *
 * @param string $timezone Timezone
 *
 * @return \PicoFeed\Parser\Parser
 */
public function setTimezone($timezone)
{
    $this->date->timezone = $timezone ?: $this->date->timezone;

    return $this;
}
|
||||
|
||||
/**
 * Attach the configuration object used by this parser.
 *
 * The value is stored as given, without any validation.
 *
 * @param \PicoFeed\Config\Config $config Config instance
 *
 * @return \PicoFeed\Parser\Parser
 */
public function setConfig($config)
{
    $this->config = $config;

    return $this;
}
|
||||
|
||||
/**
 * Disable the content filtering.
 *
 * Only the local flag is changed: isFilteringEnabled() returns it directly
 * when no config object is set, and passes it to the config object otherwise.
 *
 * @return \PicoFeed\Parser\Parser
 */
public function disableContentFiltering()
{
    $this->enable_filter = false;

    // Return the parser itself, as documented and as the other setters do.
    return $this;
}
|
||||
|
||||
/**
 * Tell whether the item content must be filtered.
 *
 * When a config object is available the decision is delegated to it, the
 * local flag being passed as argument; otherwise the local flag is returned.
 *
 * @return bool
 */
public function isFilteringEnabled()
{
    if ($this->config !== null) {
        return $this->config->getContentFiltering($this->enable_filter);
    }

    return $this->enable_filter;
}
|
||||
|
||||
/**
 * Enable the content grabber.
 *
 * @param bool $needs_rule_file true if only pages with rule files should be
 *                              scraped
 *
 * @return \PicoFeed\Parser\Parser
 */
public function enableContentGrabber($needs_rule_file = false)
{
    $this->enable_grabber = true;
    $this->grabber_needs_rule_file = $needs_rule_file;

    // Return the parser itself, as documented and as the other setters do.
    return $this;
}
|
||||
|
||||
/**
 * Set ignored URLs for the content grabber.
 *
 * The list replaces the one previously stored.
 *
 * @param array $urls URLs
 *
 * @return \PicoFeed\Parser\Parser
 */
public function setGrabberIgnoreUrls(array $urls)
{
    $this->grabber_ignore_urls = $urls;

    // Return the parser itself, as documented and as the other setters do.
    return $this;
}
|
||||
|
||||
/**
 * Make every supported namespace prefix usable in XPath queries on the document.
 *
 * Each prefix => URI pair of $this->namespaces is registered on the element,
 * which is then returned.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement
 */
public function registerSupportedNamespaces(SimpleXMLElement $xml)
{
    foreach (array_keys($this->namespaces) as $prefix) {
        $xml->registerXPathNamespace($prefix, $this->namespaces[$prefix]);
    }

    return $xml;
}
|
||||
|
||||
/**
 * Determine the URL of the feed itself and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the URL of the website and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the title of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the description of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the language of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the identifier of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedId(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the date of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedDate(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the URL of the feed logo and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Determine the icon of the feed and store it on the Feed object.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
abstract public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);
|
||||
|
||||
/**
 * Return the part of the XML tree that holds the feed items.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement
 */
abstract public function getItemsTree(SimpleXMLElement $xml);
|
||||
|
||||
/**
 * Determine the author of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $xml   Feed
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
abstract public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);
|
||||
|
||||
/**
 * Determine the URL of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
abstract public function findItemUrl(SimpleXMLElement $entry, Item $item);
|
||||
|
||||
/**
 * Determine the title of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
abstract public function findItemTitle(SimpleXMLElement $entry, Item $item);
|
||||
|
||||
/**
 * Generate the identifier of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
abstract public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);
|
||||
|
||||
/**
 * Determine the date of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
abstract public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed);
|
||||
|
||||
/**
 * Determine the content of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 */
abstract public function findItemContent(SimpleXMLElement $entry, Item $item);
|
||||
|
||||
/**
 * Determine the enclosure of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
abstract public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);
|
||||
|
||||
/**
 * Determine the language of an item and store it on the Item object.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param \PicoFeed\Parser\Item $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
abstract public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
|
||||
}
|
|
@ -1,14 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use PicoFeed\PicoFeedException;
|
||||
|
||||
/**
 * ParserException Exception.
 *
 * Abstract exception type of the parser namespace: it adds nothing to
 * PicoFeedException and cannot be instantiated directly.
 *
 * @author Frederic Guillot
 */
abstract class ParserException extends PicoFeedException
{
}
|
|
@ -1,270 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use SimpleXMLElement;
|
||||
use PicoFeed\Filter\Filter;
|
||||
|
||||
/**
 * RSS 1.0 parser.
 *
 * Most lookups are tried first with the namespaced element names and, when
 * nothing is found, tried again with the bare element names.
 *
 * @author Frederic Guillot
 */
class Rss10 extends Parser
{
    /**
     * Namespace prefixes (and their URI) used in the XPath queries of this parser.
     */
    protected $namespaces = array(
        'rss' => 'http://purl.org/rss/1.0/',
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
    );

    /**
     * Return the item elements of the document.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return array|false|null XPath matches as returned by XmlParser::getXPathResult()
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'rss:item', $this->namespaces);

        if (!$items) {
            $items = XmlParser::getXPathResult($xml, 'item');
        }

        return $items;
    }

    /**
     * Set the feed url: this parser does not look for one, an empty string is stored.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->feed_url = '';
    }

    /**
     * Read the site url from the channel link element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $links = XmlParser::getXPathResult($xml, 'rss:channel/rss:link', $this->namespaces);

        if (!$links) {
            $links = XmlParser::getXPathResult($xml, 'channel/link');
        }

        $feed->site_url = (string) current($links);
    }

    /**
     * Read the feed description from the channel description element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $descriptions = XmlParser::getXPathResult($xml, 'rss:channel/rss:description', $this->namespaces);

        if (!$descriptions) {
            $descriptions = XmlParser::getXPathResult($xml, 'channel/description');
        }

        $feed->description = (string) current($descriptions);
    }

    /**
     * Read the feed logo url from the image element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $logos = XmlParser::getXPathResult($xml, 'rss:image/rss:url', $this->namespaces);

        if (!$logos) {
            $logos = XmlParser::getXPathResult($xml, 'image/url');
        }

        $feed->logo = (string) current($logos);
    }

    /**
     * Set the feed icon: this parser does not look for one, an empty string is stored.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->icon = '';
    }

    /**
     * Read the feed title from the channel, falling back to the site url
     * when the cleaned title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $titles = XmlParser::getXPathResult($xml, 'rss:channel/rss:title', $this->namespaces);

        if (!$titles) {
            $titles = XmlParser::getXPathResult($xml, 'channel/title');
        }

        $clean_title = Filter::stripWhiteSpace((string) current($titles));

        $feed->title = $clean_title ?: $feed->getSiteUrl();
    }

    /**
     * Read the feed language from the dc:language element of the channel.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($xml, 'rss:channel/dc:language', $this->namespaces);

        if (!$languages) {
            $languages = XmlParser::getXPathResult($xml, 'channel/dc:language', $this->namespaces);
        }

        $feed->language = (string) current($languages);
    }

    /**
     * Use the feed url as feed id, or the site url when the feed url is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $id = $feed->getFeedUrl();

        if (!$id) {
            $id = $feed->getSiteUrl();
        }

        $feed->id = $id;
    }

    /**
     * Read the feed date from the dc:date element of the channel.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $dates = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces);

        if (!$dates) {
            $dates = XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
        }

        $feed->date = $this->date->getDateTime((string) current($dates));
    }

    /**
     * Read the item date from its dc:date element; the feed date is used
     * when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $dates = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);

        if (empty($dates)) {
            $item->date = $feed->getDate();
        } else {
            $item->date = $this->date->getDateTime((string) current($dates));
        }
    }

    /**
     * Read the item title, falling back to the item url when the cleaned
     * title is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $titles = XmlParser::getXPathResult($entry, 'rss:title', $this->namespaces);

        if (!$titles) {
            $titles = XmlParser::getXPathResult($entry, 'title');
        }

        $clean_title = Filter::stripWhiteSpace((string) current($titles));

        $item->title = $clean_title ?: $item->url;
    }

    /**
     * Read the item author from dc:creator, looking at the item first and
     * then at the channel.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $authors = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'rss:channel/dc:creator', $this->namespaces);
        }

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }

        $item->author = (string) current($authors);
    }

    /**
     * Read the item content from content:encoded, or from the description
     * when content:encoded is missing or blank.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);

        if (trim((string) current($nodes)) === '') {
            $nodes = XmlParser::getXPathResult($entry, 'rss:description', $this->namespaces);

            if (!$nodes) {
                $nodes = XmlParser::getXPathResult($entry, 'description');
            }
        }

        $item->content = (string) current($nodes);
    }

    /**
     * Read the item url: feedburner:origLink is preferred over the link element.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $links = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'rss:link', $this->namespaces);
        }

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'link');
        }

        $item->url = trim((string) current($links));
    }

    /**
     * Generate the item id by hashing the item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $title = $item->getTitle();
        $url = $item->getUrl();
        $content = $item->getContent();

        $item->id = $this->generateId($title, $url, $content);
    }

    /**
     * Find the item enclosure: this parser does nothing here.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
    }

    /**
     * Read the item language from its dc:language element; the feed
     * language is used when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $code = (string) current($languages);

        $item->language = $code ?: $feed->language;
    }
}
|
|
@ -1,291 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use SimpleXMLElement;
|
||||
use PicoFeed\Filter\Filter;
|
||||
use PicoFeed\Client\Url;
|
||||
|
||||
/**
 * RSS 2.0 Parser.
 *
 * Feed level values are read below the channel element; a few item level
 * values also look at namespaced extensions (dc, content, feedburner, atom).
 *
 * @author Frederic Guillot
 */
class Rss20 extends Parser
{
    /**
     * Namespace prefixes (and their URI) used in the XPath queries of this parser.
     */
    protected $namespaces = array(
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Return the item elements of the channel.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return array|false|null XPath matches as returned by XmlParser::getXPathResult()
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'channel/item');

        return $items;
    }

    /**
     * Set the feed url: this parser does not look for one, an empty string is stored.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->feed_url = '';
    }

    /**
     * Read the site url from the channel link element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $links = XmlParser::getXPathResult($xml, 'channel/link');

        $feed->site_url = (string) current($links);
    }

    /**
     * Read the feed description from the channel description element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $descriptions = XmlParser::getXPathResult($xml, 'channel/description');

        $feed->description = (string) current($descriptions);
    }

    /**
     * Read the feed logo url from the channel image element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $logos = XmlParser::getXPathResult($xml, 'channel/image/url');

        $feed->logo = (string) current($logos);
    }

    /**
     * Set the feed icon: this parser does not look for one, an empty string is stored.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->icon = '';
    }

    /**
     * Read the feed title from the channel, falling back to the site url
     * when the cleaned title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $titles = XmlParser::getXPathResult($xml, 'channel/title');
        $clean_title = Filter::stripWhiteSpace((string) current($titles));

        $feed->title = $clean_title ?: $feed->getSiteUrl();
    }

    /**
     * Read the feed language from the channel language element.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($xml, 'channel/language');

        $feed->language = (string) current($languages);
    }

    /**
     * Use the feed url as feed id, or the site url when the feed url is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $id = $feed->getFeedUrl();

        if (!$id) {
            $id = $feed->getSiteUrl();
        }

        $feed->id = $id;
    }

    /**
     * Compute the feed date from the channel pubDate and lastBuildDate.
     *
     * With both dates the most recent one is kept, with a single date that
     * one is kept, and without any date the current date is used.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $publish_nodes = XmlParser::getXPathResult($xml, 'channel/pubDate');
        $update_nodes = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');

        $published = null;
        if (!empty($publish_nodes)) {
            $published = $this->date->getDateTime((string) current($publish_nodes));
        }

        $updated = null;
        if (!empty($update_nodes)) {
            $updated = $this->date->getDateTime((string) current($update_nodes));
        }

        if ($published !== null && $updated !== null) {
            // Both dates are known: keep the most recent one
            $feed->date = max($published, $updated);
        } elseif ($published === null && $updated === null) {
            // No date at all for the feed: use the current date
            $feed->date = $this->date->getCurrentDateTime();
        } else {
            $feed->date = $updated ?: $published;
        }
    }

    /**
     * Read the item date from its pubDate element; the feed date is used
     * when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $dates = XmlParser::getXPathResult($entry, 'pubDate');

        if (empty($dates)) {
            $item->date = $feed->getDate();
        } else {
            $item->date = $this->date->getDateTime((string) current($dates));
        }
    }

    /**
     * Read the item title, falling back to the item url when the cleaned
     * title is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $titles = XmlParser::getXPathResult($entry, 'title');
        $clean_title = Filter::stripWhiteSpace((string) current($titles));

        $item->title = $clean_title ?: $item->url;
    }

    /**
     * Read the item author.
     *
     * Lookup order: dc:creator and author of the item, then dc:creator and
     * managingEditor of the channel.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $authors = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

        if (!$authors) {
            $authors = XmlParser::getXPathResult($entry, 'author');
        }

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }

        if (!$authors) {
            $authors = XmlParser::getXPathResult($xml, 'channel/managingEditor');
        }

        $item->author = (string) current($authors);
    }

    /**
     * Read the item content from content:encoded, or from the description
     * when content:encoded is missing or blank.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);
        $encoded = trim((string) current($nodes));

        if ($encoded === '') {
            $nodes = XmlParser::getXPathResult($entry, 'description');
        }

        $item->content = (string) current($nodes);
    }

    /**
     * Read the item url.
     *
     * Lookup order: feedburner:origLink, link, href of atom:link. When none
     * is found the guid is used, but only if it is a valid url; otherwise
     * the item url is left untouched.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $links = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'link');
        }

        if (!$links) {
            $links = XmlParser::getXPathResult($entry, 'atom:link/@href', $this->namespaces);
        }

        if (!empty($links)) {
            $item->url = trim((string) current($links));

            return;
        }

        $guids = XmlParser::getXPathResult($entry, 'guid');
        $guid = trim((string) current($guids));

        if (filter_var($guid, FILTER_VALIDATE_URL) !== false) {
            $item->url = $guid;
        }
    }

    /**
     * Generate the item id.
     *
     * The guid is hashed when there is one, otherwise the item title, url
     * and content are hashed together.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $guid = (string) current(XmlParser::getXPathResult($entry, 'guid'));

        if (!$guid) {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );

            return;
        }

        $item->id = $this->generateId($guid);
    }

    /**
     * Read the enclosure url and type of an item that has an enclosure element.
     *
     * feedburner:origEnclosureLink is preferred over the url attribute, and
     * the url is resolved against the site url of the feed.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        if (!isset($entry->enclosure)) {
            return;
        }

        $urls = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

        if (!$urls) {
            $urls = XmlParser::getXPathResult($entry, 'enclosure/@url');
        }

        $types = XmlParser::getXPathResult($entry, 'enclosure/@type');

        $item->enclosure_url = Url::resolve((string) current($urls), $feed->getSiteUrl());
        $item->enclosure_type = (string) current($types);
    }

    /**
     * Read the item language from its dc:language element; the feed
     * language is used when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $languages = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $code = (string) current($languages);

        $item->language = $code ?: $feed->language;
    }
}
|
|
@ -1,12 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * RSS 0.91 Parser.
 *
 * Adds nothing of its own: the whole behaviour is inherited from Rss20.
 *
 * @author Frederic Guillot
 */
class Rss91 extends Rss20
{
}
|
|
@ -1,12 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * RSS 0.92 Parser.
 *
 * Adds nothing of its own: the whole behaviour is inherited from Rss20.
 *
 * @author Frederic Guillot
 */
class Rss92 extends Rss20
{
}
|
|
@ -1,12 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
/**
 * XmlEntityException Exception.
 *
 * Specialisation of MalformedXmlException that adds nothing of its own.
 *
 * @author Bernhard Posselt
 */
class XmlEntityException extends MalformedXmlException
{
}
|
|
@ -1,229 +0,0 @@
|
|||
<?php
|
||||
|
||||
namespace PicoFeed\Parser;
|
||||
|
||||
use DomDocument;
|
||||
use SimpleXmlElement;
|
||||
use Exception;
|
||||
|
||||
use ZendXml\Security;
|
||||
|
||||
/**
 * XML parser class.
 *
 * Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
 * by loading them through ZendXml\Security, and provides a few helpers:
 * HTML loading, encoding detection and namespace-aware XPath queries.
 *
 * @author Frederic Guillot
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @throws XmlEntityException when ZendXml raises a runtime exception
     *
     * @return mixed
     */
    public static function getSimpleXml($input)
    {
        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * False is returned for an empty input and for a document without any
     * child node.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @throws XmlEntityException when ZendXml raises a runtime exception
     *
     * @return \DOMDocument|false
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into picoFeed
     * exceptions.
     *
     * @param string            $input the xml to load
     * @param \DOMDocument|null $dom   pass in a dom document or use null/omit if simpleXml should
     *                                 be used
     *
     * @throws XmlEntityException when ZendXml raises a runtime exception
     *
     * @return mixed whatever ZendXml\Security::scan() returns
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch (\ZendXml\Exception\RuntimeException $e) {
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load HTML document by using a DomDocument instance.
     *
     * An empty DomDocument is returned for an empty input. The libxml
     * errors are collected internally (see getErrors()) and that setting is
     * not restored afterwards.
     *
     * @static
     *
     * @param string $input HTML content
     *
     * @return \DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DOMDocument();

        if (empty($input)) {
            return $dom;
        }

        libxml_use_internal_errors(true);

        // The options argument of loadHTML() is only passed on PHP >= 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * The HTML is prefixed with an XML prolog declaring UTF-8 before being
     * loaded, and the first body element is serialized.
     *
     * @static
     *
     * @param string $html HTML document
     *
     * @return string
     */
    public static function htmlToXml($html)
    {
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * @static
     *
     * @return string the libxml errors, one sentence per error, joined by ", "
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * Only the XML declaration is inspected, i.e. the text between the first
     * "<?xml" and the processing instruction end that follows it. Single or
     * double quotes and spaces around the equal sign are accepted.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string lowercased encoding, or an empty string when none is declared
     */
    public static function getEncodingFromXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start === false) {
            return '';
        }

        /*
         * Stop at the first end marker after the declaration start: a later
         * processing instruction must not be mistaken for the declaration,
         * and a missing end marker means there is nothing reliable to read.
         */
        $end = strpos($data, '?>', $start);

        if ($end === false) {
            return '';
        }

        $declaration = substr($data, $start, $end - $start);

        if (preg_match('/\bencoding\s*=\s*(["\'])(.*?)\1/i', $declaration, $match) === 1) {
            return strtolower($match[2]);
        }

        return '';
    }

    /**
     * Get the charset from a meta tag.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string lowercased charset, or an empty string when none is found
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * Every "prefix:name" step is replaced by a test on the namespace URI and
     * the local name, except for the special prefix "xml". A prefix missing
     * from the mapping is rewritten with an empty namespace URI.
     *
     * @param string $query XPath query
     * @param array  $ns    Prefix to namespace URI mapping
     *
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        return preg_replace_callback('/([A-Z0-9]+):([A-Z0-9]+)/iu', function ($matches) use ($ns) {
            // don't try to map the special prefix XML
            if (strtolower($matches[1]) === 'xml') {
                return $matches[0];
            }

            // Same output as before for an unknown prefix, without the undefined index notice
            $uri = isset($ns[$matches[1]]) ? $ns[$matches[1]] : '';

            return '*[namespace-uri()="'.$uri.'" and local-name()="'.$matches[2].'"]';
        },
        $query);
    }

    /**
     * Get the result elements of a XPath query.
     *
     * The query is rewritten with replaceXPathPrefixWithNamespaceURI() when
     * a namespace mapping is given, and run as it is otherwise.
     *
     * @param \SimpleXMLElement $xml   XML element
     * @param string            $query XPath query
     * @param array             $ns    Prefix to namespace URI mapping
     *
     * @return \SimpleXMLElement[]|false|null the value returned by SimpleXMLElement::xpath()
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        return $xml->xpath($query);
    }
}
|
Reference in a new issue