composer update

This commit is contained in:
Marcel Kapfer (mmk2410) 2016-12-30 00:04:12 +01:00
parent 9ac51e0523
commit 623395064f
279 changed files with 4458 additions and 16328 deletions

View file

@ -2,6 +2,7 @@
namespace PicoFeed\Client;
use DateTime;
use LogicException;
use PicoFeed\Logging\Logger;
use PicoFeed\Config\Config;
@ -55,6 +56,13 @@ abstract class Client
*/
protected $last_modified = '';
/**
* Expiration DateTime
*
* @var DateTime
*/
protected $expiration = null;
/**
* Proxy hostname.
*
@ -214,6 +222,9 @@ abstract class Client
$this->handleErrorResponse($response);
$this->handleNormalResponse($response);
$this->expiration = $this->parseExpiration($response['headers']);
Logger::setMessage(get_called_class().' Expiration: '.$this->expiration->format(DATE_ISO8601));
return $this;
}
@ -241,6 +252,9 @@ abstract class Client
* Handle Http Error codes
*
* @param array $response Client response
* @throws ForbiddenException
* @throws InvalidUrlException
* @throws UnauthorizedException
*/
protected function handleErrorResponse(array $response)
{
@ -402,13 +416,12 @@ abstract class Client
/**
* Set the url.
*
* Stores the given value in the client's url property and returns the client itself.
*
* @param string $url URL to store (assumed to be a string; TODO confirm against callers)
* @return \PicoFeed\Client\Client The current instance, so calls can be chained
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
@ -670,4 +683,31 @@ abstract class Client
{
return $code == 301 || $code == 302 || $code == 303 || $code == 307;
}
/**
 * Compute the expiration date of a response from its caching headers.
 *
 * Headers are checked in this order: "Cache-Control: s-maxage", then
 * "Cache-Control: max-age", then "Expires". When none of them is present,
 * or when the value cannot be turned into a date, the current date/time is
 * returned so the response is considered already expired.
 *
 * @param  HttpHeaders $headers HTTP response headers
 * @return DateTime
 */
public function parseExpiration(HttpHeaders $headers)
{
    try {
        if (isset($headers['Cache-Control'])) {
            if (preg_match('/s-maxage=(\d+)/', $headers['Cache-Control'], $matches)) {
                return new DateTime('+' . $matches[1] . ' seconds');
            }

            if (preg_match('/max-age=(\d+)/', $headers['Cache-Control'], $matches)) {
                return new DateTime('+' . $matches[1] . ' seconds');
            }
        }

        if (! empty($headers['Expires'])) {
            return new DateTime($headers['Expires']);
        }
    } catch (\Exception $e) {
        // DateTime throws on values it cannot parse (for example "Expires: -1"
        // or an absurdly large max-age). A bad caching header must not abort
        // the whole request, so log it and fall through to the default.
        Logger::setMessage(get_called_class().' Unable to parse expiration date: '.$e->getMessage());
    }

    return new DateTime();
}
/**
 * Return the expiration date/time stored on this client.
 *
 * When no expiration has been stored yet, a DateTime for the current
 * moment is returned instead.
 *
 * @return DateTime
 */
public function getExpiration()
{
    if ($this->expiration) {
        return $this->expiration;
    }

    return new DateTime();
}
}

View file

@ -11,6 +11,8 @@ use PicoFeed\Logging\Logger;
*/
class Curl extends Client
{
protected $nbRedirects = 0;
/**
* HTTP response body.
*
@ -136,6 +138,7 @@ class Curl extends Client
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
$headers[] = 'A-IM: feed';
}
if ($this->last_modified) {
@ -199,6 +202,9 @@ class Curl extends Client
*/
private function prepareDownloadMode($ch)
{
$this->body = '';
$this->response_headers = array();
$this->response_headers_count = 0;
$write_function = 'readBody';
$header_function = 'readHeaders';
@ -304,12 +310,11 @@ class Curl extends Client
* Handle HTTP redirects
*
* @param string $location Redirected URL
*
* @return array
* @throws MaxRedirectException
*/
private function handleRedirection($location)
{
$nb_redirects = 0;
$result = array();
$this->url = Url::resolve($location, $this->url);
$this->body = '';
@ -318,9 +323,9 @@ class Curl extends Client
$this->response_headers_count = 0;
while (true) {
++$nb_redirects;
$this->nbRedirects++;
if ($nb_redirects >= $this->max_redirects) {
if ($this->nbRedirects >= $this->max_redirects) {
throw new MaxRedirectException('Maximum number of redirections reached');
}

View file

@ -31,6 +31,7 @@ class Stream extends Client
if ($this->etag) {
$headers[] = 'If-None-Match: '.$this->etag;
$headers[] = 'A-IM: feed';
}
if ($this->last_modified) {
@ -104,6 +105,9 @@ class Stream extends Client
* Do the HTTP request.
*
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
* @throws InvalidUrlException
* @throws MaxSizeException
* @throws TimeoutException
*/
public function doRequest()
{

View file

@ -51,6 +51,7 @@ class Attribute
'td' => array(),
'tbody' => array(),
'thead' => array(),
'h1' => array(),
'h2' => array(),
'h3' => array(),
'h4' => array(),

View file

@ -42,6 +42,7 @@ class Tag extends Base
'td',
'tbody',
'thead',
'h1',
'h2',
'h3',
'h4',
@ -67,6 +68,8 @@ class Tag extends Base
'abbr',
'iframe',
'q',
'sup',
'sub',
);
/**

View file

@ -9,6 +9,7 @@ use PicoFeed\Client\Url;
/**
* Atom parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Atom extends Parser
@ -154,30 +155,33 @@ class Atom extends Parser
}
/**
* Find the item date.
* Find the item published date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
public function findItemPublishedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$published = XmlParser::getXPathResult($entry, 'atom:published', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'published');
$date = XmlParser::getXPathResult($entry, 'atom:published', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'published');
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated');
$item->setPublishedDate(!empty($date) ? $this->getDateParser()->getDateTime((string) current($date)) : null);
}
$published = !empty($published) ? $this->getDateParser()->getDateTime((string) current($published)) : null;
$updated = !empty($updated) ? $this->getDateParser()->getDateTime((string) current($updated)) : null;
/**
* Find the item updated date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemUpdatedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$date = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated');
if ($published === null && $updated === null) {
$item->setDate($feed->getDate()); // We use the feed date if there is no date for the item
} elseif ($published !== null && $updated !== null) {
$item->setDate(max($published, $updated)); // We use the most recent date between published and updated
} else {
$item->setDate($updated ?: $published);
}
$item->setUpdatedDate(!empty($date) ? $this->getDateParser()->getDateTime((string) current($date)) : null);
}
/**

View file

@ -9,6 +9,7 @@ use PicoFeed\Base;
/**
* Date Parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class DateParser extends Base

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* Feed.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Feed
@ -12,7 +13,7 @@ class Feed
/**
* Feed items.
*
* @var array
* @var Item[]
*/
public $items = array();

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* Feed Item.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Item
@ -60,6 +61,20 @@ class Item
*/
public $date = null;
/**
* Item published date.
*
* @var \DateTime
*/
public $publishedDate = null;
/**
* Item updated date.
*
* @var \DateTime
*/
public $updatedDate = null;
/**
* Item content.
*
@ -151,7 +166,12 @@ class Item
$output .= 'Item::'.$property.' = '.$this->$property.PHP_EOL;
}
$publishedDate = $this->publishedDate != null ? $this->publishedDate->format(DATE_RFC822) : null;
$updatedDate = $this->updatedDate != null ? $this->updatedDate->format(DATE_RFC822) : null;
$output .= 'Item::date = '.$this->date->format(DATE_RFC822).PHP_EOL;
$output .= 'Item::publishedDate = '.$publishedDate.PHP_EOL;
$output .= 'Item::updatedDate = '.$updatedDate.PHP_EOL;
$output .= 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
$output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL;
@ -212,6 +232,26 @@ class Item
return $this->date;
}
/**
* Get published date.
*
* @return \DateTime|null Published date, or null when none has been set
*/
public function getPublishedDate()
{
return $this->publishedDate;
}
/**
* Get updated date.
*
* @return \DateTime|null Updated date, or null when none has been set
*/
public function getUpdatedDate()
{
return $this->updatedDate;
}
/**
* Get content.
*
@ -333,6 +373,30 @@ class Item
return $this;
}
/**
* Set item published date.
*
* @param \DateTime|null $publishedDate Published date; parsers pass null when the feed entry has none
* @return Item The current item, so calls can be chained
*/
public function setPublishedDate($publishedDate)
{
$this->publishedDate = $publishedDate;
return $this;
}
/**
* Set item updated date.
*
* @param \DateTime|null $updatedDate Updated date; parsers pass null when the feed entry has none
* @return Item The current item, so calls can be chained
*/
public function setUpdatedDate($updatedDate)
{
$this->updatedDate = $updatedDate;
return $this;
}
/**
* Set enclosure url.
*

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* MalformedXmlException Exception.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class MalformedXmlException extends ParserException

View file

@ -15,9 +15,10 @@ use PicoFeed\Logging\Logger;
/**
* Base parser class.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
abstract class Parser
abstract class Parser implements ParserInterface
{
/**
* Config object.
@ -211,6 +212,32 @@ abstract class Parser
$item->url = Url::resolve($item->getUrl(), $feed->getSiteUrl());
}
/**
 * Resolve the published, updated and effective dates of an item.
 *
 * Both format-specific lookups run first. A missing published date is then
 * filled from the updated date (or from the feed date when that is missing
 * too), a missing updated date is filled from the published date, and the
 * item date is set to the greater of the two.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param Item                  $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $this->findItemPublishedDate($entry, $item, $feed);
    $this->findItemUpdatedDate($entry, $item, $feed);

    // No published date: prefer the updated date, else the feed date
    if (null === $item->getPublishedDate()) {
        $fallback = $item->getUpdatedDate();

        if (! $fallback) {
            $fallback = $feed->getDate();
        }

        $item->setPublishedDate($fallback);
    }

    // No updated date: reuse the published date
    if (null === $item->getUpdatedDate()) {
        $item->setUpdatedDate($item->getPublishedDate());
    }

    // The item date is the most recent of the two
    $published = $item->getPublishedDate();
    $updated = $item->getUpdatedDate();
    $item->setDate(max($published, $updated));
}
/**
* Get Item Post Processor instance
*
@ -371,153 +398,5 @@ abstract class Parser
return $xml;
}
/**
* Find the feed url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the site url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed title.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed description.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed language.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed id.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedId(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed date.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedDate(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed logo url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed icon.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);
/**
* Get the path to the items XML tree.
*
* @param SimpleXMLElement $xml Feed xml
*
* @return SimpleXMLElement
*/
abstract public function getItemsTree(SimpleXMLElement $xml);
/**
* Find the item author.
*
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);
/**
* Find the item URL.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemUrl(SimpleXMLElement $entry, Item $item);
/**
* Find the item title.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemTitle(SimpleXMLElement $entry, Item $item);
/**
* Generate the item id.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item content.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemContent(SimpleXMLElement $entry, Item $item);
/**
* Find the item enclosure.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item language.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
}

View file

@ -7,6 +7,7 @@ use PicoFeed\PicoFeedException;
/**
* ParserException Exception.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
abstract class ParserException extends PicoFeedException

View file

@ -0,0 +1,173 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
/**
* Interface ParserInterface
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
interface ParserInterface
{
/**
* Find the feed url.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the site url.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed title.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed description.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed language.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed id.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedId(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed date.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedDate(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed logo url.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed icon.
*
* @param SimpleXMLElement $xml Feed xml
* @param Feed $feed Feed object
*/
public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);
/**
* Get the path to the items XML tree.
*
* @param SimpleXMLElement $xml Feed xml
*
* @return SimpleXMLElement
*/
public function getItemsTree(SimpleXMLElement $xml);
/**
* Find the item author.
*
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
*/
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);
/**
* Find the item URL.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
*/
public function findItemUrl(SimpleXMLElement $entry, Item $item);
/**
* Find the item title.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
*/
public function findItemTitle(SimpleXMLElement $entry, Item $item);
/**
* Generate the item id.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item published date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param Feed $feed Feed object
*/
public function findItemPublishedDate(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item updated date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param Feed $feed Feed object
*/
public function findItemUpdatedDate(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item content.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
*/
public function findItemContent(SimpleXMLElement $entry, Item $item);
/**
* Find the item enclosure.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item language.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
}

View file

@ -8,6 +8,7 @@ use PicoFeed\Filter\Filter;
/**
* RSS 1.0 parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Rss10 extends Parser
@ -157,17 +158,32 @@ class Rss10 extends Parser
}
/**
* Find the item date.
* Find the item published date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
public function findItemPublishedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$date = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);
$item->setDate(empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime(XmlParser::getValue($date)));
$item->setPublishedDate(!empty($date) ? $this->getDateParser()->getDateTime(XmlParser::getValue($date)) : null);
}
/**
 * Set the item updated date from its published date.
 *
 * RSS 1.0 specifications define no updated date, so the published date is
 * reused. The published date is looked up first when the item does not
 * have one yet.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param Item                  $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemUpdatedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $publishedDateMissing = null === $item->publishedDate;

    if ($publishedDateMissing) {
        $this->findItemPublishedDate($entry, $item, $feed);
    }

    $sameAsPublished = $item->getPublishedDate();
    $item->setUpdatedDate($sameAsPublished);
}
/**

View file

@ -9,6 +9,7 @@ use PicoFeed\Client\Url;
/**
* RSS 2.0 Parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Rss20 extends Parser
@ -152,17 +153,32 @@ class Rss20 extends Parser
}
/**
* Find the item date.
* Find the item published date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
public function findItemPublishedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$date = XmlParser::getXPathResult($entry, 'pubDate');
$item->setDate(empty($date) ? $feed->getDate() : $this->getDateParser()->getDateTime(XmlParser::getValue($date)));
$item->setPublishedDate(!empty($date) ? $this->getDateParser()->getDateTime(XmlParser::getValue($date)) : null);
}
/**
 * Set the item updated date from its published date.
 *
 * RSS 2.0 specifications define no updated date, so the published date is
 * reused. The published date is looked up first when the item does not
 * have one yet.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param Item                  $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemUpdatedDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $publishedDateMissing = null === $item->publishedDate;

    if ($publishedDateMissing) {
        $this->findItemPublishedDate($entry, $item, $feed);
    }

    $sameAsPublished = $item->getPublishedDate();
    $item->setUpdatedDate($sameAsPublished);
}
/**

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* RSS 0.91 Parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Rss91 extends Rss20

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* RSS 0.92 Parser.
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class Rss92 extends Rss20

View file

@ -5,6 +5,7 @@ namespace PicoFeed\Parser;
/**
* XmlEntityException Exception.
*
* @package PicoFeed\Parser
* @author Bernhard Posselt
*/
class XmlEntityException extends MalformedXmlException

View file

@ -2,9 +2,9 @@
namespace PicoFeed\Parser;
use DomDocument;
use SimpleXmlElement;
use DOMDocument;
use SimpleXMLElement;
use ZendXml\Exception\RuntimeException;
use ZendXml\Security;
/**
@ -12,6 +12,7 @@ use ZendXml\Security;
*
* Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
*
* @package PicoFeed\Parser
* @author Frederic Guillot
*/
class XmlParser
@ -33,7 +34,7 @@ class XmlParser
*
* @static
* @param string $input XML content
* @return \DOMDocument
* @return DOMDocument
*/
public static function getDomDocument($input)
{
@ -52,18 +53,20 @@ class XmlParser
}
/**
* Small wrapper around ZendXml to turn their exceptions into picoFeed
* exceptions
* Small wrapper around ZendXml to turn their exceptions into PicoFeed exceptions
*
* @param $input the xml to load
* @param $dom pass in a dom document or use null/omit if simpleXml should
* be used
* @static
* @access private
* @param string $input
* @param DOMDocument $dom
* @throws XmlEntityException
* @return SimpleXMLElement|DomDocument|boolean
*/
private static function scan($input, $dom = null)
{
try {
return Security::scan($input, $dom);
} catch(\ZendXml\Exception\RuntimeException $e) {
} catch(RuntimeException $e) {
throw new XmlEntityException($e->getMessage());
}
}
@ -72,8 +75,9 @@ class XmlParser
* Load HTML document by using a DomDocument instance or return false on failure.
*
* @static
* @param string $input XML content
* @return \DOMDocument
* @access public
* @param string $input XML content
* @return DOMDocument
*/
public static function getHtmlDocument($input)
{
@ -98,9 +102,8 @@ class XmlParser
* Convert a HTML document to XML.
*
* @static
*
* @param string $html HTML document
*
* @access public
* @param string $html HTML document
* @return string
*/
public static function htmlToXml($html)
@ -113,6 +116,7 @@ class XmlParser
* Get XML parser errors.
*
* @static
* @access public
* @return string
*/
public static function getErrors()
@ -135,7 +139,8 @@ class XmlParser
* Get the encoding from a xml tag.
*
* @static
* @param string $data Input data
* @access public
* @param string $data Input data
* @return string
*/
public static function getEncodingFromXmlTag($data)
@ -162,7 +167,8 @@ class XmlParser
* Get the charset from a meta tag.
*
* @static
* @param string $data Input data
* @access public
* @param string $data Input data
* @return string
*/
public static function getEncodingFromMetaTag($data)
@ -179,6 +185,8 @@ class XmlParser
/**
* Rewrite XPath query to use namespace-uri and local-name derived from prefix.
*
* @static
* @access public
* @param string $query XPath query
* @param array $ns Prefix to namespace URI mapping
* @return string
@ -199,10 +207,12 @@ class XmlParser
/**
* Get the result elements of a XPath query.
*
* @param \SimpleXMLElement $xml XML element
* @param string $query XPath query
* @param array $ns Prefix to namespace URI mapping
* @return \SimpleXMLElement[]
* @static
* @access public
* @param SimpleXMLElement $xml XML element
* @param string $query XPath query
* @param array $ns Prefix to namespace URI mapping
* @return SimpleXMLElement[]
*/
public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
{

View file

@ -0,0 +1,24 @@
<?php
return array(
'grabber' => array(
'%^/news.*%' => array(
'test_url' => 'http://www.adventuregamers.com/news/view/31079',
'body' => array(
'//div[@class="bodytext"]',
)
),
'%^/videos.*%' => array(
'test_url' => 'http://www.adventuregamers.com/videos/view/31056',
'body' => array(
'//iframe',
)
),
'%^/articles.*%' => array(
'test_url' => 'http://www.adventuregamers.com/articles/view/31049',
'body' => array(
'//div[@class="cleft"]',
)
)
),
);

View file

@ -0,0 +1,31 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://bigpicture.ru/?p=556658',
'body' => array(
'//div[@class="article container"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//h1',
'//*[@class="wp-smiley"]',
'//div[@class="ipmd"]',
'//div[@class="tags"]',
'//div[@class="social-button"]',
'//div[@class="bottom-share"]',
'//div[@class="raccoonbox"]',
'//div[@class="yndadvert"]',
'//div[@class="we-recommend"]',
'//div[@class="relap-bigpicture_ru-wrapper"]',
'//div[@id="mmail"]',
'//div[@id="mobile-ads-cut"]',
'//div[@id="liquidstorm-alt-html"]',
'//div[contains(@class, "post-tags")]',
'//*[contains(text(),"Смотрите также")]',
),
),
),
);

View file

@ -0,0 +1,22 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://e-w-e.ru/16-prekrasnyx-izobretenij-zhenshhin/',
'body' => array(
'//div[contains(@class, "post_text")]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="views_post"]',
'//*[@class="adman_mobile"]',
'//*[@class="adman_desctop"]',
'//*[contains(@rel, "nofollow")]',
'//*[contains(@class, "wp-smiley")]',
'//*[contains(text(),"Источник:")]',
),
),
),
);

View file

@ -0,0 +1,27 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.factroom.ru/life/20-facts-about-oil',
'body' => array(
'//div[@class="post"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//h1',
'//div[@id="yandex_ad2"]',
'//*[@class="jp-relatedposts"]',
'//div[contains(@class, "likely-desktop")]',
'//div[contains(@class, "likely-mobile")]',
'//p[last()]',
'//div[contains(@class, "facebook")]',
'//div[contains(@class, "desktop-underpost-direct")]',
'//div[contains(@class, "source-box")]',
'//div[contains(@class, "under-likely-desktop")]',
'//div[contains(@class, "mobile-down-post")]',
),
),
),
);

View file

@ -0,0 +1,19 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://fototelegraf.ru/?p=348232',
'body' => array(
'//div[@class="post-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="imageButtonsBlock"]',
'//div[@class="adOnPostBtwImg"]',
'//div[contains(@class, "post-tags")]',
),
),
),
);

View file

@ -6,7 +6,15 @@ return array(
'test_url' => 'http://www.golem.de/news/breko-telekom-verzoegert-gezielt-den-vectoring-ausbau-1311-102974.html',
'body' => array(
'//header[@class="cluster-header"]',
'//header[@class="paged-cluster-header"]',
'//div[@class="formatted"]',
),
'next_page' => array(
'//a[@id="atoc_next"]'
),
'strip' => array(
'//header[@class="cluster-header"]/a',
'//div[@id="iqadtile4"]',
),
),
),

View file

@ -0,0 +1,19 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://gorabbit.ru/article/10-oshchushcheniy-za-rulem-kogda-tolko-poluchil-voditelskie-prava',
'body' => array(
'//div[@class="detail_text"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="socials"]',
'//div[@id="cr_1"]',
'//div[@class="related_items"]',
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%^/news.*%' => array(
'test_url' => 'http://www.hardware.fr/news/14760/intel-lance-nouveaux-ssd-nand-3d.html',
'body' => array(
'//div[@class="content_actualite"]/div[@class="md"]',
)
),
),
);

View file

@ -0,0 +1,23 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'https://hotshowlife.com/top-10-chempionov-produktov-po-szhiganiyu-kalorij/',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//div[@class="ads2"]',
'//div[@class="mistape_caption"]',
'//div[contains(@class, "et_social_media_hidden")]',
'//div[contains(@class, "et_social_inline_bottom")]',
'//div[contains(@class, "avatar")]',
'//ul[contains(@class, "entry-tags")]',
'//div[contains(@class, "entry-meta")]',
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://indiehaven.com/no-mans-sky-is-a-solo-space-adventure-and-im-ok-with-that/',
'body' => array(
'//section[contains(@class, "entry-content")]',
)
),
),
);

View file

@ -0,0 +1,19 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://justcoolidea.ru/idealnyj-sad-samodelnye-proekty-dlya-berezhlivogo-domovladeltsa/',
'body' => array(
'//section[@class="entry-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[contains(@class, "essb_links")]',
'//*[contains(@rel, "nofollow")]',
'//*[contains(@class, "ads")]',
),
),
),
);

View file

@ -0,0 +1,23 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => array(
'http://www.legorafi.fr/2016/12/16/gorafi-magazine-bravo-vous-avez-bientot-presque-survecu-a-2016/',
'http://www.legorafi.fr/2016/12/15/manuel-valls-promet-quune-fois-elu-il-debarrassera-la-france-de-manuel-valls/',
),
'body' => array(
'//section[@id="banner_magazine"]',
'//figure[@class="main_picture"]',
'//div[@class="content"]',
),
'strip' => array(
'//figcaption',
'//div[@class="sharebox"]',
'//div[@class="tags"]',
'//section[@class="taboola_article"]',
),
),
),
);

View file

@ -0,0 +1,22 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://lifehacker.ru/2016/03/03/polymail/',
'body' => array(
'//div[@class="post-content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="wp-thumbnail-caption"]',
'//*[contains(@class, "social-likes")]',
'//*[@class="jp-relatedposts"]',
'//*[contains(@class, "wpappbox")]',
'//*[contains(@class, "icon__image")]',
'//div[@id="hypercomments_widget"]',
),
),
),
);

View file

@ -6,6 +6,7 @@ return array(
'test_url' => 'https://medium.com/lessons-learned/917b8b63ae3e',
'body' => array(
'//div[contains(@class, "post-field body")]',
'//div[contains(@class, "section-inner layoutSingleColumn")]',
),
'strip' => array(
),

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.monandroid.com/blog/tutoriel-avance-activer-le-stockage-fusionne-sur-android-6-marshamallow-t12.html',
'body' => array(
'//div[@class="blog-post-body"]',
),
'strip' => array(
),
),
),
);

View file

@ -3,7 +3,7 @@
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.monwindowsphone.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
'test_url' => 'http://www.monwindows.com/tout-savoir-sur-le-centre-d-action-de-windows-phone-8-1-t40574.html',
'body' => array(
'//div[@class="blog-post-body"]',
),

View file

@ -0,0 +1,21 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.moya-planeta.ru/travel/view/chto_yaponcu_horosho_russkomu_ne_ponyat_20432/',
'body' => array(
'//div[@class="full_object"]',
),
'strip' => array(
'//div[@class="full_object_panel object_panel"]',
'//div[@class="full_object_panel_geo object_panel"]',
'//div[@class="full_object_title"]',
'//div[@class="full_object_social_likes"]',
'//div[@class="full_object_planeta_likes"]',
'//div[@class="full_object_go2comments"]',
'//div[@id="yandex_ad_R-163191-3"]',
'//div[@class="full_object_shop_article_recommend"]',
),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.nat-geo.ru/fact/868093-knidos-antichnyy-naukograd/',
'body' => array(
'//div[@class="article-inner-text"]',
),
),
),
);

View file

@ -0,0 +1,19 @@
<?php
// Content-extraction rules for nextinpact.com articles (site taken from test_url below).
return array(
'grabber' => array(
// '%.*%' is a regex (with % delimiters) that matches any string, so this is the
// catch-all rule set; presumably it is matched against the article URL -- TODO confirm.
'%.*%' => array(
// Sample article URL; presumably used to exercise this rule -- TODO confirm.
'test_url' => 'http://www.nextinpact.com/news/101122-3d-nand-intel-lance-six-nouvelles-gammes-ssd-pour-tous-usages.htm',
// XPath expressions selecting the nodes that hold the article content.
'body' => array(
'//div[@class="container_article"]',
),
// XPath expressions for nodes to discard (presumably removed from the page
// before extraction -- confirm in the rule parser).
'strip' => array(
'//div[@class="infos_article"]',
'//div[@id="actu_auteur"]',
'//div[@id="soutenir_journaliste"]',
'//section[@id="bandeau_abonnez_vous"]',
// Matches every <br> element in the document, not only those inside the article.
'//br'
),
),
),
);

View file

@ -0,0 +1,24 @@
<?php
// Content-extraction rules for publy.ru posts (site taken from test_url below).
return array(
'grabber' => array(
// '%.*%' is a regex (with % delimiters) that matches any string, so this is the
// catch-all rule set; presumably it is matched against the article URL -- TODO confirm.
'%.*%' => array(
// Sample post URL; presumably used to exercise this rule -- TODO confirm.
'test_url' => 'http://www.publy.ru/post/19988',
// XPath expressions selecting the nodes that hold the post content.
'body' => array(
'//div[@class="singlepost"]',
),
// XPath expressions for nodes to discard (presumably removed from the page
// before extraction -- confirm in the rule parser).
'strip' => array(
// Non-content markup: scripts, forms and inline styles.
'//script',
'//form',
'//style',
// Exact @class matches: these only hit elements whose class attribute is
// exactly the given string.
'//*[@class="featured"]',
'//*[@class="toc_white no_bullets"]',
'//*[@class="toc_title"]',
'//*[@class="pba"]',
'//*[@class="comments"]',
// contains() is a substring test, so it also hits any class that merely
// includes "g-single" as part of a longer name.
'//*[contains(@class, "g-single")]',
'//*[@class="ts-fab-wrapper"]',
'//*[contains(@class, "wp_rp_wrap")]',
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
// Content-extraction rules for rockpapershotgun.com articles (site taken from test_url below).
return array(
'grabber' => array(
// '%.*%' is a regex (with % delimiters) that matches any string, so this is the
// catch-all rule set; presumably it is matched against the article URL -- TODO confirm.
'%.*%' => array(
// Sample article URL; presumably used to exercise this rule -- TODO confirm.
'test_url' => 'https://www.rockpapershotgun.com/2016/08/26/the-divisions-expansions-delayed-to-improve-the-game/',
// XPath expressions selecting the nodes that hold the article content.
// This rule set defines no 'strip' key.
'body' => array(
'//div[@class="entry"]',
)
),
),
);

View file

@ -3,15 +3,15 @@ return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.rugbyrama.fr/rugby/top-14/2015-2016/top-14-hayman-coupe-du-monde-finale-2012-lutte.-voici-levan-chilachava-toulon_sto5283863/story.shtml',
'body' => array(
'//div[@class="story-simple-content"]',
'body' => array(
'//div[@class="storyfull__content"]',
),
'strip' => array(
'//script',
'//form',
'//style',
'//*[@class="share-buttons"]',
'//*[@class="show-mobile-block"]',
'//*[@class="ad"]',
'//*[@class="hide-desktop"]',
'//*[@id="tracking_img"]',
)

View file

@ -1,9 +1,15 @@
<?php
return array(
'filter' => array(
'grabber' => array(
'%.*%' => array(
'%(<img.+)(\.png"/>)%' => '$1$2$1after$2',
'test_url' => 'http://www.smbc-comics.com/comic/the-troll-toll',
'body' => array(
'//div[@id="cc-comicbody"]',
'//div[@id="aftercomic"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,21 @@
<?php
// Content-extraction rules for takprosto.cc articles (site taken from test_url below).
return array(
'grabber' => array(
// '%.*%' is a regex (with % delimiters) that matches any string, so this is the
// catch-all rule set; presumably it is matched against the article URL -- TODO confirm.
'%.*%' => array(
// Sample article URL; presumably used to exercise this rule -- TODO confirm.
'test_url' => 'http://takprosto.cc/kokteyl-dlya-pohudeniya-v-domashnih-usloviyah/',
// XPath expressions selecting the nodes that hold the article content.
// NOTE(review): "entry-contentt" (double "t") may be a typo for "entry-content".
// contains() is a substring test, so this expression does NOT match an element
// whose class is just "entry-content" -- verify against the live page markup.
'body' => array(
'//div[contains(@class, "entry-contentt")]',
),
// XPath expressions for nodes to discard (presumably removed from the page
// before extraction -- confirm in the rule parser).
'strip' => array(
// Non-content markup: scripts, forms and inline styles.
'//script',
'//form',
'//style',
'//*[@class="views_post"]',
'//*[contains(@class, "mailchimp-box")]',
'//*[contains(@class, "essb_links")]',
// Hits every element whose rel attribute contains "nofollow", anywhere in the document.
'//*[contains(@rel, "nofollow")]',
// Substring test: also hits any class name that merely contains the letters
// "ads" (e.g. "heads", "loads") -- broad on purpose or by accident; verify.
'//*[contains(@class, "ads")]',
),
),
),
);

View file

@ -2,20 +2,16 @@
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.thelocal.se/20151018/swedish-moderates-tighten-focus-on-begging-ban',
'test_url' => 'www.thelocal.se/20161219/this-swede-can-memorize-hundreds-of-numbers-in-only-five-minutes',
'body' => array(
'//article',
'//div[@id="article-photo"]',
'//div[@id="article-description"]',
'//div[@id="article-body"]',
),
'strip' => array(
'//p[@id="mobile-signature"]',
'//article/div[4]',
'//article/ul[1]',
'//div[@class="clr"]',
'//p[@class="small"]',
'//p[@style="font-weight: bold; font-size: 14px;"]',
'//div[@class="author"]',
'//div[@class="ad_container"]',
'//div[@id="article-info-middle"]',
)
)
)
);

View file

@ -4,7 +4,7 @@ return array(
'%.*%' => array(
'test_url' => 'http://www.geekculture.com/joyoftech/joyarchives/2180.html',
'body' => array(
'//p[contains(@class,"Maintext")][2]/img',
'//p[contains(@class,"Maintext")][2]/a/img[contains(@src,"joyimages")]',
),
'strip' => array(),
),

View file

@ -1,14 +0,0 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.pcinpact.com/news/85954-air-france-ne-vous-demande-plus-deteindre-vos-appareils-electroniques.htm?utm_source=PCi_RSS_Feed&utm_medium=news&utm_campaign=pcinpact',
'body' => array(
'//div[contains(@id, "actu_content")]',
),
'strip' => array(
),
),
),
);

View file

@ -243,6 +243,16 @@ class CandidateParser implements ParserInterface
}
}
/**
 * Find link for next page of the article.
 *
 * This implementation performs no next-page detection and unconditionally
 * returns null.
 *
 * @return string|null Always null in this implementation
 */
public function findNextLink()
{
return null;
}
/**
* Return false if the node should not be removed.
*

View file

@ -10,4 +10,11 @@ interface ParserInterface
* @return string
*/
public function execute();
/**
 * Find link for next page of the article.
 *
 * @return string|null Link to the next page, or null when there is none
 */
public function findNextLink();
}

View file

@ -65,7 +65,6 @@ class RuleParser implements ParserInterface
public function findContent()
{
$content = '';
if (isset($this->rules['body']) && is_array($this->rules['body'])) {
foreach ($this->rules['body'] as $pattern) {
$nodes = $this->xpath->query($pattern);
@ -80,4 +79,24 @@ class RuleParser implements ParserInterface
return $content;
}
/**
 * Fetch next link based on Xpath rules.
 *
 * The "next_page" rules are tried in order. The first matched element that
 * carries a non-blank "href" attribute wins; every other match is skipped.
 *
 * @return string|null The href value as written in the document, or null when
 *                     there are no "next_page" rules or none yields a usable link
 */
public function findNextLink()
{
    if (! isset($this->rules['next_page']) || ! is_array($this->rules['next_page'])) {
        return null;
    }

    foreach ($this->rules['next_page'] as $pattern) {
        $nodes = $this->xpath->query($pattern);

        // query() returns false for a malformed expression; try the next rule.
        if ($nodes === false) {
            continue;
        }

        foreach ($nodes as $node) {
            // A pattern may select text or attribute nodes, which have no
            // getAttribute() method; calling it on them would be a fatal error.
            if (! $node instanceof \DOMElement) {
                continue;
            }

            // getAttribute() returns '' when the attribute is absent. Callers that
            // only compare the result against null would treat '' as a real link,
            // so a blank href is skipped instead of returned.
            $href = $node->getAttribute('href');

            if (trim($href) !== '') {
                return $href;
            }
        }
    }

    return null;
}
}

View file

@ -206,19 +206,31 @@ class Scraper extends Base
/**
* Execute the scraper.
*/
public function execute()
public function execute($pageContent = '', $recursionDepth = 0)
{
$this->content = '';
$this->html = '';
$this->encoding = '';
$this->content = '';
$this->download();
$this->prepareHtml();
$parser = $this->getParser();
if ($parser !== null) {
$this->content = $parser->execute();
$maxRecursions = $this->config->getMaxRecursions();
if(!isset($maxRecursions)){
$maxRecursions = 25;
}
$pageContent .= $parser->execute();
// check if there is a link to next page and recursively get content (max 25 pages)
if((($nextLink = $parser->findNextLink()) !== null) && $recursionDepth < $maxRecursions){
$nextLink = Url::resolve($nextLink,$this->url);
$this->setUrl($nextLink);
$this->execute($pageContent,$recursionDepth+1);
}
else{
$this->content = $pageContent;
}
Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}
}

View file

@ -56,7 +56,7 @@ abstract class FeedBuilder
/**
* @var ItemBuilder[]
*/
protected $items;
protected $items = array();
/**
* Constructor

View file

@ -36,7 +36,7 @@ class Rss20Helper
* @param DOMElement $element
* @param string $tag
* @param string $value
* @return AtomHelper
* @return $this
*/
public function buildNode(DOMElement $element, $tag, $value)
{
@ -52,7 +52,7 @@ class Rss20Helper
* @access public
* @param DOMElement $element
* @param string $title
* @return AtomHelper
* @return $this
*/
public function buildTitle(DOMElement $element, $title)
{
@ -66,7 +66,7 @@ class Rss20Helper
* @param DOMElement $element
* @param DateTime $date
* @param string $type
* @return AtomHelper
* @return $this
*/
public function buildDate(DOMElement $element, DateTime $date, $type = 'pubDate')
{
@ -79,7 +79,7 @@ class Rss20Helper
* @access public
* @param DOMElement $element
* @param string $url
* @return AtomHelper
* @return $this
*/
public function buildLink(DOMElement $element, $url)
{
@ -94,7 +94,7 @@ class Rss20Helper
* @param string $tag
* @param string $authorName
* @param string $authorEmail
* @return AtomHelper
* @return $this
*/
public function buildAuthor(DOMElement $element, $tag, $authorName, $authorEmail)
{