add composer's vendor directory

This commit is contained in:
Marcel Kapfer (mmk2410) 2016-05-07 12:59:40 +02:00
parent 01a3860d73
commit 60b094d5fa
745 changed files with 56017 additions and 1 deletions

21
vendor/fguillot/picofeed/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Frederic Guillot
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,34 @@
<?php
namespace PicoFeed;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
/**
 * Base class holding the configuration object used by its subclasses.
 *
 * @package PicoFeed
 * @author  Frederic Guillot
 */
abstract class Base
{
    /**
     * Configuration instance.
     *
     * @access protected
     * @var \PicoFeed\Config\Config
     */
    protected $config;

    /**
     * Store the given configuration (a default Config is created when none
     * is given) and pass its timezone to the logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance
     */
    public function __construct(Config $config = null)
    {
        if (!$config) {
            $config = new Config();
        }

        $this->config = $config;
        Logger::setTimezone($this->config->getTimezone());
    }
}

View file

@ -0,0 +1,673 @@
<?php
namespace PicoFeed\Client;
use LogicException;
use PicoFeed\Logging\Logger;
use PicoFeed\Config\Config;
/**
* Client class.
*
* @author Frederic Guillot
*/
abstract class Client
{
/**
* Flag indicating whether the resource has been modified.
*
* @var bool
*/
private $is_modified = true;
/**
* HTTP Content-Type.
*
* @var string
*/
private $content_type = '';
/**
* HTTP encoding.
*
* @var string
*/
private $encoding = '';
/**
* HTTP request headers.
*
* @var array
*/
protected $request_headers = array();
/**
* HTTP Etag header.
*
* @var string
*/
protected $etag = '';
/**
* HTTP Last-Modified header.
*
* @var string
*/
protected $last_modified = '';
/**
* Proxy hostname.
*
* @var string
*/
protected $proxy_hostname = '';
/**
* Proxy port.
*
* @var int
*/
protected $proxy_port = 3128;
/**
* Proxy username.
*
* @var string
*/
protected $proxy_username = '';
/**
* Proxy password.
*
* @var string
*/
protected $proxy_password = '';
/**
* Basic auth username.
*
* @var string
*/
protected $username = '';
/**
* Basic auth password.
*
* @var string
*/
protected $password = '';
/**
* Client connection timeout.
*
* @var int
*/
protected $timeout = 10;
/**
* User-agent.
*
* @var string
*/
protected $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* Real URL used (can be changed after a HTTP redirect).
*
* @var string
*/
protected $url = '';
/**
* Page/Feed content.
*
* @var string
*/
protected $content = '';
/**
* Maximum number of HTTP redirections (avoids infinite loops).
*
* @var int
*/
protected $max_redirects = 5;
/**
* Maximum size of the HTTP body response.
*
* @var int
*/
protected $max_body_size = 2097152; // 2MB
/**
* HTTP response status code.
*
* @var int
*/
protected $status_code = 0;
/**
* Enables direct passthrough to requesting client.
*
* @var bool
*/
protected $passthrough = false;
/**
 * Do the HTTP request.
 *
 * Implemented by each driver. execute() and its handlers read the
 * 'status', 'body' and 'headers' keys of the returned array.
 *
 * @abstract
 *
 * @return array
 */
abstract public function doRequest();
/**
 * Build the HTTP driver usable on this PHP installation.
 *
 * The cURL driver is returned when curl_init() exists; otherwise the stream
 * driver is returned when "allow_url_fopen" is enabled.
 *
 * @static
 *
 * @throws LogicException When neither driver can be used
 *
 * @return \PicoFeed\Client\Client
 */
public static function getInstance()
{
    if (function_exists('curl_init')) {
        return new Curl();
    }

    if (ini_get('allow_url_fopen')) {
        return new Stream();
    }

    throw new LogicException('You must have "allow_url_fopen=1" or curl extension installed');
}
/**
 * Set the extra HTTP headers sent with the request.
 *
 * The given list replaces any list set previously; the drivers merge it
 * after their own headers when preparing the request.
 *
 * @param array $headers
 */
public function setHeaders($headers)
{
$this->request_headers = $headers;
}
/**
 * Run the HTTP request and dispatch the response to the handlers.
 *
 * @param string $url URL to fetch; when empty, the URL already stored on the client is used
 *
 * @return Client
 */
public function execute($url = '')
{
    if ($url !== '') {
        $this->url = $url;
    }

    $caller = get_called_class();
    Logger::setMessage($caller.' Fetch URL: '.$this->url);
    Logger::setMessage($caller.' Etag provided: '.$this->etag);
    Logger::setMessage($caller.' Last-Modified provided: '.$this->last_modified);

    $response = $this->doRequest();
    $this->status_code = $response['status'];

    // Handlers run in this fixed order: cache validation, errors, content.
    $this->handleNotModifiedResponse($response);
    $this->handleErrorResponse($response);
    $this->handleNormalResponse($response);

    return $this;
}
/**
 * Update the "modified" flag and the cache validators from the response.
 *
 * A 304 marks the resource as not modified. A 200 compares the response
 * validators with the ones provided, then stores the new ETag and
 * Last-Modified values.
 *
 * @param array $response Client response
 */
protected function handleNotModifiedResponse(array $response)
{
    switch ($response['status']) {
        case 304:
            $this->is_modified = false;
            break;
        case 200:
            $this->is_modified = $this->hasBeenModified($response, $this->etag, $this->last_modified);
            $this->etag = $this->getHeader($response, 'ETag');
            $this->last_modified = $this->getHeader($response, 'Last-Modified');
            break;
    }

    if ($this->is_modified === false) {
        Logger::setMessage(get_called_class().' Resource not modified');
    }
}
/**
 * Turn the 401, 403 and 404 status codes into exceptions.
 *
 * @param array $response Client response
 *
 * @throws UnauthorizedException On status 401
 * @throws ForbiddenException    On status 403
 * @throws InvalidUrlException   On status 404
 */
protected function handleErrorResponse(array $response)
{
    switch ($response['status']) {
        case 401:
            throw new UnauthorizedException('Wrong or missing credentials');
        case 403:
            throw new ForbiddenException('Not allowed to access resource');
        case 404:
            throw new InvalidUrlException('Resource not found');
    }
}
/**
 * Store the body, content type and charset of a 200 response.
 *
 * Any other status leaves the client state untouched.
 *
 * @param array $response Client response
 */
protected function handleNormalResponse(array $response)
{
    if ($response['status'] != 200) {
        return;
    }

    $this->content = $response['body'];
    $this->content_type = $this->findContentType($response);
    $this->encoding = $this->findCharset();
}
/**
 * Tell whether the response differs from the cached validators.
 *
 * Each of the Etag / Last-Modified headers present in the response is
 * compared with the provided value. Any mismatch means "modified"; when at
 * least one header is present and all present ones match, the resource is
 * considered unchanged. With no validator in the response it is treated as
 * modified.
 *
 * @param array  $response
 * @param string $etag
 * @param string $lastModified
 *
 * @return bool
 */
private function hasBeenModified($response, $etag, $lastModified)
{
    $expected = array(
        'Etag' => $etag,
        'Last-Modified' => $lastModified,
    );

    $matched = 0;

    foreach ($expected as $name => $known) {
        if (!isset($response['headers'][$name])) {
            continue;
        }

        if ($response['headers'][$name] !== $known) {
            return true;
        }

        ++$matched;
    }

    return $matched === 0;
}
/**
 * Read the Content-Type header of a response, lower-cased.
 *
 * @param array $response Client response
 *
 * @return string Empty string when the header is missing
 */
public function findContentType(array $response)
{
    $content_type = $this->getHeader($response, 'Content-Type');

    return strtolower($content_type);
}
/**
 * Find charset from the stored Content-Type value.
 *
 * Only the charset token itself is returned: surrounding quotes and any
 * parameter following it (e.g. `charset="utf-8"; boundary=x`) are dropped.
 *
 * @return string Charset, or an empty string when none is declared
 */
public function findCharset()
{
    if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/', $this->content_type, $matches)) {
        return $matches[1];
    }

    return '';
}
/**
 * Read one header from a client response.
 *
 * @param array  $response Client response
 * @param string $header   Header name
 *
 * @return string Header value, or an empty string when it is not set
 */
public function getHeader(array $response, $header)
{
    if (isset($response['headers'][$header])) {
        return $response['headers'][$header];
    }

    return '';
}
/**
 * Set the Last-Modified value known for the resource.
 *
 * When non-empty, the drivers send it as the If-Modified-Since request header.
 *
 * @param string $last_modified Header value
 *
 * @return \PicoFeed\Client\Client
 */
public function setLastModified($last_modified)
{
$this->last_modified = $last_modified;
return $this;
}
/**
 * Get the Last-Modified value: the one given to setLastModified(), or the
 * one read from the response after a request that returned a 200.
 *
 * @return string
 */
public function getLastModified()
{
return $this->last_modified;
}
/**
 * Set the Etag value known for the resource.
 *
 * When non-empty, the drivers send it as the If-None-Match request header.
 *
 * @param string $etag Etag HTTP header value
 *
 * @return \PicoFeed\Client\Client
 */
public function setEtag($etag)
{
$this->etag = $etag;
return $this;
}
/**
 * Get the Etag value: the one given to setEtag(), or the one read from the
 * response after a request that returned a 200.
 *
 * @return string
 */
public function getEtag()
{
return $this->etag;
}
/**
 * Get the final url value (the drivers update it after a redirection).
 *
 * @return string
 */
public function getUrl()
{
return $this->url;
}
/**
 * Set the url.
 *
 * @param string $url URL to fetch
 *
 * @return \PicoFeed\Client\Client
 */
public function setUrl($url)
{
$this->url = $url;
return $this;
}
/**
 * Get the HTTP response status code of the last executed request
 * (0 until a request has been executed).
 *
 * @return int
 */
public function getStatusCode()
{
return $this->status_code;
}
/**
 * Get the body of the HTTP response.
 *
 * The body is only stored for responses with a 200 status.
 *
 * @return string
 */
public function getContent()
{
return $this->content;
}
/**
 * Get the content type value from HTTP headers (stored lower-cased).
 *
 * @return string
 */
public function getContentType()
{
return $this->content_type;
}
/**
 * Get the encoding value from HTTP headers, i.e. the charset found in the
 * Content-Type header of the last 200 response.
 *
 * @return string
 */
public function getEncoding()
{
return $this->encoding;
}
/**
 * Return true if the remote resource has changed.
 *
 * The flag defaults to true and is updated on 304 and 200 responses.
 *
 * @return bool
 */
public function isModified()
{
return $this->is_modified;
}
/**
 * Return true if passthrough mode is enabled.
 *
 * @return bool
 */
public function isPassthroughEnabled()
{
return $this->passthrough;
}
/**
 * Override the connection timeout; a falsy value keeps the current one.
 *
 * @param int $timeout Connection timeout
 *
 * @return \PicoFeed\Client\Client
 */
public function setTimeout($timeout)
{
    if ($timeout) {
        $this->timeout = $timeout;
    }

    return $this;
}
/**
 * Override the user agent; a falsy value keeps the current one.
 *
 * @param string $user_agent User Agent
 *
 * @return \PicoFeed\Client\Client
 */
public function setUserAgent($user_agent)
{
    if ($user_agent) {
        $this->user_agent = $user_agent;
    }

    return $this;
}
/**
 * Override the maximum number of HTTP redirections; a falsy value (0
 * included) keeps the current one.
 *
 * @param int $max Maximum
 *
 * @return \PicoFeed\Client\Client
 */
public function setMaxRedirections($max)
{
    if ($max) {
        $this->max_redirects = $max;
    }

    return $this;
}
/**
 * Override the maximum size of the HTTP body; a falsy value keeps the
 * current one.
 *
 * @param int $max Maximum
 *
 * @return \PicoFeed\Client\Client
 */
public function setMaxBodySize($max)
{
    if ($max) {
        $this->max_body_size = $max;
    }

    return $this;
}
/**
 * Override the proxy hostname; a falsy value keeps the current one.
 *
 * @param string $hostname Proxy hostname
 *
 * @return \PicoFeed\Client\Client
 */
public function setProxyHostname($hostname)
{
    if ($hostname) {
        $this->proxy_hostname = $hostname;
    }

    return $this;
}
/**
 * Override the proxy port; a falsy value keeps the current one.
 *
 * @param int $port Proxy port
 *
 * @return \PicoFeed\Client\Client
 */
public function setProxyPort($port)
{
    if ($port) {
        $this->proxy_port = $port;
    }

    return $this;
}
/**
 * Override the proxy username; a falsy value keeps the current one.
 *
 * @param string $username Proxy username
 *
 * @return \PicoFeed\Client\Client
 */
public function setProxyUsername($username)
{
    if ($username) {
        $this->proxy_username = $username;
    }

    return $this;
}
/**
 * Override the proxy password; a falsy value keeps the current one.
 *
 * @param string $password Password
 *
 * @return \PicoFeed\Client\Client
 */
public function setProxyPassword($password)
{
    if ($password) {
        $this->proxy_password = $password;
    }

    return $this;
}
/**
 * Override the Basic Auth username; a falsy value keeps the current one.
 *
 * @param string $username Basic Auth username
 *
 * @return \PicoFeed\Client\Client
 */
public function setUsername($username)
{
    if ($username) {
        $this->username = $username;
    }

    return $this;
}
/**
 * Override the Basic Auth password; a falsy value keeps the current one.
 *
 * @param string $password Basic Auth Password
 *
 * @return \PicoFeed\Client\Client
 */
public function setPassword($password)
{
    if ($password) {
        $this->password = $password;
    }

    return $this;
}
/**
 * Enable the passthrough mode.
 *
 * In this mode the drivers output the response body directly instead of
 * buffering it.
 *
 * @return \PicoFeed\Client\Client
 */
public function enablePassthroughMode()
{
$this->passthrough = true;
return $this;
}
/**
 * Disable the passthrough mode (this is the default state).
 *
 * @return \PicoFeed\Client\Client
 */
public function disablePassthroughMode()
{
$this->passthrough = false;
return $this;
}
/**
 * Copy the client-related settings of a Config object onto this client.
 *
 * Each value goes through the matching setter, so a falsy value in the
 * config leaves the current client value unchanged.
 *
 * @param \PicoFeed\Config\Config $config Config instance
 *
 * @return \PicoFeed\Client\Client
 */
public function setConfig(Config $config)
{
    if ($config === null) {
        return $this;
    }

    $this->setTimeout($config->getClientTimeout());
    $this->setUserAgent($config->getClientUserAgent());
    $this->setMaxRedirections($config->getMaxRedirections());
    $this->setMaxBodySize($config->getMaxBodySize());
    $this->setProxyHostname($config->getProxyHostname());
    $this->setProxyPort($config->getProxyPort());
    $this->setProxyUsername($config->getProxyUsername());
    $this->setProxyPassword($config->getProxyPassword());

    return $this;
}
/**
 * Return true if the HTTP status code is a redirection that must be
 * followed: 301, 302, 303, 307 or 308.
 *
 * @param int $code HTTP status code
 *
 * @return bool
 */
public function isRedirection($code)
{
    return $code == 301
        || $code == 302
        || $code == 303
        || $code == 307
        || $code == 308; // Permanent Redirect
}
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\PicoFeedException;
/**
 * ClientException Exception.
 *
 * Parent class of the exceptions declared in the PicoFeed\Client namespace.
 *
 * @author Frederic Guillot
 */
abstract class ClientException extends PicoFeedException
{
}

View file

@ -0,0 +1,386 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\Logging\Logger;
/**
* cURL HTTP client.
*
* @author Frederic Guillot
*/
class Curl extends Client
{
/**
* HTTP response body.
*
* @var string
*/
private $body = '';
/**
* Body size.
*
* @var int
*/
private $body_length = 0;
/**
* HTTP response headers.
*
* @var array
*/
private $response_headers = array();
/**
* Counter on the number of header received.
*
* @var int
*/
private $response_headers_count = 0;
/**
 * cURL write callback: buffer the HTTP body chunk by chunk.
 *
 * Returning -1 makes cURL stop reading the response; this happens as soon
 * as the accumulated size exceeds the configured maximum body size.
 *
 * @param resource $ch     cURL handler
 * @param string   $buffer Chunk of data
 *
 * @return int Length of the buffer, or -1 to abort
 */
public function readBody($ch, $buffer)
{
    $chunk_size = strlen($buffer);
    $this->body_length += $chunk_size;

    if ($this->body_length <= $this->max_body_size) {
        $this->body .= $buffer;

        return $chunk_size;
    }

    return -1;
}
/**
 * cURL header callback: collect header lines, one group per response.
 *
 * A blank line ends the current group and moves to the next one; any other
 * line is appended to the current group.
 *
 * @param resource $ch     cURL handler
 * @param string   $buffer Header line
 *
 * @return int Length of the buffer
 */
public function readHeaders($ch, $buffer)
{
    $length = strlen($buffer);
    $is_separator = $buffer === "\r\n" || $buffer === "\n";

    if ($is_separator) {
        ++$this->response_headers_count;

        return $length;
    }

    $group = $this->response_headers_count;

    if (!isset($this->response_headers[$group])) {
        $this->response_headers[$group] = '';
    }

    $this->response_headers[$group] .= $buffer;

    return $length;
}
/**
 * cURL callback to passthrough the HTTP body to the client.
 *
 * On the first chunk the last collected header group is parsed: a
 * redirection is followed, otherwise the status code and the Content-Type
 * are forwarded with header(). Every chunk is then echoed.
 *
 * If the function return -1, curl stop to read the HTTP response
 *
 * @param resource $ch cURL handler
 * @param string $buffer Chunk of data
 *
 * @return int Length of the buffer
 */
public function passthroughBody($ch, $buffer)
{
// do it only at the beginning of a transmission
if ($this->body_length === 0) {
list($status, $headers) = HttpHeaders::parse(explode("\n", $this->response_headers[$this->response_headers_count - 1]));
if ($this->isRedirection($status)) {
// NOTE(review): handleRedirection() is documented as returning an array,
// so this branch does not return a byte count - confirm how cURL treats
// that value coming from a write callback.
return $this->handleRedirection($headers['Location']);
}
// The third argument of header() forces the HTTP response code.
header(':', true, $status);
if (isset($headers['Content-Type'])) {
header('Content-Type:' .$headers['Content-Type']);
}
}
$length = strlen($buffer);
$this->body_length += $length;
echo $buffer;
return $length;
}
/**
 * Build the list of HTTP request headers.
 *
 * Conditional-request headers are added only when an Etag or a
 * Last-Modified value is set; the caller-provided headers come last.
 *
 * @return string[]
 */
private function prepareHeaders()
{
    $lines = array('Connection: close');

    if ($this->etag) {
        $lines[] = 'If-None-Match: '.$this->etag;
    }

    if ($this->last_modified) {
        $lines[] = 'If-Modified-Since: '.$this->last_modified;
    }

    return array_merge($lines, $this->request_headers);
}
/**
 * Prepare curl proxy context.
 *
 * Nothing is configured when no proxy hostname is set. Proxy credentials
 * are only sent when a proxy username is set.
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareProxyContext($ch)
{
    if (!$this->proxy_hostname) {
        return $ch;
    }

    Logger::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);

    curl_setopt($ch, CURLOPT_PROXYPORT, $this->proxy_port);
    // CURLOPT_PROXYTYPE expects one of the CURLPROXY_* integer constants,
    // not a protocol name.
    curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
    curl_setopt($ch, CURLOPT_PROXY, $this->proxy_hostname);

    if ($this->proxy_username) {
        Logger::setMessage(get_called_class().' Proxy credentials: Yes');
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->proxy_username.':'.$this->proxy_password);
    } else {
        Logger::setMessage(get_called_class().' Proxy credentials: No');
    }

    return $ch;
}
/**
 * Add the Basic Auth credentials to the cURL handle.
 *
 * Credentials are only set when both the username and the password are
 * non-empty.
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareAuthContext($ch)
{
    $has_credentials = $this->username && $this->password;

    if ($has_credentials) {
        curl_setopt($ch, CURLOPT_USERPWD, $this->username.':'.$this->password);
    }

    return $ch;
}
/**
 * Register the body and header callbacks on the cURL handle.
 *
 * The body goes to passthroughBody() in passthrough mode and to readBody()
 * otherwise; headers always go to readHeaders().
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareDownloadMode($ch)
{
    $body_callback = $this->isPassthroughEnabled() ? 'passthroughBody' : 'readBody';

    curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, $body_callback));
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));

    return $ch;
}
/**
 * Prepare curl context.
 *
 * Creates the handle, applies the request options (URL, timeouts, user
 * agent, headers), then delegates to the download-mode, proxy and auth
 * helpers.
 *
 * @return resource
 */
private function prepareContext()
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
// The same value is used for the whole transfer and for the connection phase.
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders());
// Redirections are followed manually by doRequest()/handleRedirection().
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_ENCODING, '');
// NOTE(review): 'php://memory' is a PHP stream wrapper name; confirm that
// libcurl does not treat it as a plain file name for the cookie jar/file.
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
// Disable SSLv3 by enforcing TLSv1.x for curl >= 7.34.0 and < 7.39.0.
// Versions prior to 7.34 and at least when compiled against openssl
// interpret this parameter as "limit to TLSv1.0" which fails for sites
// which enforce TLS 1.1+.
// Starting with curl 7.39.0 SSLv3 is disabled by default.
$version = curl_version();
// 467456 = 0x072200 (7.34.0), 468736 = 0x072700 (7.39.0)
if ($version['version_number'] >= 467456 && $version['version_number'] < 468736) {
curl_setopt($ch, CURLOPT_SSLVERSION, 1);
}
$ch = $this->prepareDownloadMode($ch);
$ch = $this->prepareProxyContext($ch);
$ch = $this->prepareAuthContext($ch);
return $ch;
}
/**
 * Run the cURL transfer, log its timings and store the effective URL.
 *
 * When cURL reports an error the handle is closed and handleError() throws
 * the matching exception.
 */
private function executeContext()
{
    $ch = $this->prepareContext();
    curl_exec($ch);

    $caller = get_called_class();
    $metrics = array(
        ' cURL total time: ' => CURLINFO_TOTAL_TIME,
        ' cURL dns lookup time: ' => CURLINFO_NAMELOOKUP_TIME,
        ' cURL connect time: ' => CURLINFO_CONNECT_TIME,
        ' cURL speed download: ' => CURLINFO_SPEED_DOWNLOAD,
        ' cURL effective url: ' => CURLINFO_EFFECTIVE_URL,
    );

    foreach ($metrics as $label => $info) {
        Logger::setMessage($caller.$label.curl_getinfo($ch, $info));
    }

    $error_code = curl_errno($ch);

    if ($error_code) {
        Logger::setMessage($caller.' cURL error: '.curl_error($ch));
        curl_close($ch);
        $this->handleError($error_code);
    }

    // Update the url if there where redirects
    $this->url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);
}
/**
 * Number of redirections followed for the request chain in progress.
 *
 * Kept on the instance because doRequest() and handleRedirection() call
 * each other: a local counter would restart at zero on every hop and the
 * limit would never be reached.
 *
 * @var int
 */
private $nb_redirects = 0;

/**
 * Do the HTTP request.
 *
 * A redirection that carries a Location header is followed through
 * handleRedirection(); any other response is returned as is.
 *
 * @throws MaxRedirectException When too many redirections are chained
 *
 * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
 */
public function doRequest()
{
    $this->executeContext();

    list($status, $headers) = HttpHeaders::parse(explode("\n", $this->response_headers[$this->response_headers_count - 1]));

    // A redirection without Location cannot be followed: hand it back to
    // the caller instead of resolving a missing URL.
    if ($this->isRedirection($status) && isset($headers['Location'])) {
        return $this->handleRedirection($headers['Location']);
    }

    // End of the chain: the next request starts with a fresh budget.
    $this->nb_redirects = 0;

    return array(
        'status' => $status,
        'body' => $this->body,
        'headers' => $headers,
    );
}

/**
 * Handle HTTP redirects
 *
 * Points the client to the redirected URL, clears the buffers of the
 * previous response and performs the request again, as long as the
 * redirection limit is not reached.
 *
 * @param string $location Redirected URL
 *
 * @throws MaxRedirectException When the redirection limit is reached
 *
 * @return array
 */
private function handleRedirection($location)
{
    $this->url = Url::resolve($location, $this->url);
    $this->body = '';
    $this->body_length = 0;
    $this->response_headers = array();
    $this->response_headers_count = 0;

    ++$this->nb_redirects;

    if ($this->nb_redirects >= $this->max_redirects) {
        $this->nb_redirects = 0;
        throw new MaxRedirectException('Maximum number of redirections reached');
    }

    try {
        return $this->doRequest();
    } catch (\Exception $e) {
        // Do not leave a stale count behind when the chain is aborted.
        $this->nb_redirects = 0;
        throw $e;
    }
}
/**
 * Translate a cURL error code into the matching exception.
 *
 * Numeric codes are used instead of the CURLE_* constants because those
 * are not necessary always available (depends of the version of libcurl
 * linked to php). This method always throws.
 *
 * @see http://curl.haxx.se/libcurl/c/libcurl-errors.html
 *
 * @param int $errno cURL error code
 */
private function handleError($errno)
{
    if ($errno == 78) { // CURLE_REMOTE_FILE_NOT_FOUND
        throw new InvalidUrlException('Resource not found', $errno);
    }

    if ($errno == 6) { // CURLE_COULDNT_RESOLVE_HOST
        throw new InvalidUrlException('Unable to resolve hostname', $errno);
    }

    if ($errno == 7) { // CURLE_COULDNT_CONNECT
        throw new InvalidUrlException('Unable to connect to the remote host', $errno);
    }

    if ($errno == 23) { // CURLE_WRITE_ERROR
        throw new MaxSizeException('Maximum response size exceeded', $errno);
    }

    if ($errno == 28) { // CURLE_OPERATION_TIMEDOUT
        throw new TimeoutException('Operation timeout', $errno);
    }

    // SSL related codes: CURLE_SSL_CONNECT_ERROR (35),
    // CURLE_PEER_FAILED_VERIFICATION (51), CURLE_SSL_CERTPROBLEM (58),
    // CURLE_SSL_CACERT (60), CURLE_SSL_CIPHER (59), CURLE_USE_SSL_FAILED (64),
    // CURLE_SSL_ENGINE_INITFAILED (66), CURLE_SSL_CACERT_BADFILE (77),
    // CURLE_SSL_ISSUER_ERROR (83)
    if (in_array($errno, array(35, 51, 58, 60, 59, 64, 66, 77, 83))) {
        $msg = 'Invalid SSL certificate caused by CURL error number ' .
            $errno;
        throw new InvalidCertificateException($msg, $errno);
    }

    if ($errno == 47) { // CURLE_TOO_MANY_REDIRECTS
        throw new MaxRedirectException('Maximum number of redirections reached', $errno);
    }

    if ($errno == 63) { // CURLE_FILESIZE_EXCEEDED
        throw new MaxSizeException('Maximum response size exceeded', $errno);
    }

    throw new InvalidUrlException('Unable to fetch the URL', $errno);
}
}

View file

@ -0,0 +1,10 @@
<?php
namespace PicoFeed\Client;
/**
 * ForbiddenException Exception.
 *
 * Thrown by the client when the server answers with a 403 status.
 *
 * @author Bernhard Posselt
 */
class ForbiddenException extends ClientException
{
}

View file

@ -0,0 +1,79 @@
<?php
namespace PicoFeed\Client;
use ArrayAccess;
use PicoFeed\Logging\Logger;
/**
 * Class to handle HTTP headers case insensitivity.
 *
 * Header names are lower-cased on every write and lookup, so
 * $headers['ETag'] and $headers['etag'] address the same entry.
 *
 * @author Bernhard Posselt
 * @author Frederic Guillot
 */
class HttpHeaders implements ArrayAccess
{
    /**
     * Header values indexed by lower-cased header name.
     *
     * @var array
     */
    private $headers = array();

    /**
     * @param array $headers Header values indexed by header name
     */
    public function __construct(array $headers)
    {
        foreach ($headers as $key => $value) {
            $this->headers[strtolower($key)] = $value;
        }
    }

    /**
     * Get a header value.
     *
     * @param string $offset Header name
     *
     * @return string|null Null when the header is not set
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        $key = strtolower($offset);

        return isset($this->headers[$key]) ? $this->headers[$key] : null;
    }

    /**
     * Set a header value.
     *
     * @param string $offset Header name
     * @param string $value  Header value
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        $this->headers[strtolower($offset)] = $value;
    }

    /**
     * Tell whether a header is set.
     *
     * @param string $offset Header name
     *
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset)
    {
        return isset($this->headers[strtolower($offset)]);
    }

    /**
     * Remove a header.
     *
     * @param string $offset Header name
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        unset($this->headers[strtolower($offset)]);
    }

    /**
     * Parse HTTP headers.
     *
     * Each status line resets the collected headers, so only the headers of
     * the last response in $lines are returned. Headers with an empty value
     * are skipped.
     *
     * @static
     *
     * @param array $lines List of headers
     *
     * @return array [status code, HttpHeaders instance]
     */
    public static function parse(array $lines)
    {
        $status = 0;
        $headers = array();

        foreach ($lines as $line) {
            if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})#', $line, $matches)) {
                $headers = array();
                $status = (int) $matches[1];
            } elseif (strpos($line, ':') !== false) {
                // Split on the first colon only: values such as URLs or
                // dates contain colons themselves.
                list($name, $value) = explode(':', $line, 2);
                $name = trim($name);
                $value = trim($value);

                // Compare with '' so that a legitimate "0" value is kept.
                if ($name !== '' && $value !== '') {
                    $headers[$name] = $value;
                }
            }
        }

        Logger::setMessage(get_called_class().' HTTP status code: '.$status);

        foreach ($headers as $name => $value) {
            Logger::setMessage(get_called_class().' HTTP header: '.$name.' => '.$value);
        }

        return array($status, new self($headers));
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
 * InvalidCertificateException Exception.
 *
 * Thrown by the cURL driver for SSL related cURL error codes.
 *
 * @author Frederic Guillot
 */
class InvalidCertificateException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
 * InvalidUrlException Exception.
 *
 * Thrown when the resource cannot be fetched: 404 status, connection that
 * cannot be established, or a cURL error without a dedicated exception.
 *
 * @author Frederic Guillot
 */
class InvalidUrlException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
 * MaxRedirectException Exception.
 *
 * Thrown when the maximum number of redirections is reached.
 *
 * @author Frederic Guillot
 */
class MaxRedirectException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
 * MaxSizeException Exception.
 *
 * Thrown when the response body is larger than the maximum body size.
 *
 * @author Frederic Guillot
 */
class MaxSizeException extends ClientException
{
}

View file

@ -0,0 +1,201 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\Logging\Logger;
/**
* Stream context HTTP client.
*
* @author Frederic Guillot
*/
class Stream extends Client
{
/**
 * Build the list of HTTP request headers for the stream context.
 *
 * Optional headers (gzip negotiation, cache validators, proxy and basic
 * authentication) are added only when the matching setting is present; the
 * caller-provided headers come last.
 *
 * @return string[]
 */
private function prepareHeaders()
{
    $lines = array(
        'Connection: close',
        'User-Agent: '.$this->user_agent,
    );

    // disable compression in passthrough mode. It could result in double
    // compressed content which isn't decodeable by browsers
    $can_gzip = function_exists('gzdecode') && !$this->isPassthroughEnabled();

    if ($can_gzip) {
        $lines[] = 'Accept-Encoding: gzip';
    }

    if ($this->etag) {
        $lines[] = 'If-None-Match: '.$this->etag;
    }

    if ($this->last_modified) {
        $lines[] = 'If-Modified-Since: '.$this->last_modified;
    }

    if ($this->proxy_username) {
        $proxy_credentials = base64_encode($this->proxy_username.':'.$this->proxy_password);
        $lines[] = 'Proxy-Authorization: Basic '.$proxy_credentials;
    }

    if ($this->username && $this->password) {
        $credentials = base64_encode($this->username.':'.$this->password);
        $lines[] = 'Authorization: Basic '.$credentials;
    }

    return array_merge($lines, $this->request_headers);
}
/**
 * Construct the final URL from location headers.
 *
 * Every Location header found is resolved against the current URL, in
 * order, so the URL ends up pointing at the last redirection target.
 *
 * @param array $headers List of HTTP response header
 */
private function setEffectiveUrl($headers)
{
    foreach ($headers as $header) {
        // Capture everything after "Location:" so that URLs containing
        // ": " are kept whole and a missing space after the colon is fine.
        if (preg_match('/^Location:\s*(.+)$/i', trim($header), $matches)) {
            $this->url = Url::resolve($matches[1], $this->url);
        }
    }
}
/**
 * Build the options array given to stream_context_create().
 *
 * Proxy options are only added when a proxy hostname is set.
 *
 * @return array
 */
private function prepareContext()
{
    $http = array(
        'method' => 'GET',
        'protocol_version' => 1.1,
        'timeout' => $this->timeout,
        'max_redirects' => $this->max_redirects,
    );

    if ($this->proxy_hostname) {
        $proxy_address = $this->proxy_hostname.':'.$this->proxy_port;

        Logger::setMessage(get_called_class().' Proxy: '.$proxy_address);

        $http['proxy'] = 'tcp://'.$proxy_address;
        $http['request_fulluri'] = true;

        $credentials_message = $this->proxy_username ? ' Proxy credentials: Yes' : ' Proxy credentials: No';
        Logger::setMessage(get_called_class().$credentials_message);
    }

    $http['header'] = implode("\r\n", $this->prepareHeaders());

    return array('http' => $http);
}
/**
 * Do the HTTP request.
 *
 * In passthrough mode the status code and the Content-Type are forwarded
 * and the body is sent straight to the output. Otherwise the body is read
 * up to the maximum body size and decoded according to the response
 * headers.
 *
 * @throws InvalidUrlException When the stream cannot be opened
 * @throws MaxSizeException    When the body exceeds the maximum body size
 * @throws TimeoutException    When the stream timed out while reading
 *
 * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
 */
public function doRequest()
{
    $body = '';

    // Create context
    $context = stream_context_create($this->prepareContext());

    // Make HTTP request
    $stream = @fopen($this->url, 'r', false, $context);

    if (!is_resource($stream)) {
        throw new InvalidUrlException('Unable to establish a connection');
    }

    // Get HTTP headers response
    $metadata = stream_get_meta_data($stream);
    list($status, $headers) = HttpHeaders::parse($metadata['wrapper_data']);

    if ($this->isPassthroughEnabled()) {
        header(':', true, $status);

        if (isset($headers['Content-Type'])) {
            header('Content-Type: '.$headers['Content-Type']);
        }

        fpassthru($stream);
    } else {
        // Get the entire body until the max size
        $body = stream_get_contents($stream, $this->max_body_size + 1);

        // The "timed_out" flag describes the last read, so it has to be
        // fetched after the body has been read.
        $read_metadata = stream_get_meta_data($stream);

        // If the body size is too large abort everything
        if (strlen($body) > $this->max_body_size) {
            fclose($stream);
            throw new MaxSizeException('Content size too large');
        }

        if (!empty($read_metadata['timed_out'])) {
            fclose($stream);
            throw new TimeoutException('Operation timeout');
        }
    }

    fclose($stream);

    $this->setEffectiveUrl($metadata['wrapper_data']);

    return array(
        'status' => $status,
        'body' => $this->decodeBody($body, $headers),
        'headers' => $headers,
    );
}
/**
 * Undo the transfer and content encodings announced by the headers.
 *
 * A "chunked" Transfer-Encoding is decoded first, then a "gzip"
 * Content-Encoding.
 *
 * @param string      $body    Raw body
 * @param HttpHeaders $headers HTTP headers
 *
 * @return string
 */
public function decodeBody($body, HttpHeaders $headers)
{
    $is_chunked = isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked';

    if ($is_chunked) {
        $body = $this->decodeChunked($body);
    }

    $is_gzipped = isset($headers['Content-Encoding']) && $headers['Content-Encoding'] === 'gzip';

    if ($is_gzipped) {
        $body = gzdecode($body);
    }

    return $body;
}
/**
 * Reassemble a body sent with chunked transfer encoding.
 *
 * Each round reads the hexadecimal chunk length up to the first CRLF,
 * copies that many bytes to the result and continues with the trimmed
 * remainder.
 *
 * @param string $str Raw body
 *
 * @return string Decoded body
 */
public function decodeChunked($str)
{
    $decoded = '';

    while (!empty($str)) {
        // Get the chunk length
        $eol = strpos($str, "\r\n");
        $size = hexdec(substr($str, 0, $eol));

        // Append the chunk to the result
        $decoded .= substr($str, $eol + 2, $size);

        $str = trim(substr($str, $eol + 2 + $size));
    }

    return $decoded;
}
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
 * TimeoutException Exception.
 *
 * Thrown by the drivers when the operation timed out.
 *
 * @author Frederic Guillot
 */
class TimeoutException extends ClientException
{
}

View file

@ -0,0 +1,10 @@
<?php
namespace PicoFeed\Client;
/**
 * UnauthorizedException Exception.
 *
 * Thrown by the client when the server answers with a 401 status.
 *
 * @author Bernhard Posselt
 */
class UnauthorizedException extends ClientException
{
}

View file

@ -0,0 +1,290 @@
<?php
namespace PicoFeed\Client;
/**
* URL class.
*
* @author Frederic Guillot
*/
class Url
{
/**
* URL.
*
* @var string
*/
private $url = '';
/**
* URL components.
*
* @var array
*/
private $components = array();
/**
 * Parse the URL into its components.
 *
 * On PHP < 5.4.7 the host of a protocol relative url is extracted from the
 * parsed path by hand.
 *
 * @param string $url URL
 */
public function __construct($url)
{
    $this->url = $url;

    $parsed = parse_url($url);
    $this->components = $parsed ? $parsed : array();

    // Issue with PHP < 5.4.7 and protocol relative url
    if (version_compare(PHP_VERSION, '5.4.7', '<') && $this->isProtocolRelative()) {
        $raw_path = $this->components['path'];
        $host_end = strpos($raw_path, '/', 2);

        if ($host_end === false) {
            $host_end = strlen($raw_path);
        }

        $this->components['host'] = substr($raw_path, 2, $host_end - 2);
        $this->components['path'] = substr($raw_path, $host_end);
    }
}
/**
 * Turn a possibly relative url into an absolute one, using the website url
 * as reference.
 *
 * String arguments are wrapped in Url instances; other values are used as
 * given.
 *
 * @static
 *
 * @param mixed $item_url    Unknown url (can be relative or not)
 * @param mixed $website_url Website url
 *
 * @return string
 */
public static function resolve($item_url, $website_url)
{
    $link = is_string($item_url) ? new self($item_url) : $item_url;
    $website = is_string($website_url) ? new self($website_url) : $website_url;

    if (!$link->isRelativeUrl()) {
        if ($link->isProtocolRelative()) {
            $link->setScheme($website->getScheme());
        }

        return $link->getAbsoluteUrl();
    }

    // A relative path is resolved against the directory of the website
    // url, an absolute path against its root.
    if ($link->isRelativePath()) {
        $base = $website->getBaseUrl($website->getBasePath());
    } else {
        $base = $website->getBaseUrl();
    }

    return $link->getAbsoluteUrl($base);
}
/**
 * Shortcut returning the base url of the given url.
 *
 * @static
 *
 * @param string $url
 *
 * @return string
 */
public static function base($url)
{
    $instance = new self($url);

    return $instance->getBaseUrl();
}
/**
 * Build "scheme://host[:port]" followed by the given suffix.
 *
 * @param string $suffix Add a suffix to the url
 *
 * @return string Empty string when the url has no host
 */
public function getBaseUrl($suffix = '')
{
    if (!$this->hasHost()) {
        return '';
    }

    return $this->getScheme('://').$this->getHost().$this->getPort(':').$suffix;
}
/**
 * Get the absolute URL.
 *
 * With a base url, the full path of this url (without its first character)
 * is appended to the absolute form of the base. Without one, the url is
 * rebuilt from its own host, or an empty string is returned when it has
 * none.
 *
 * @param string $base_url Use this url as base url
 *
 * @return string
 */
public function getAbsoluteUrl($base_url = '')
{
    if ($base_url) {
        $base = new self($base_url);

        return $base->getAbsoluteUrl().substr($this->getFullPath(), 1);
    }

    if ($this->hasHost()) {
        return $this->getBaseUrl().$this->getFullPath();
    }

    return '';
}
/**
 * Return true if the url has no scheme and is not protocol relative.
 *
 * @return bool
 */
public function isRelativeUrl()
{
    return !($this->hasScheme() || $this->isProtocolRelative());
}
/**
 * Return true if the path is relative, i.e. empty or not starting with a
 * slash.
 *
 * @return bool
 */
public function isRelativePath()
{
    $path = $this->getPath();

    // Square brackets: the curly brace offset syntax ($path{0}) is
    // deprecated in PHP 7.4 and a parse error in PHP 8.
    return empty($path) || $path[0] !== '/';
}
/**
 * Percent-encode the characters of a path that are not allowed in a URI.
 *
 * Unreserved characters, sub-delimiters, ":", "@", "/" and existing
 * percent-encoded sequences are kept; everything else, including a "%" not
 * followed by two hexadecimal digits, goes through rawurlencode().
 *
 * Imported from Guzzle library: https://github.com/guzzle/psr7/blob/master/src/Uri.php#L568-L582
 *
 * @param string $path           Path to filter
 * @param string $charUnreserved Character class content for unreserved characters
 * @param string $charSubDelims  Character class content for sub-delimiters
 *
 * @return string
 */
public function filterPath($path, $charUnreserved = 'a-zA-Z0-9_\-\.~', $charSubDelims = '!\$&\'\(\)\*\+,;=')
{
    $pattern = '/(?:[^'.$charUnreserved.$charSubDelims.':@\/%]+|%(?![A-Fa-f0-9]{2}))/';

    $encode = function (array $matches) {
        return rawurlencode($matches[0]);
    };

    return preg_replace_callback($pattern, $encode, $path);
}
/**
 * Get the path component, passed through filterPath().
 *
 * @return string
 */
public function getPath()
{
    $raw_path = '';

    if (!empty($this->components['path'])) {
        $raw_path = $this->components['path'];
    }

    return $this->filterPath($raw_path);
}
/**
 * Get the base path: the directory part of the path, with a leading
 * and a trailing slash.
 *
 * @return string
 */
public function getBasePath()
{
    $current = $this->getPath();
    $prefix = $this->isRelativePath() ? '/' : '';

    // A path ending with "/" is already a directory, otherwise take its parent.
    if (substr($current, -1) === '/') {
        $directory = $current;
    } else {
        $directory = dirname($current);
    }

    // Collapse "\/" and "//" sequences into a single slash.
    return preg_replace('/\\\\\/|\/\//', '/', $prefix.$directory.'/');
}
/**
 * Get the full path (path + querystring + fragment).
 *
 * A "/" is prepended when the path is relative.
 *
 * @return string
 */
public function getFullPath()
{
    $leading_slash = $this->isRelativePath() ? '/' : '';
    $full_path = $leading_slash.$this->getPath();

    if (!empty($this->components['query'])) {
        $full_path .= '?'.$this->components['query'];
    }

    if (!empty($this->components['fragment'])) {
        $full_path .= '#'.$this->components['fragment'];
    }

    return $full_path;
}
/**
 * Get the hostname.
 *
 * @return string Empty string when no host is set
 */
public function getHost()
{
    if (empty($this->components['host'])) {
        return '';
    }

    return $this->components['host'];
}
/**
 * Return true if the url has a non-empty hostname component.
 *
 * @return bool
 */
public function hasHost()
{
    return !empty($this->components['host']);
}
/**
 * Get the scheme, falling back to "http" when the url has none.
 *
 * @param string $suffix Text appended after the scheme (also after the "http" fallback)
 *
 * @return string
 */
public function getScheme($suffix = '')
{
    $scheme = 'http';

    if ($this->hasScheme()) {
        $scheme = $this->components['scheme'];
    }

    return $scheme.$suffix;
}
/**
 * Set the scheme, overwriting any existing scheme component.
 *
 * @param string $scheme Scheme to store (e.g. the value returned by getScheme())
 *
 * @return void
 */
public function setScheme($scheme)
{
    $this->components['scheme'] = $scheme;
}
/**
 * Return true if the url has a non-empty scheme component.
 *
 * @return bool
 */
public function hasScheme()
{
    return !empty($this->components['scheme']);
}
/**
 * Get the port.
 *
 * @param string $prefix Text prepended to the port, only when a port is set
 *
 * @return string Empty string when no port is set
 */
public function getPort($prefix = '')
{
    if (!$this->hasPort()) {
        return '';
    }

    return $prefix.$this->components['port'];
}
/**
 * Return true if the url has a non-empty port component.
 *
 * @return bool
 */
public function hasPort()
{
    return !empty($this->components['port']);
}
/**
 * Tell whether the raw url is protocol relative, i.e. begins with "//".
 *
 * @return bool
 */
public function isProtocolRelative()
{
    return strncmp($this->url, '//', 2) === 0;
}
}

View file

@ -0,0 +1,96 @@
<?php
namespace PicoFeed\Config;
/**
 * Config class.
 *
 * Key/value store exposed through magic setXxx()/getXxx() calls. Method names
 * are lower-cased before use, so the part after the "set"/"get" prefix is a
 * case-insensitive parameter name.
 *
 * @author Frederic Guillot
 *
 * @method  \PicoFeed\Config\Config setClientTimeout(integer $value)
 * @method  \PicoFeed\Config\Config setClientUserAgent(string $value)
 * @method  \PicoFeed\Config\Config setMaxRedirections(integer $value)
 * @method  \PicoFeed\Config\Config setMaxBodySize(integer $value)
 * @method  \PicoFeed\Config\Config setProxyHostname(string $value)
 * @method  \PicoFeed\Config\Config setProxyPort(integer $value)
 * @method  \PicoFeed\Config\Config setProxyUsername(string $value)
 * @method  \PicoFeed\Config\Config setProxyPassword(string $value)
 * @method  \PicoFeed\Config\Config setGrabberRulesFolder(string $value)
 * @method  \PicoFeed\Config\Config setGrabberTimeout(integer $value)
 * @method  \PicoFeed\Config\Config setGrabberUserAgent(string $value)
 * @method  \PicoFeed\Config\Config setParserHashAlgo(string $value)
 * @method  \PicoFeed\Config\Config setContentFiltering(boolean $value)
 * @method  \PicoFeed\Config\Config setTimezone(string $value)
 * @method  \PicoFeed\Config\Config setFilterIframeWhitelist(array $value)
 * @method  \PicoFeed\Config\Config setFilterIntegerAttributes(array $value)
 * @method  \PicoFeed\Config\Config setFilterAttributeOverrides(array $value)
 * @method  \PicoFeed\Config\Config setFilterRequiredAttributes(array $value)
 * @method  \PicoFeed\Config\Config setFilterMediaBlacklist(array $value)
 * @method  \PicoFeed\Config\Config setFilterMediaAttributes(array $value)
 * @method  \PicoFeed\Config\Config setFilterSchemeWhitelist(array $value)
 * @method  \PicoFeed\Config\Config setFilterWhitelistedTags(array $value)
 * @method  \PicoFeed\Config\Config setFilterBlacklistedTags(array $value)
 * @method  \PicoFeed\Config\Config setFilterImageProxyUrl($value)
 * @method  \PicoFeed\Config\Config setFilterImageProxyCallback($closure)
 * @method  \PicoFeed\Config\Config setFilterImageProxyProtocol($value)
 * @method  integer    getClientTimeout()
 * @method  string     getClientUserAgent()
 * @method  integer    getMaxRedirections()
 * @method  integer    getMaxBodySize()
 * @method  string     getProxyHostname()
 * @method  integer    getProxyPort()
 * @method  string     getProxyUsername()
 * @method  string     getProxyPassword()
 * @method  string     getGrabberRulesFolder()
 * @method  integer    getGrabberTimeout()
 * @method  string     getGrabberUserAgent()
 * @method  string     getParserHashAlgo()
 * @method  boolean    getContentFiltering(bool $default_value)
 * @method  string     getTimezone()
 * @method  array      getFilterIframeWhitelist(array $default_value)
 * @method  array      getFilterIntegerAttributes(array $default_value)
 * @method  array      getFilterAttributeOverrides(array $default_value)
 * @method  array      getFilterRequiredAttributes(array $default_value)
 * @method  array      getFilterMediaBlacklist(array $default_value)
 * @method  array      getFilterMediaAttributes(array $default_value)
 * @method  array      getFilterSchemeWhitelist(array $default_value)
 * @method  array      getFilterWhitelistedTags(array $default_value)
 * @method  array      getFilterBlacklistedTags(array $default_value)
 * @method  string     getFilterImageProxyUrl()
 * @method  \Closure   getFilterImageProxyCallback()
 * @method  string     getFilterImageProxyProtocol()
 */
class Config
{
    /**
     * Contains all parameters, keyed by the lower-cased parameter name.
     *
     * @var array
     */
    private $container = array();

    /**
     * Magic method to have any kind of setters or getters.
     *
     * - setXxx($value) stores $value and returns $this. A missing or null
     *   $value leaves the stored value untouched but still returns $this, so
     *   a fluent chain never breaks on a null argument.
     * - getXxx($default) returns the stored value, or $default (null when
     *   omitted) if nothing is stored.
     * - Any other method name returns null.
     *
     * @param string $name      Getter/Setter name
     * @param array  $arguments Method arguments
     *
     * @return mixed
     */
    public function __call($name, array $arguments)
    {
        $name = strtolower($name);
        $prefix = substr($name, 0, 3);
        $parameter = substr($name, 3);

        if ($prefix === 'set') {
            if (isset($arguments[0])) {
                $this->container[$parameter] = $arguments[0];
            }

            // Always return the instance: setters are documented as fluent.
            return $this;
        }

        if ($prefix === 'get') {
            $default_value = isset($arguments[0]) ? $arguments[0] : null;

            return isset($this->container[$parameter]) ? $this->container[$parameter] : $default_value;
        }

        return null;
    }
}

View file

@ -0,0 +1,33 @@
<?php
namespace PicoFeed\Encoding;
/**
 * Encoding class: converts arbitrary input to UTF-8.
 */
class Encoding
{
    /**
     * Convert $input from $encoding to UTF-8.
     *
     * The input is returned untouched when $encoding is 'utf-8' or empty,
     * and also when iconv() fails (it may still be usable as-is).
     *
     * @param string $input    Data to convert
     * @param string $encoding Source encoding name
     *
     * @return string
     */
    public static function convert($input, $encoding)
    {
        $needs_conversion = !($encoding === 'utf-8' || $encoding === '');

        if ($needs_conversion) {
            // Notices are silenced as a whole because the single notice
            // "Wrong charset, conversion from $in_encoding to $out_encoding is not allowed"
            // cannot be silenced on its own.
            set_error_handler(function () {}, E_NOTICE);

            // //IGNORE drops characters that cannot be represented.
            $converted = iconv($encoding, 'UTF-8//IGNORE', $input);

            restore_error_handler();

            if ($converted !== false) {
                return $converted;
            }
        }

        return $input;
    }
}

View file

@ -0,0 +1,699 @@
<?php
namespace PicoFeed\Filter;
use PicoFeed\Client\Url;
/**
 * Attribute Filter class.
 *
 * Decides, attribute by attribute, what is kept on a whitelisted tag and
 * rewrites some values (absolute urls, image proxy, https iframes).
 *
 * @author Frederic Guillot
 */
class Attribute
{
    /**
     * Image proxy url, used as a sprintf() format that receives the
     * rawurlencoded original image url.
     *
     * @var string
     */
    private $image_proxy_url = '';

    /**
     * Image proxy callback, called with the original image url when no
     * image proxy url is set.
     *
     * @var \Closure|null
     */
    private $image_proxy_callback = null;

    /**
     * limits the image proxy usage to this protocol.
     *
     * @var string
     */
    private $image_proxy_limit_protocol = '';

    /**
     * Tags and attribute whitelist.
     *
     * @var array
     */
    private $attribute_whitelist = array(
        'audio' => array('controls', 'src'),
        'video' => array('poster', 'controls', 'height', 'width', 'src'),
        'source' => array('src', 'type'),
        'dt' => array(),
        'dd' => array(),
        'dl' => array(),
        'table' => array(),
        'caption' => array(),
        'tr' => array(),
        'th' => array(),
        'td' => array(),
        'tbody' => array(),
        'thead' => array(),
        'h2' => array(),
        'h3' => array(),
        'h4' => array(),
        'h5' => array(),
        'h6' => array(),
        'strong' => array(),
        'em' => array(),
        'code' => array(),
        'pre' => array(),
        'blockquote' => array(),
        'p' => array(),
        'ul' => array(),
        'li' => array(),
        'ol' => array(),
        'br' => array(),
        'del' => array(),
        'a' => array('href'),
        'img' => array('src', 'title', 'alt'),
        'figure' => array(),
        'figcaption' => array(),
        'cite' => array(),
        'time' => array('datetime'),
        'abbr' => array('title'),
        'iframe' => array('width', 'height', 'frameborder', 'src', 'allowfullscreen'),
        'q' => array('cite'),
    );

    /**
     * Scheme whitelist.
     *
     * For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
     *
     * @var array
     */
    private $scheme_whitelist = array(
        'bitcoin:',
        'callto:',
        'ed2k://',
        'facetime://',
        'feed:',
        'ftp://',
        'geo:',
        'git://',
        'http://',
        'https://',
        'irc://',
        'irc6://',
        'ircs://',
        'jabber:',
        'magnet:',
        'mailto:',
        'nntp://',
        'rtmp://',
        'sftp://',
        'sip:',
        'sips:',
        'skype:',
        'smb://',
        'sms:',
        'spotify:',
        'ssh:',
        'steam:',
        'svn://',
        'tel:',
    );

    /**
     * Iframe source whitelist, everything else is ignored.
     *
     * @var array
     */
    private $iframe_whitelist = array(
        'http://www.youtube.com',
        'https://www.youtube.com',
        'http://player.vimeo.com',
        'https://player.vimeo.com',
        'http://www.dailymotion.com',
        'https://www.dailymotion.com',
        'http://vk.com',
        'https://vk.com',
    );

    /**
     * Blacklisted resources (matched as substrings of the attribute value).
     *
     * @var array
     */
    private $media_blacklist = array(
        'api.flattr.com',
        'feeds.feedburner.com',
        'share.feedsportal.com',
        'da.feedsportal.com',
        'rc.feedsportal.com',
        'rss.feedsportal.com',
        'res.feedsportal.com',
        'res1.feedsportal.com',
        'res2.feedsportal.com',
        'res3.feedsportal.com',
        'pi.feedsportal.com',
        'rss.nytimes.com',
        'feeds.wordpress.com',
        'stats.wordpress.com',
        'rss.cnn.com',
        'twitter.com/home?status=',
        'twitter.com/share',
        'twitter_icon_large.png',
        'www.facebook.com/sharer.php',
        'facebook_icon_large.png',
        'plus.google.com/share',
        'www.gstatic.com/images/icons/gplus-16.png',
        'www.gstatic.com/images/icons/gplus-32.png',
        'www.gstatic.com/images/icons/gplus-64.png',
    );

    /**
     * Attributes used for external resources.
     *
     * @var array
     */
    private $media_attributes = array(
        'src',
        'href',
        'poster',
    );

    /**
     * Attributes that must be integer.
     *
     * @var array
     */
    private $integer_attributes = array(
        'width',
        'height',
        'frameborder',
    );

    /**
     * Mandatory attributes for specified tags.
     *
     * @var array
     */
    private $required_attributes = array(
        'a' => array('href'),
        'img' => array('src'),
        'iframe' => array('src'),
        'audio' => array('src'),
        'source' => array('src'),
    );

    /**
     * Add attributes to specified tags.
     *
     * @var array
     */
    private $add_attributes = array(
        'a' => array('rel' => 'noreferrer', 'target' => '_blank'),
        'video' => array('controls' => 'true'),
    );

    /**
     * List of filters to apply, in this order, to every attribute.
     *
     * Each entry is the name of a method of this class taking
     * ($tag, $attribute, $value) and returning false to drop the attribute.
     *
     * @var array
     */
    private $filters = array(
        'filterAllowedAttribute',
        'filterIntegerAttribute',
        'rewriteAbsoluteUrl',
        'filterIframeAttribute',
        'filterBlacklistResourceAttribute',
        'filterProtocolUrlAttribute',
        'rewriteImageProxyUrl',
        'secureIframeSrc',
        'removeYouTubeAutoplay',
    );

    /**
     * Website url, used to resolve relative resource urls.
     *
     * @var \PicoFeed\Client\Url
     */
    private $website;

    /**
     * Constructor.
     *
     * @param \PicoFeed\Client\Url $website Website url instance
     */
    public function __construct(Url $website)
    {
        $this->website = $website;
    }

    /**
     * Apply filters to the attributes list.
     *
     * An attribute is removed as soon as one filter returns false; filters
     * that take the value by reference may rewrite it in place.
     *
     * @param string $tag        Tag name
     * @param array  $attributes Attributes dictionary
     *
     * @return array Filtered attributes
     */
    public function filter($tag, array $attributes)
    {
        foreach ($attributes as $attribute => &$value) {
            foreach ($this->filters as $filter) {
                if (!$this->$filter($tag, $attribute, $value)) {
                    unset($attributes[$attribute]);
                    break;
                }
            }
        }

        // Break the by-reference binding so the returned array carries no
        // dangling reference to its last element.
        unset($value);

        return $attributes;
    }

    /**
     * Return true if the value is allowed (remove not allowed attributes).
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value
     *
     * @return bool
     */
    public function filterAllowedAttribute($tag, $attribute, $value)
    {
        return isset($this->attribute_whitelist[$tag]) && in_array($attribute, $this->attribute_whitelist[$tag]);
    }

    /**
     * Return true unless the attribute must be an integer and its value
     * is not made of digits only.
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value
     *
     * @return bool
     */
    public function filterIntegerAttribute($tag, $attribute, $value)
    {
        if (in_array($attribute, $this->integer_attributes)) {
            return ctype_digit($value);
        }

        return true;
    }

    /**
     * Return true if the iframe source is allowed (remove not allowed iframe).
     *
     * The value must start with a whitelist entry. When the entry is a bare
     * origin (no "/" after the host) the match must also stop at a host
     * boundary: end of string, an optional ":port", then "/", "?" or "#".
     * This rejects look-alike hosts such as "https://www.youtube.com.evil.tld/"
     * and userinfo tricks such as "https://www.youtube.com@evil.tld/".
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value
     *
     * @return bool
     */
    public function filterIframeAttribute($tag, $attribute, $value)
    {
        if ($tag !== 'iframe' || $attribute !== 'src') {
            return true;
        }

        foreach ($this->iframe_whitelist as $url) {
            if (strpos($value, $url) !== 0) {
                continue;
            }

            // Locate where the authority (host) part of the entry starts.
            $scheme_end = strpos($url, '//');
            $authority_start = $scheme_end === false ? 0 : $scheme_end + 2;

            // An entry that already contains a path pins the host by itself.
            if (strpos($url, '/', $authority_start) !== false) {
                return true;
            }

            $rest = (string) substr($value, strlen($url));

            if (preg_match('/^(?::\d+)?(?:[\/?#]|\z)/', $rest)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Return true if the resource is not blacklisted (remove blacklisted resource attributes).
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value
     *
     * @return bool
     */
    public function filterBlacklistResourceAttribute($tag, $attribute, $value)
    {
        if ($this->isResource($attribute) && $this->isBlacklistedMedia($value)) {
            return false;
        }

        return true;
    }

    /**
     * Convert all relative links to absolute url.
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value (rewritten in place)
     *
     * @return bool Always true
     */
    public function rewriteAbsoluteUrl($tag, $attribute, &$value)
    {
        if ($this->isResource($attribute)) {
            $value = Url::resolve($value, $this->website);
        }

        return true;
    }

    /**
     * Turns iframes' src attribute from http to https to prevent
     * mixed active content.
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value (rewritten in place)
     *
     * @return bool Always true
     */
    public function secureIframeSrc($tag, $attribute, &$value)
    {
        if ($tag === 'iframe' && $attribute === 'src' && strpos($value, 'http://') === 0) {
            // Insert an "s" right after "http".
            $value = substr_replace($value, 's', 4, 0);
        }

        return true;
    }

    /**
     * Removes YouTube autoplay from iframes ("autoplay=1" becomes "autoplay=0").
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value (rewritten in place)
     *
     * @return bool Always true
     */
    public function removeYouTubeAutoplay($tag, $attribute, &$value)
    {
        $regex = '%^(https://(?:www\.)?youtube.com/.*\?.*autoplay=)(1)(.*)%i';

        if ($tag === 'iframe' && $attribute === 'src' && preg_match($regex, $value)) {
            $value = preg_replace($regex, '${1}0$3', $value);
        }

        return true;
    }

    /**
     * Rewrite image url to use with a proxy.
     *
     * Only img/src values are touched, and only when no protocol limit is
     * set or the value starts with "<limit protocol>:". The proxy url takes
     * precedence over the proxy callback.
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value (rewritten in place)
     *
     * @return bool Always true
     */
    public function rewriteImageProxyUrl($tag, $attribute, &$value)
    {
        if ($tag === 'img' && $attribute === 'src'
            && !($this->image_proxy_limit_protocol !== '' && stripos($value, $this->image_proxy_limit_protocol.':') !== 0)) {
            if ($this->image_proxy_url) {
                $value = sprintf($this->image_proxy_url, rawurlencode($value));
            } elseif (is_callable($this->image_proxy_callback)) {
                $value = call_user_func($this->image_proxy_callback, $value);
            }
        }

        return true;
    }

    /**
     * Return true if the scheme is authorized.
     *
     * @param string $tag       Tag name
     * @param string $attribute Attribute name
     * @param string $value     Attribute value
     *
     * @return bool
     */
    public function filterProtocolUrlAttribute($tag, $attribute, $value)
    {
        if ($this->isResource($attribute) && !$this->isAllowedProtocol($value)) {
            return false;
        }

        return true;
    }

    /**
     * Automatically add some attributes for specific tags.
     *
     * Attributes already present in $attributes are kept as they are;
     * only the missing ones are added.
     *
     * @param string $tag        Tag name
     * @param array  $attributes Attributes list
     *
     * @return array
     */
    public function addAttributes($tag, array $attributes)
    {
        if (isset($this->add_attributes[$tag])) {
            $attributes += $this->add_attributes[$tag];
        }

        return $attributes;
    }

    /**
     * Return true if all required attributes are present.
     *
     * @param string $tag        Tag name
     * @param array  $attributes Attributes list
     *
     * @return bool
     */
    public function hasRequiredAttributes($tag, array $attributes)
    {
        if (isset($this->required_attributes[$tag])) {
            foreach ($this->required_attributes[$tag] as $attribute) {
                if (!isset($attributes[$attribute])) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Check if an attribute name is an external resource.
     *
     * @param string $attribute Attribute name
     *
     * @return bool
     */
    public function isResource($attribute)
    {
        return in_array($attribute, $this->media_attributes);
    }

    /**
     * Detect if the protocol is allowed or not (case-sensitive prefix match).
     *
     * @param string $value Attribute value
     *
     * @return bool
     */
    public function isAllowedProtocol($value)
    {
        foreach ($this->scheme_whitelist as $protocol) {
            if (strpos($value, $protocol) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Detect if an url is blacklisted.
     *
     * @param string $resource Attribute value (URL)
     *
     * @return bool
     */
    public function isBlacklistedMedia($resource)
    {
        foreach ($this->media_blacklist as $name) {
            if (strpos($resource, $name) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convert the attribute list to html.
     *
     * Values are escaped; attribute names are written as they are.
     *
     * @param array $attributes Attributes
     *
     * @return string
     */
    public function toHtml(array $attributes)
    {
        $html = array();

        foreach ($attributes as $attribute => $value) {
            $html[] = sprintf('%s="%s"', $attribute, Filter::escape($value));
        }

        return implode(' ', $html);
    }

    /**
     * Set whitelisted tags and attributes for each tag.
     *
     * An empty array keeps the current list (same for every setter below).
     *
     * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
     *
     * @return Attribute
     */
    public function setWhitelistedAttributes(array $values)
    {
        $this->attribute_whitelist = $values ?: $this->attribute_whitelist;

        return $this;
    }

    /**
     * Set scheme whitelist.
     *
     * @param array $values List of scheme: ['http://', 'ftp://']
     *
     * @return Attribute
     */
    public function setSchemeWhitelist(array $values)
    {
        $this->scheme_whitelist = $values ?: $this->scheme_whitelist;

        return $this;
    }

    /**
     * Set media attributes (used to load external resources).
     *
     * @param array $values List of values: ['src', 'href']
     *
     * @return Attribute
     */
    public function setMediaAttributes(array $values)
    {
        $this->media_attributes = $values ?: $this->media_attributes;

        return $this;
    }

    /**
     * Set blacklisted external resources.
     *
     * @param array $values List of tags: ['http://google.com/', '...']
     *
     * @return Attribute
     */
    public function setMediaBlacklist(array $values)
    {
        $this->media_blacklist = $values ?: $this->media_blacklist;

        return $this;
    }

    /**
     * Set mandatory attributes for whitelisted tags.
     *
     * @param array $values List of tags: ['img' => ['src']]
     *
     * @return Attribute
     */
    public function setRequiredAttributes(array $values)
    {
        $this->required_attributes = $values ?: $this->required_attributes;

        return $this;
    }

    /**
     * Set attributes to add automatically to specific tags.
     *
     * @param array $values List of tags: ['a' => ['target' => '_blank']]
     *
     * @return Attribute
     */
    public function setAttributeOverrides(array $values)
    {
        $this->add_attributes = $values ?: $this->add_attributes;

        return $this;
    }

    /**
     * Set attributes that must be an integer.
     *
     * @param array $values List of tags: ['width', 'height']
     *
     * @return Attribute
     */
    public function setIntegerAttributes(array $values)
    {
        $this->integer_attributes = $values ?: $this->integer_attributes;

        return $this;
    }

    /**
     * Set allowed iframe resources.
     *
     * @param array $values List of tags: ['http://www.youtube.com']
     *
     * @return Attribute
     */
    public function setIframeWhitelist(array $values)
    {
        $this->iframe_whitelist = $values ?: $this->iframe_whitelist;

        return $this;
    }

    /**
     * Set image proxy URL.
     *
     * The original image url will be urlencoded
     *
     * @param string $url Proxy URL
     *
     * @return Attribute
     */
    public function setImageProxyUrl($url)
    {
        $this->image_proxy_url = $url ?: $this->image_proxy_url;

        return $this;
    }

    /**
     * Set image proxy callback.
     *
     * @param \Closure $callback
     *
     * @return Attribute
     */
    public function setImageProxyCallback($callback)
    {
        $this->image_proxy_callback = $callback ?: $this->image_proxy_callback;

        return $this;
    }

    /**
     * Set image proxy protocol restriction.
     *
     * @param string $value
     *
     * @return Attribute
     */
    public function setImageProxyProtocol($value)
    {
        $this->image_proxy_limit_protocol = $value ?: $this->image_proxy_limit_protocol;

        return $this;
    }
}

View file

@ -0,0 +1,155 @@
<?php
namespace PicoFeed\Filter;
/**
 * Filter class.
 *
 * Collection of helpers used to clean HTML/XML data.
 *
 * @author Frederic Guillot
 */
class Filter
{
    /**
     * Get the Html filter instance.
     *
     * @static
     *
     * @param string $html    HTML content
     * @param string $website Site URL (used to build absolute URL)
     *
     * @return Html
     */
    public static function html($html, $website)
    {
        $filter = new Html($html, $website);

        return $filter;
    }

    /**
     * Escape HTML content.
     *
     * Single and double quotes are escaped; entities already present in
     * the content are not encoded a second time.
     *
     * @static
     *
     * @param string $content Content to escape
     *
     * @return string
     */
    public static function escape($content)
    {
        return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the doctype and the opening/closing html, head and body tags.
     *
     * Note: unlike the other helpers of this class, this is an instance method.
     *
     * @param string $data Input data
     *
     * @return string
     */
    public function removeHTMLTags($data)
    {
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML tag from a document.
     *
     * Strips everything up to the end of the first processing instruction
     * when an XML declaration is present, then does the same for
     * xml-stylesheet instructions. When no processing-instruction terminator
     * exists in the data, nothing is stripped (previously the first two bytes
     * were cut off in that case).
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripXmlTag($data)
    {
        if (strpos($data, '<?xml') !== false) {
            $end = strpos($data, '?>');

            // No terminator means there is no complete declaration to strip.
            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        }

        do {
            $pos = strpos($data, '<?xml-stylesheet ');

            if ($pos !== false) {
                $end = strpos($data, '?>');

                // Unterminated instruction: leave the data alone and stop.
                if ($end === false) {
                    break;
                }

                $data = ltrim(substr($data, $end + 2));
            }
        } while ($pos !== false && $pos < 200);

        return $data;
    }

    /**
     * Strip head tag from the HTML content.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripHeadTags($data)
    {
        return preg_replace('@<head[^>]*?>.*?</head>@siu', '', $data);
    }

    /**
     * Trim whitespace from the begining, the end and inside a string and don't break utf-8 string.
     *
     * Carriage returns, tabs and newlines are each replaced by one space.
     *
     * @static
     *
     * @param string $value Raw data
     *
     * @return string Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        $value = str_replace("\r", ' ', $value);
        $value = str_replace("\t", ' ', $value);
        $value = str_replace("\n", ' ', $value);
        // $value = preg_replace('/\s+/', ' ', $value); <= break utf-8
        return trim($value);
    }

    /**
     * Fixes before XML parsing.
     *
     * Removes numeric entities and raw characters that are not valid
     * XML 1.0 characters.
     *
     * @static
     *
     * @param string $data Raw data
     *
     * @return string Normalized data
     */
    public static function normalizeData($data)
    {
        $entities = array(
            '/(&#)(\d+);/m', // decimal encoded
            '/(&#x)([a-f0-9]+);/mi', // hex encoded
        );

        // strip invalid XML 1.0 characters which are encoded as entities
        $data = preg_replace_callback($entities, function ($matches) {
            $code_point = $matches[2];

            // convert hex entity to decimal
            if (strtolower($matches[1]) === '&#x') {
                $code_point = hexdec($code_point);
            }

            $code_point = (int) $code_point;

            // replace invalid characters
            if ($code_point < 9
                || ($code_point > 10 && $code_point < 13)
                || ($code_point > 13 && $code_point < 32)
                || ($code_point > 55295 && $code_point < 57344)
                || ($code_point > 65533 && $code_point < 65536)
                || $code_point > 1114111
            ) {
                return '';
            }

            return $matches[0];
        }, $data);

        // strip every utf-8 character than isn't in the range of valid XML 1.0 characters
        return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
    }
}

View file

@ -0,0 +1,243 @@
<?php
namespace PicoFeed\Filter;
use PicoFeed\Config\Config;
use PicoFeed\Client\Url;
use PicoFeed\Scraper\RuleLoader;
use PicoFeed\Parser\XmlParser;
/**
 * HTML Filter class.
 *
 * Converts the HTML to XML, walks it with PHP's event-based XML parser and
 * rebuilds an output string from the tags and attributes accepted by the
 * Tag and Attribute filters.
 *
 * @author Frederic Guillot
 */
class Html
{
    /**
     * Config object.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Unfiltered XML data.
     *
     * @var string
     */
    private $input = '';

    /**
     * Filtered XML data.
     *
     * @var string
     */
    private $output = '';

    /**
     * Stack of "empty" flags, one per currently open tag.
     *
     * startTag() pushes a flag, endTag() pops it; a true flag means the
     * opening tag was not written to the output.
     *
     * @var array
     */
    private $empty_tags = array();

    /**
     * Empty flag of the tag most recently opened.
     *
     * @var bool
     */
    private $empty = true;

    /**
     * Tag instance (replaced by a Tag object in the constructor).
     *
     * @var \PicoFeed\Filter\Tag
     */
    public $tag = '';

    /**
     * Attribute instance (replaced by an Attribute object in the constructor).
     *
     * @var \PicoFeed\Filter\Attribute
     */
    public $attribute = '';

    /**
     * The website to filter.
     *
     * @var string
     */
    private $website;

    /**
     * Initialize the filter, all inputs data must be encoded in UTF-8 before.
     *
     * @param string $html    HTML content
     * @param string $website Site URL (used to build absolute URL)
     */
    public function __construct($html, $website)
    {
        $this->config = new Config();
        $this->input = XmlParser::htmlToXml($html);
        $this->output = '';
        $this->tag = new Tag($this->config);
        $this->website = $website;
        $this->attribute = new Attribute(new Url($website));
    }

    /**
     * Set config object.
     *
     * When the config is not null, its filter settings are copied to the
     * Attribute and Tag instances.
     *
     * @param \PicoFeed\Config\Config $config Config instance
     *
     * @return \PicoFeed\Filter\Html
     */
    public function setConfig($config)
    {
        $this->config = $config;

        if ($this->config !== null) {
            $this->attribute->setImageProxyCallback($this->config->getFilterImageProxyCallback());
            $this->attribute->setImageProxyUrl($this->config->getFilterImageProxyUrl());
            $this->attribute->setImageProxyProtocol($this->config->getFilterImageProxyProtocol());
            $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
            $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
            $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
            $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
            $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
            $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
            $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));
            // The same setting feeds both filters: the full map goes to
            // Attribute, only its keys (tag names) go to Tag.
            $this->attribute->setWhitelistedAttributes($this->config->getFilterWhitelistedTags(array()));
            $this->tag->setWhitelistedTags(array_keys($this->config->getFilterWhitelistedTags(array())));
        }

        return $this;
    }

    /**
     * Run tags/attributes filtering.
     *
     * Runs preFilter(), parses the input with startTag/endTag/dataTag as
     * handlers (case folding disabled), then runs postFilter().
     *
     * @return string Filtered output
     */
    public function execute()
    {
        $this->preFilter();

        $parser = xml_parser_create();

        xml_set_object($parser, $this);
        xml_set_element_handler($parser, 'startTag', 'endTag');
        xml_set_character_data_handler($parser, 'dataTag');
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
        xml_parse($parser, $this->input, true);
        xml_parser_free($parser);

        $this->postFilter();

        return $this->output;
    }

    /**
     * Called before XML parsing: removes blacklisted tags from the input.
     */
    public function preFilter()
    {
        $this->input = $this->tag->removeBlacklistedTags($this->input);
    }

    /**
     * Called after XML parsing: removes empty tags, applies the filter
     * rules, collapses repeated break tags and trims the output.
     */
    public function postFilter()
    {
        $this->output = $this->tag->removeEmptyTags($this->output);
        $this->output = $this->filterRules($this->output);
        $this->output = $this->tag->removeMultipleBreakTags($this->output);
        $this->output = trim($this->output);
    }

    /**
     * Called after XML parsing.
     *
     * Loads the rules for the website through RuleLoader. For each entry of
     * the 'filter' rules whose pattern matches the website full path, every
     * search/replace regex pair of that entry is applied to the content.
     *
     * @param string $content the content that should be filtered
     *
     * @return string
     */
    public function filterRules($content)
    {
        // the constructor should require a config, then this if can be removed
        if ($this->config === null) {
            $config = new Config();
        } else {
            $config = $this->config;
        }

        $loader = new RuleLoader($config);
        $rules = $loader->getRules($this->website);

        $url = new Url($this->website);
        $sub_url = $url->getFullPath();

        if (isset($rules['filter'])) {
            foreach ($rules['filter'] as $pattern => $rule) {
                if (preg_match($pattern, $sub_url)) {
                    foreach ($rule as $search => $replace) {
                        $content = preg_replace($search, $replace, $content);
                    }
                }
            }
        }

        return $content;
    }

    /**
     * Parse opening tag.
     *
     * The tag is written to the output only when it is allowed and still
     * has its required attributes after attribute filtering.
     *
     * @param resource $parser     XML parser
     * @param string   $tag        Tag name
     * @param array    $attributes Tag attributes
     */
    public function startTag($parser, $tag, array $attributes)
    {
        $this->empty = true;

        if ($this->tag->isAllowed($tag, $attributes)) {
            $attributes = $this->attribute->filter($tag, $attributes);

            if ($this->attribute->hasRequiredAttributes($tag, $attributes)) {
                $attributes = $this->attribute->addAttributes($tag, $attributes);

                $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes));
                $this->empty = false;
            }
        }

        // Remember the outcome so endTag() knows whether to close this tag.
        $this->empty_tags[] = $this->empty;
    }

    /**
     * Parse closing tag.
     *
     * The closing tag is written only when the matching opening tag was
     * written and the tag is allowed.
     *
     * @param resource $parser XML parser
     * @param string   $tag    Tag name
     */
    public function endTag($parser, $tag)
    {
        if (!array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) {
            $this->output .= $this->tag->closeHtmlTag($tag);
        }
    }

    /**
     * Parse tag content: the text is escaped and appended to the output.
     *
     * @param resource $parser  XML parser
     * @param string   $content Tag content
     */
    public function dataTag($parser, $content)
    {
        // Replace &nbsp; with normal space
        $content = str_replace("\xc2\xa0", ' ', $content);
        $this->output .= Filter::escape($content);
    }
}

View file

@ -0,0 +1,215 @@
<?php
namespace PicoFeed\Filter;
use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Parser\XmlParser;
/**
 * Tag Filter class.
 *
 * Decides which tags are kept and renders opening/closing tags.
 *
 * @author Frederic Guillot
 */
class Tag extends Base
{
    /**
     * Tags blacklist (Xpath expressions).
     *
     * @var array
     */
    private $tag_blacklist = array('//script', '//style');

    /**
     * Tags whitelist.
     *
     * @var array
     */
    private $tag_whitelist = array(
        'audio', 'video', 'source',
        'dt', 'dd', 'dl',
        'table', 'caption', 'tr', 'th', 'td', 'tbody', 'thead',
        'h2', 'h3', 'h4', 'h5', 'h6',
        'strong', 'em', 'code', 'pre', 'blockquote',
        'p', 'ul', 'li', 'ol', 'br', 'del',
        'a', 'img', 'figure', 'figcaption',
        'cite', 'time', 'abbr', 'iframe', 'q',
    );

    /**
     * Tell whether a tag is whitelisted and is not a pixel tracker.
     *
     * @param string $tag        Tag name
     * @param array  $attributes Attributes dictionary
     *
     * @return bool
     */
    public function isAllowed($tag, array $attributes)
    {
        if (!$this->isAllowedTag($tag)) {
            return false;
        }

        return !$this->isPixelTracker($tag, $attributes);
    }

    /**
     * Render the HTML opening tag ("/>" is used for self-closing tags).
     *
     * @param string $tag        Tag name
     * @param string $attributes Attributes already converted to html
     *
     * @return string
     */
    public function openHtmlTag($tag, $attributes = '')
    {
        $html = '<'.$tag;

        if (!empty($attributes)) {
            $html .= ' '.$attributes;
        }

        return $html.($this->isSelfClosingTag($tag) ? '/>' : '>');
    }

    /**
     * Render the HTML closing tag (nothing for self-closing tags).
     *
     * @param string $tag Tag name
     *
     * @return string
     */
    public function closeHtmlTag($tag)
    {
        if ($this->isSelfClosingTag($tag)) {
            return '';
        }

        return '</'.$tag.'>';
    }

    /**
     * Tell whether the tag is self-closing (br and img only).
     *
     * @param string $tag Tag name
     *
     * @return bool
     */
    public function isSelfClosingTag($tag)
    {
        return in_array($tag, array('br', 'img'), true);
    }

    /**
     * Check if a tag is on the whitelist, or among the keys of the
     * whitelisted tags held by the config.
     *
     * @param string $tag Tag name
     *
     * @return bool
     */
    public function isAllowedTag($tag)
    {
        $configured = array_keys($this->config->getFilterWhitelistedTags(array()));
        $allowed = array_merge($this->tag_whitelist, $configured);

        return in_array($tag, $allowed);
    }

    /**
     * Detect if an image tag is a pixel tracker (height and width both 1).
     *
     * @param string $tag        Tag name
     * @param array  $attributes Tag attributes
     *
     * @return bool
     */
    public function isPixelTracker($tag, array $attributes)
    {
        if ($tag !== 'img') {
            return false;
        }

        if (!isset($attributes['height']) || !isset($attributes['width'])) {
            return false;
        }

        return $attributes['height'] == 1 && $attributes['width'] == 1;
    }

    /**
     * Remove every node matched by the tag blacklist.
     *
     * @param string $data Input data
     *
     * @return string Serialized document, or an empty string when the data cannot be loaded
     */
    public function removeBlacklistedTags($data)
    {
        $document = XmlParser::getDomDocument($data);

        if ($document === false) {
            return '';
        }

        $finder = new DOMXpath($document);
        $expression = implode(' | ', $this->tag_blacklist);

        foreach ($finder->query($expression) as $blacklisted) {
            $blacklisted->parentNode->removeChild($blacklisted);
        }

        return $document->saveXML();
    }

    /**
     * Remove empty tags.
     *
     * @param string $data Input data
     *
     * @return string
     */
    public function removeEmptyTags($data)
    {
        $empty_tag_pattern = '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU';

        return preg_replace($empty_tag_pattern, '', $data);
    }

    /**
     * Replace a run of break tags by a single one.
     *
     * @param string $data Input data
     *
     * @return string
     */
    public function removeMultipleBreakTags($data)
    {
        $break_run_pattern = "/(<br\s*\/?>\s*)+/";

        return preg_replace($break_run_pattern, '<br/>', $data);
    }

    /**
     * Replace the tag whitelist; an empty array keeps the current one.
     *
     * @param array $values List of tag names
     *
     * @return Tag
     */
    public function setWhitelistedTags(array $values)
    {
        if ($values) {
            $this->tag_whitelist = $values;
        }

        return $this;
    }
}

View file

@ -0,0 +1,23 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Parser\Item;
/**
 * Content Generator Interface
 *
 * Contract for classes that try to generate the content of a feed item.
 *
 * @package PicoFeed\Generator
 * @author  Frederic Guillot
 */
interface ContentGeneratorInterface
{
    /**
     * Execute Content Generator
     *
     * @access public
     * @param  Item $item Item to generate content for
     * @return boolean
     */
    public function execute(Item $item);
}

View file

@ -0,0 +1,36 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
* File Content Generator
*
* @package PicoFeed\Generator
* @author Frederic Guillot
*/
class FileContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * File extensions (without the leading dot) handled by this generator.
     *
     * @access private
     * @var string[]
     */
    private $extensions = array('pdf');

    /**
     * Execute Content Generator
     *
     * When the item URL ends with one of the supported file extensions, the
     * item content is replaced by a link pointing to that file.
     *
     * @access public
     * @param  Item $item
     * @return boolean True when the content was replaced, false otherwise
     */
    public function execute(Item $item)
    {
        $url = $item->getUrl();

        foreach ($this->extensions as $extension) {
            // The dot is part of the comparison so that "/getpdf" is not mistaken
            // for a PDF file; the comparison ignores case so ".PDF" is accepted too.
            $suffix = '.'.$extension;

            if (strcasecmp(substr($url, -strlen($suffix)), $suffix) === 0) {
                // The URL comes from the feed: escape it before it is placed in
                // an attribute value and in the link text.
                $escaped_url = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
                $item->setContent('<a href="'.$escaped_url.'" target="_blank">'.$escaped_url.'</a>');

                return true;
            }
        }

        return false;
    }
}

View file

@ -0,0 +1,67 @@
<?php
namespace PicoFeed\Generator;
use PicoFeed\Base;
use PicoFeed\Parser\Item;
/**
* Youtube Content Generator
*
* @package PicoFeed\Generator
* @author Frederic Guillot
*/
class YoutubeContentGenerator extends Base implements ContentGeneratorInterface
{
    /**
     * Execute Content Generator
     *
     * Items that declare the "yt" namespace are handled from their XML,
     * every other item from its URL.
     *
     * @access public
     * @param  Item $item
     * @return boolean True when an embedded player replaced the item content
     */
    public function execute(Item $item)
    {
        if ($item->hasNamespace('yt')) {
            return $this->generateHtmlFromXml($item);
        }

        return $this->generateHtmlFromUrl($item);
    }

    /**
     * Generate HTML from the yt:videoId tag of the item
     *
     * @access private
     * @param  Item $item
     * @return boolean
     */
    private function generateHtmlFromXml(Item $item)
    {
        // getTag() returns false on XPath errors and an empty array when the tag is missing.
        $videoId = $item->getTag('yt:videoId');

        if (! empty($videoId)) {
            $item->setContent($this->buildEmbedHtml($videoId[0]));
            return true;
        }

        return false;
    }

    /**
     * Generate HTML from item URL
     *
     * @access public
     * @param  Item $item
     * @return bool
     */
    public function generateHtmlFromUrl(Item $item)
    {
        // Capture only the video id: anything following it (for example
        // "&feature=..." or "&list=...") must not leak into the embed URL.
        // The "v" parameter is also accepted when it is not the first one.
        $pattern = '/youtube\.com\/watch\?(?:[^#]*?&)?v=([\w-]+)/';

        if (preg_match($pattern, $item->getUrl(), $matches)) {
            $item->setContent($this->buildEmbedHtml($matches[1]));
            return true;
        }

        return false;
    }

    /**
     * Build the iframe markup for a video id
     *
     * @access private
     * @param  string $videoId
     * @return string
     */
    private function buildEmbedHtml($videoId)
    {
        // The id originates from the feed, so it is escaped before being used in an attribute.
        $escaped_id = htmlspecialchars($videoId, ENT_QUOTES, 'UTF-8');

        return '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$escaped_id.'" frameborder="0"></iframe>';
    }
}

View file

@ -0,0 +1,114 @@
<?php
namespace PicoFeed\Logging;
use DateTime;
use DateTimeZone;
/**
* Logging class.
*
* @author Frederic Guillot
*/
class Logger
{
    /**
     * Messages collected so far, oldest first.
     *
     * @static
     *
     * @var array
     */
    private static $messages = array();

    /**
     * Timezone identifier used to timestamp the messages.
     *
     * @static
     *
     * @var string
     */
    private static $timezone = 'UTC';

    /**
     * Whether messages are recorded (off by default).
     *
     * @static
     *
     * @var bool
     */
    public static $enable = false;

    /**
     * Turn message recording on.
     *
     * @static
     */
    public static function enable()
    {
        self::$enable = true;
    }

    /**
     * Record a message prefixed with the current date and time.
     *
     * Nothing happens while logging is disabled.
     *
     * @static
     *
     * @param string $message Message
     */
    public static function setMessage($message)
    {
        if (! self::$enable) {
            return;
        }

        $now = new DateTime('now', new DateTimeZone(self::$timezone));
        $timestamp = $now->format('Y-m-d H:i:s');

        self::$messages[] = '['.$timestamp.'] '.$message;
    }

    /**
     * Return every recorded message.
     *
     * @static
     *
     * @return array
     */
    public static function getMessages()
    {
        return self::$messages;
    }

    /**
     * Forget every recorded message.
     *
     * @static
     */
    public static function deleteMessages()
    {
        self::$messages = array();
    }

    /**
     * Change the timezone used for the timestamps.
     *
     * An empty value keeps the timezone currently in use.
     *
     * @static
     *
     * @see http://php.net/manual/en/timezones.php
     *
     * @param string $timezone Timezone
     */
    public static function setTimeZone($timezone)
    {
        if ($timezone) {
            self::$timezone = $timezone;
        }
    }

    /**
     * Return the messages joined by line breaks, followed by a final line break.
     *
     * @static
     *
     * @return string
     */
    public static function toString()
    {
        $joined = implode(PHP_EOL, self::$messages);

        return $joined.PHP_EOL;
    }
}

View file

@ -0,0 +1,364 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
* Atom parser.
*
* @author Frederic Guillot
*/
class Atom extends Parser
{
    /**
     * Namespaces registered for the XPath queries of this parser.
     */
    protected $namespaces = array(
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Get the path to the items XML tree.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        return $this->queryAtomOrPlain($xml, 'atom:entry', 'entry');
    }

    /**
     * Find the feed url (link with rel="self").
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $self_link = $this->getUrl($xml, 'self');
        $feed->setFeedUrl($self_link);
    }

    /**
     * Find the site url (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $site_link = $this->getUrl($xml, 'alternate', true);
        $feed->setSiteUrl($site_link);
    }

    /**
     * Find the feed description (subtitle element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:subtitle', 'subtitle');
        $feed->setDescription(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:logo', 'logo');
        $feed->setLogo(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed icon.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:icon', 'icon');
        $feed->setIcon(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed title; the site url is used when the title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:title', 'title');
        $title = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (! $title) {
            $title = $feed->getSiteUrl();
        }

        $feed->setTitle($title);
    }

    /**
     * Find the feed language (xml:lang attribute).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        // First look at the children that are not entries, then at the given element itself.
        $nodes = $this->queryAtomOrPlain($xml, '*[not(self::atom:entry)]/@xml:lang', '@xml:lang');
        $feed->setLanguage(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed id.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:id', 'id');
        $feed->setId(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed date (updated element).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($xml, 'atom:updated', 'updated');
        $raw_date = XmlParser::getValue($nodes);

        $feed->setDate($this->getDateParser()->getDateTime($raw_date));
    }

    /**
     * Find the item date.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $published_nodes = $this->queryAtomOrPlain($entry, 'atom:published', 'published');
        $updated_nodes = $this->queryAtomOrPlain($entry, 'atom:updated', 'updated');

        $published = null;
        if (! empty($published_nodes)) {
            $published = $this->getDateParser()->getDateTime((string) current($published_nodes));
        }

        $updated = null;
        if (! empty($updated_nodes)) {
            $updated = $this->getDateParser()->getDateTime((string) current($updated_nodes));
        }

        // No date at all on the entry: reuse the date of the feed.
        if ($published === null && $updated === null) {
            $item->setDate($feed->getDate());
            return;
        }

        // Both dates are present: keep the most recent one.
        if ($published !== null && $updated !== null) {
            $item->setDate(max($published, $updated));
            return;
        }

        // Exactly one of the two dates is present.
        $item->setDate($updated ?: $published);
    }

    /**
     * Find the item title; the item url is used when the title is empty.
     *
     * @param SimpleXMLElement $entry Feed item
     * @param Item             $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $nodes = $this->queryAtomOrPlain($entry, 'atom:title', 'title');
        $title = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (! $title) {
            $title = $item->getUrl();
        }

        $item->setTitle($title);
    }

    /**
     * Find the item author, falling back to the author declared on the feed.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $names = $this->queryAtomOrPlain($entry, 'atom:author/atom:name', 'author/name');

        if (! $names) {
            $names = $this->queryAtomOrPlain($xml, 'atom:author/atom:name', 'author/name');
        }

        $item->setAuthor(XmlParser::getValue($names));
    }

    /**
     * Find the item content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $content = $this->getContent($entry);
        $item->setContent($content);
    }

    /**
     * Find the item URL (link with rel="alternate", or a link without rel).
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $url = $this->getUrl($entry, 'alternate', true);
        $item->setUrl($url);
    }

    /**
     * Generate the item id.
     *
     * The id element is hashed when the entry has one; otherwise the hash is
     * built from the item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = $this->queryAtomOrPlain($entry, 'atom:id', 'id');

        if (empty($nodes)) {
            $item->setId($this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            ));
            return;
        }

        $item->setId($this->generateId(XmlParser::getValue($nodes)));
    }

    /**
     * Find the item enclosure (link with rel="enclosure").
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $link = $this->findLink($entry, 'enclosure');

        if (! $link) {
            return;
        }

        $item->setEnclosureUrl(Url::resolve((string) $link['href'], $feed->getSiteUrl()));
        $item->setEnclosureType((string) $link['type']);
    }

    /**
     * Find the item language; the feed language is used when the entry has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, './/@xml:lang');
        $language = XmlParser::getValue($nodes);

        if (! $language) {
            $language = $feed->getLanguage();
        }

        $item->setLanguage($language);
    }

    /**
     * Get the URL from a link tag.
     *
     * @param SimpleXMLElement $xml      XML tag
     * @param string           $rel      Link relationship: alternate, enclosure, related, self, via
     * @param bool             $fallback When true, a link without rel attribute is used if none matches $rel
     *
     * @return string
     */
    private function getUrl(SimpleXMLElement $xml, $rel, $fallback = false)
    {
        $link = $this->findLink($xml, $rel);

        if (! $link && $fallback) {
            $link = $this->findLink($xml, '');
        }

        return $link ? (string) $link['href'] : '';
    }

    /**
     * Get a link tag that match a relationship.
     *
     * @param SimpleXMLElement $xml XML tag
     * @param string           $rel Link relationship: alternate, enclosure, related, self, via
     *
     * @return SimpleXMLElement|null
     */
    private function findLink(SimpleXMLElement $xml, $rel)
    {
        $candidates = $this->queryAtomOrPlain($xml, 'atom:link', 'link');

        foreach ($candidates as $candidate) {
            if ((string) $candidate['rel'] === $rel) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Get the entry content.
     *
     * Child elements of the content tag are serialized back to XML; a plain text
     * content is returned as it is; the summary is used when there is no content.
     *
     * @param SimpleXMLElement $entry XML Entry
     *
     * @return string
     */
    private function getContent(SimpleXMLElement $entry)
    {
        $content = current($this->queryAtomOrPlain($entry, 'atom:content', 'content'));

        if (!empty($content) && count($content->children())) {
            $markup = '';

            foreach ($content->children() as $node) {
                $markup .= $node->asXML();
            }

            return $markup;
        }

        if (trim((string) $content) !== '') {
            return (string) $content;
        }

        $summary = $this->queryAtomOrPlain($entry, 'atom:summary', 'summary');

        return (string) current($summary);
    }

    /**
     * Run a namespaced XPath query and, when it yields nothing, the plain one.
     *
     * @param SimpleXMLElement $node             Element the queries are evaluated on
     * @param string           $namespaced_query Query using the "atom" prefix
     * @param string           $plain_query      Query without namespace prefix
     *
     * @return mixed Result of XmlParser::getXPathResult()
     */
    private function queryAtomOrPlain(SimpleXMLElement $node, $namespaced_query, $plain_query)
    {
        $result = XmlParser::getXPathResult($node, $namespaced_query, $this->namespaces);

        if ($result) {
            return $result;
        }

        return XmlParser::getXPathResult($node, $plain_query);
    }
}

View file

@ -0,0 +1,126 @@
<?php
namespace PicoFeed\Parser;
use DateTime;
use DateTimeZone;
use PicoFeed\Base;
/**
* Date Parser.
*
* @author Frederic Guillot
*/
class DateParser extends Base
{
    /**
     * Timezone used to parse feed dates when the config does not provide one.
     *
     * @access private
     * @var string
     */
    private $timezone = 'UTC';

    /**
     * Supported formats [ 'format' => length ].
     *
     * When the length is not null, the value is truncated to that many
     * characters before it is matched against the format.
     *
     * @var array
     */
    public $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'j M Y H:i:s' => 20,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    /**
     * Try to parse all date format for broken feeds.
     *
     * The formats are tried in declaration order and the first one that parses
     * without error or warning wins. The current date and time is returned
     * when none of them matches.
     *
     * @param string $value Original date format
     *
     * @return DateTime
     */
    public function getDateTime($value)
    {
        $value = trim($value);

        foreach ($this->formats as $format => $length) {
            $truncated_value = $value;

            if ($length !== null) {
                $truncated_value = substr($truncated_value, 0, $length);
            }

            $date = $this->getValidDate($format, $truncated_value);

            if ($date !== false) {
                return $date;
            }
        }

        return $this->getCurrentDateTime();
    }

    /**
     * Get a valid date from a given format.
     *
     * @param string $format Date format
     * @param string $value  Original date value
     *
     * @return DateTime|bool Parsed date, or false when the value does not match the format cleanly
     */
    public function getValidDate($format, $value)
    {
        $date = DateTime::createFromFormat($format, $value, $this->getTimeZone());

        if ($date === false) {
            return false;
        }

        $errors = DateTime::getLastErrors();

        // Since PHP 8.2 getLastErrors() returns false, instead of an array of
        // zero counters, when the last parsing raised no error and no warning.
        if ($errors === false) {
            return $date;
        }

        if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
            return $date;
        }

        return false;
    }

    /**
     * Get the current datetime.
     *
     * @return DateTime
     */
    public function getCurrentDateTime()
    {
        return new DateTime('now', $this->getTimeZone());
    }

    /**
     * Get DateTimeZone instance
     *
     * The timezone of the config is used, or the default of this class when
     * the config returns an empty value.
     *
     * @access public
     * @return DateTimeZone
     */
    public function getTimeZone()
    {
        return new DateTimeZone($this->config->getTimezone() ?: $this->timezone);
    }
}

View file

@ -0,0 +1,314 @@
<?php
namespace PicoFeed\Parser;
/**
* Feed.
*
* @author Frederic Guillot
*/
class Feed
{
    /**
     * Feed items.
     *
     * @var array
     */
    public $items = array();

    /**
     * Feed id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Feed title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Feed description.
     *
     * @var string
     */
    public $description = '';

    /**
     * Feed url.
     *
     * @var string
     */
    public $feedUrl = '';

    /**
     * Site url.
     *
     * @var string
     */
    public $siteUrl = '';

    /**
     * Feed date (null until a date is set).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Feed language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Feed logo URL.
     *
     * @var string
     */
    public $logo = '';

    /**
     * Feed icon URL.
     *
     * @var string
     */
    public $icon = '';

    /**
     * Return feed information.
     *
     * One "Feed::name = value" line per property, followed by the string
     * representation of every item.
     *
     * @return string
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'feedUrl', 'siteUrl', 'language', 'description', 'logo') as $property) {
            $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date is null until setDate() is called: print an empty value
        // instead of calling format() on null.
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Feed::date = '.$date.PHP_EOL;
        $output .= 'Feed::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Feed::items = '.count($this->items).' items'.PHP_EOL;

        foreach ($this->items as $item) {
            $output .= '----'.PHP_EOL;
            $output .= $item;
        }

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get description.
     *
     * @return string
     */
    public function getDescription()
    {
        return $this->description;
    }

    /**
     * Get the logo url.
     *
     * @return string
     */
    public function getLogo()
    {
        return $this->logo;
    }

    /**
     * Get the icon url.
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->icon;
    }

    /**
     * Get feed url.
     *
     * @return string
     */
    public function getFeedUrl()
    {
        return $this->feedUrl;
    }

    /**
     * Get site url.
     *
     * @return string
     */
    public function getSiteUrl()
    {
        return $this->siteUrl;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items.
     *
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }

    /**
     * Return true if the feed is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }

    /**
     * Set feed items.
     *
     * @param Item[] $items
     * @return Feed
     */
    public function setItems(array $items)
    {
        $this->items = $items;
        return $this;
    }

    /**
     * Set feed id.
     *
     * @param string $id
     * @return Feed
     */
    public function setId($id)
    {
        $this->id = $id;
        return $this;
    }

    /**
     * Set feed title.
     *
     * @param string $title
     * @return Feed
     */
    public function setTitle($title)
    {
        $this->title = $title;
        return $this;
    }

    /**
     * Set feed description.
     *
     * @param string $description
     * @return Feed
     */
    public function setDescription($description)
    {
        $this->description = $description;
        return $this;
    }

    /**
     * Set feed url.
     *
     * @param string $feedUrl
     * @return Feed
     */
    public function setFeedUrl($feedUrl)
    {
        $this->feedUrl = $feedUrl;
        return $this;
    }

    /**
     * Set feed website url.
     *
     * @param string $siteUrl
     * @return Feed
     */
    public function setSiteUrl($siteUrl)
    {
        $this->siteUrl = $siteUrl;
        return $this;
    }

    /**
     * Set feed date.
     *
     * @param \DateTime $date
     * @return Feed
     */
    public function setDate($date)
    {
        $this->date = $date;
        return $this;
    }

    /**
     * Set feed language.
     *
     * @param string $language
     * @return Feed
     */
    public function setLanguage($language)
    {
        $this->language = $language;
        return $this;
    }

    /**
     * Set feed logo.
     *
     * @param string $logo
     * @return Feed
     */
    public function setLogo($logo)
    {
        $this->logo = $logo;
        return $this;
    }

    /**
     * Set feed icon.
     *
     * @param string $icon
     * @return Feed
     */
    public function setIcon($icon)
    {
        $this->icon = $icon;
        return $this;
    }
}

View file

@ -0,0 +1,415 @@
<?php
namespace PicoFeed\Parser;
/**
* Feed Item.
*
* @author Frederic Guillot
*/
class Item
{
    /**
     * List of known RTL languages.
     *
     * Not read by any method of this class: isRTL() delegates to
     * Parser::isLanguageRTL(), which carries its own list.
     *
     * @var string[]
     */
    public $rtl = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    /**
     * Item id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Item title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Item url.
     *
     * @var string
     */
    public $url = '';

    /**
     * Item author.
     *
     * @var string
     */
    public $author = '';

    /**
     * Item date (null until a date is set).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Item content.
     *
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url.
     *
     * @var string
     */
    public $enclosureUrl = '';

    /**
     * Item enclosure type.
     *
     * @var string
     */
    public $enclosureType = '';

    /**
     * Item language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Raw XML.
     *
     * @var \SimpleXMLElement
     */
    public $xml;

    /**
     * List of namespaces.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * Check if a XML namespace exists
     *
     * @access public
     * @param  string $namespace Key looked up in the namespaces list
     * @return bool
     */
    public function hasNamespace($namespace)
    {
        return array_key_exists($namespace, $this->namespaces);
    }

    /**
     * Get specific XML tag or attribute value.
     *
     * The tag is searched anywhere below the raw XML of the item.
     *
     * @param string $tag       Tag name (examples: guid, media:content)
     * @param string $attribute Tag attribute
     *
     * @return array|false Tag values or error
     */
    public function getTag($tag, $attribute = '')
    {
        if ($attribute !== '') {
            $attribute = '/@'.$attribute;
        }

        $query = './/'.$tag.$attribute;
        $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);

        if ($elements === false) { // xPath error
            return false;
        }

        return array_map(function ($element) { return (string) $element;}, $elements);
    }

    /**
     * Return item information.
     *
     * @return string
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'url', 'language', 'author', 'enclosureUrl', 'enclosureType') as $property) {
            $output .= 'Item::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date is null until setDate() is called: print an empty value
        // instead of calling format() on null.
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Item::date = '.$date.PHP_EOL;
        $output .= 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL;

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get URL
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL
     *
     * @access public
     * @param  string $url
     * @return Item
     */
    public function setUrl($url)
    {
        $this->url = $url;
        return $this;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content.
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Set content
     *
     * @access public
     * @param  string $value
     * @return Item
     */
    public function setContent($value)
    {
        $this->content = $value;
        return $this;
    }

    /**
     * Get enclosure url.
     *
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosureUrl;
    }

    /**
     * Get enclosure type.
     *
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosureType;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author.
     *
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Return true if the item is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }

    /**
     * Set item id.
     *
     * @param string $id
     * @return Item
     */
    public function setId($id)
    {
        $this->id = $id;
        return $this;
    }

    /**
     * Set item title.
     *
     * @param string $title
     * @return Item
     */
    public function setTitle($title)
    {
        $this->title = $title;
        return $this;
    }

    /**
     * Set author.
     *
     * @param string $author
     * @return Item
     */
    public function setAuthor($author)
    {
        $this->author = $author;
        return $this;
    }

    /**
     * Set item date.
     *
     * @param \DateTime $date
     * @return Item
     */
    public function setDate($date)
    {
        $this->date = $date;
        return $this;
    }

    /**
     * Set enclosure url.
     *
     * @param string $enclosureUrl
     * @return Item
     */
    public function setEnclosureUrl($enclosureUrl)
    {
        $this->enclosureUrl = $enclosureUrl;
        return $this;
    }

    /**
     * Set enclosure type.
     *
     * @param string $enclosureType
     * @return Item
     */
    public function setEnclosureType($enclosureType)
    {
        $this->enclosureType = $enclosureType;
        return $this;
    }

    /**
     * Set item language.
     *
     * @param string $language
     * @return Item
     */
    public function setLanguage($language)
    {
        $this->language = $language;
        return $this;
    }

    /**
     * Set raw XML.
     *
     * @param \SimpleXMLElement $xml
     * @return Item
     */
    public function setXml($xml)
    {
        $this->xml = $xml;
        return $this;
    }

    /**
     * Get raw XML.
     *
     * @return \SimpleXMLElement
     */
    public function getXml()
    {
        return $this->xml;
    }

    /**
     * Set XML namespaces.
     *
     * @param array $namespaces
     * @return Item
     */
    public function setNamespaces($namespaces)
    {
        $this->namespaces = $namespaces;
        return $this;
    }

    /**
     * Get XML namespaces.
     *
     * @return array
     */
    public function getNamespaces()
    {
        return $this->namespaces;
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* MalformedXmlException Exception.
*
* @author Frederic Guillot
*/
class MalformedXmlException extends ParserException
{
// Intentionally empty: the class only gives a dedicated type to the error that
// Parser::execute() throws when the feed still cannot be parsed after the XML
// workarounds were applied.
}

View file

@ -0,0 +1,523 @@
<?php
namespace PicoFeed\Parser;
use PicoFeed\Processor\ContentFilterProcessor;
use PicoFeed\Processor\ContentGeneratorProcessor;
use PicoFeed\Processor\ItemPostProcessor;
use PicoFeed\Processor\ScraperProcessor;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
/**
* Base parser class.
*
* @author Frederic Guillot
*/
abstract class Parser
{
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* DateParser object.
*
* @var \PicoFeed\Parser\DateParser
*/
private $dateParser;
/**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
*
* @var string
*/
private $hash_algo = 'sha256';
/**
* Feed content (XML data).
*
* @var string
*/
protected $content = '';
/**
* Fallback url.
*
* @var string
*/
protected $fallback_url = '';
/**
* XML namespaces supported by parser.
*
* @var array
*/
protected $namespaces = array();
/**
* XML namespaces used in document.
*
* @var array
*/
protected $used_namespaces = array();
/**
* Item Post Processor instance
*
* @access private
* @var ItemPostProcessor
*/
private $itemPostProcessor;
/**
* Constructor.
*
* @param string $content Feed content
* @param string $http_encoding HTTP encoding (headers)
* @param string $fallback_url Fallback url when the feed provide relative or broken url
*/
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->fallback_url = $fallback_url;

    // The declared encoding is read before the XML declaration is stripped,
    // which avoids multiple encoding/decoding in the next XML processing.
    $xml_encoding = XmlParser::getEncodingFromXmlTag($content);
    $stripped_content = Filter::stripXmlTag($content);

    Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');

    // Encode everything in UTF-8; the encoding declared in the document is
    // preferred over the one announced by the HTTP headers.
    $source_encoding = $xml_encoding ?: $http_encoding;
    $this->content = Encoding::convert($stripped_content, $source_encoding);

    // Default post-processing chain: content generators first, content filter second.
    $post_processor = new ItemPostProcessor($this->config);
    $post_processor->register(new ContentGeneratorProcessor($this->config));
    $post_processor->register(new ContentFilterProcessor($this->config));
    $this->itemPostProcessor = $post_processor;
}
/**
* Parse the document.
*
* @return \PicoFeed\Parser\Feed
*/
public function execute()
{
Logger::setMessage(get_called_class().': begin parsing');
// First attempt: parse the content exactly as the constructor prepared it.
$xml = XmlParser::getSimpleXml($this->content);
if ($xml === false) {
// Second attempt on the content rewritten by Filter::normalizeData().
Logger::setMessage(get_called_class().': Applying XML workarounds');
$this->content = Filter::normalizeData($this->content);
$xml = XmlParser::getSimpleXml($this->content);
if ($xml === false) {
// Still not parsable: log the parser errors and give up.
Logger::setMessage(get_called_class().': XML parsing error');
Logger::setMessage(XmlParser::getErrors());
throw new MalformedXmlException('XML parsing error');
}
}
// Namespaces used in the document; they are copied onto every item below.
$this->used_namespaces = $xml->getNamespaces(true);
$xml = $this->registerSupportedNamespaces($xml);
$feed = new Feed();
// Feed metadata. Each URL is checked right after it is found because the
// following steps read it back: checkSiteUrl() falls back to the feed URL.
$this->findFeedUrl($xml, $feed);
$this->checkFeedUrl($feed);
$this->findSiteUrl($xml, $feed);
$this->checkSiteUrl($feed);
$this->findFeedTitle($xml, $feed);
$this->findFeedDescription($xml, $feed);
$this->findFeedLanguage($xml, $feed);
$this->findFeedId($xml, $feed);
$this->findFeedDate($xml, $feed);
$this->findFeedLogo($xml, $feed);
$this->findFeedIcon($xml, $feed);
foreach ($this->getItemsTree($xml) as $entry) {
$entry = $this->registerSupportedNamespaces($entry);
$item = new Item();
$item->xml = $entry;
$item->namespaces = $this->used_namespaces;
$this->findItemAuthor($xml, $entry, $item);
// The URL is found and resolved against the site URL before the other finders run.
$this->findItemUrl($entry, $item);
$this->checkItemUrl($feed, $item);
$this->findItemTitle($entry, $item);
$this->findItemContent($entry, $item);
// Id generation can use the item url/title/content (order is important)
$this->findItemId($entry, $item, $feed);
$this->findItemDate($entry, $item, $feed);
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
// Registered processors run once the item is fully populated.
$this->itemPostProcessor->execute($feed, $item);
$feed->items[] = $item;
}
// Concatenating $feed converts it to a string, which invokes Feed::__toString().
Logger::setMessage(get_called_class().PHP_EOL.$feed);
return $feed;
}
/**
* Check if the feed url is correct.
*
* @param Feed $feed Feed object
*/
public function checkFeedUrl(Feed $feed)
{
    $declared_url = $feed->getFeedUrl();

    // No URL declared in the feed: use the fallback URL as it is; otherwise
    // resolve the declared URL against the fallback URL.
    $feed->feedUrl = $declared_url === ''
        ? $this->fallback_url
        : Url::resolve($declared_url, $this->fallback_url);
}
/**
* Check if the site url is correct.
*
* @param Feed $feed Feed object
*/
public function checkSiteUrl(Feed $feed)
{
    $declared_url = $feed->getSiteUrl();

    // No site URL declared: derive one from the feed URL; otherwise resolve
    // the declared URL against the fallback URL.
    $feed->siteUrl = $declared_url === ''
        ? Url::base($feed->getFeedUrl())
        : Url::resolve($declared_url, $this->fallback_url);
}
/**
* Check if the item url is correct.
*
* @param Feed $feed Feed object
* @param Item $item Item object
*/
public function checkItemUrl(Feed $feed, Item $item)
{
    // The item URL is resolved against the site URL of the feed.
    $item_url = $item->getUrl();
    $site_url = $feed->getSiteUrl();

    $item->url = Url::resolve($item_url, $site_url);
}
/**
* Get Item Post Processor instance
*
* @access public
* @return ItemPostProcessor
*/
public function getItemPostProcessor()
{
// Instance created by the constructor, where the content generator and
// content filter processors are registered on it.
return $this->itemPostProcessor;
}
/**
* Get DateParser instance
*
* @access public
* @return DateParser
*/
public function getDateParser()
{
    // Create the parser on first use and keep it: the property was never
    // assigned before, so a new DateParser was built on every single call.
    // Note: the instance keeps the config that is current on the first call.
    if ($this->dateParser === null) {
        $this->dateParser = new DateParser($this->config);
    }

    return $this->dateParser;
}
/**
* Generate a unique id for an entry (hash all arguments).
*
* @return string
*/
public function generateId()
{
    // Every argument given by the caller is concatenated without separator
    // and hashed with the configured algorithm.
    $parts = func_get_args();
    $payload = implode('', $parts);

    return hash($this->hash_algo, $payload);
}
/**
* Return true if the given language is "Right to Left".
*
* @static
*
* @param string $language Language: fr-FR, en-US
*
* @return bool
*/
public static function isLanguageRTL($language)
{
    $rtl_prefixes = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    $normalized = strtolower($language);

    // The language is RTL as soon as it starts with one of the known codes.
    foreach ($rtl_prefixes as $code) {
        if (strncmp($normalized, $code, strlen($code)) === 0) {
            return true;
        }
    }

    return false;
}
/**
* Set Hash algorithm used for id generation.
*
* @param string $algo Algorithm name
* @return \PicoFeed\Parser\Parser
*/
public function setHashAlgo($algo)
{
    // An empty value keeps the algorithm currently configured.
    if ($algo) {
        $this->hash_algo = $algo;
    }

    return $this;
}
/**
* Set config object.
*
* @param \PicoFeed\Config\Config $config Config instance
*
* @return \PicoFeed\Parser\Parser
*/
public function setConfig($config)
{
// NOTE(review): only the reference held by the parser is replaced here. The
// processors created in the constructor were given the config value that was
// current at that time, and nothing in this method hands them the new one -
// confirm whether the item post processor has to be updated as well.
$this->config = $config;
return $this;
}
/**
* Disable the content filtering.
*
* @return \PicoFeed\Parser\Parser
*/
public function disableContentFiltering()
{
    // The filter registered by the constructor is removed from the processing chain.
    $filter_processor = 'PicoFeed\Processor\ContentFilterProcessor';
    $this->itemPostProcessor->unregister($filter_processor);

    return $this;
}
/**
* Enable the content grabber.
*
* @param bool $needsRuleFile true if only pages with rule files should be
* scraped
* @param null|\Closure $scraperCallback Callback function that gets called for each
* scraper execution
*
* @return \PicoFeed\Parser\Parser
*/
public function enableContentGrabber($needsRuleFile = false, $scraperCallback = null)
{
    $scraper_processor = new ScraperProcessor($this->config);

    // Restricting the scraper to pages with rule files is done by switching
    // off its candidate parser.
    if ($needsRuleFile) {
        $scraper_processor->getScraper()->disableCandidateParser();
    }

    if (null !== $scraperCallback) {
        $scraper_processor->setExecutionCallback($scraperCallback);
    }

    $this->itemPostProcessor->register($scraper_processor);

    return $this;
}
/**
* Set ignored URLs for the content grabber.
*
* @param array $urls URLs
*
* @return \PicoFeed\Parser\Parser
*/
public function setGrabberIgnoreUrls(array $urls)
{
    // NOTE(review): presumably requires enableContentGrabber() to have registered
    // the scraper processor beforehand - confirm what getProcessor() returns otherwise.
    $scraper_processor = $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor');
    $scraper_processor->ignoreUrls($urls);

    return $this;
}
/**
 * Register every supported namespace prefix on the document so that the
 * prefixes can be used within an xpath query.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement The same element that was passed in
 */
public function registerSupportedNamespaces(SimpleXMLElement $xml)
{
    $prefixes = array_keys($this->namespaces);

    foreach ($prefixes as $prefix) {
        $uri = $this->namespaces[$prefix];
        $xml->registerXPathNamespace($prefix, $uri);
    }

    return $xml;
}
/**
* Find the feed url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the site url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed title.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed description.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed language.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed id.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedId(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed date.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedDate(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed logo url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);
/**
* Find the feed icon.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);
/**
* Get the path to the items XML tree.
*
* @param SimpleXMLElement $xml Feed xml
*
* @return SimpleXMLElement
*/
abstract public function getItemsTree(SimpleXMLElement $xml);
/**
* Find the item author.
*
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);
/**
* Find the item URL.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemUrl(SimpleXMLElement $entry, Item $item);
/**
* Find the item title.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemTitle(SimpleXMLElement $entry, Item $item);
/**
* Generate the item id.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item content.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
abstract public function findItemContent(SimpleXMLElement $entry, Item $item);
/**
* Find the item enclosure.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);
/**
* Find the item language.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
abstract public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Parser;
use PicoFeed\PicoFeedException;
/**
* ParserException Exception.
*
* Abstract parser-level exception type; it adds no behaviour of its own to
* PicoFeedException.
*
* @author Frederic Guillot
*/
abstract class ParserException extends PicoFeedException
{
}

View file

@ -0,0 +1,277 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
/**
 * RSS 1.0 parser.
 *
 * Most lookups first try the namespaced RSS 1.0 element, then the
 * un-prefixed element, and finally plain SimpleXML property access.
 *
 * @author Frederic Guillot
 */
class Rss10 extends Parser
{
    /**
     * Supported namespaces (prefix => namespace URI).
     */
    protected $namespaces = array(
        'rss' => 'http://purl.org/rss/1.0/',
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
    );

    /**
     * Get the path to the items XML tree.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'rss:item', $this->namespaces);

        if (!$items) {
            $items = XmlParser::getXPathResult($xml, 'item');
        }

        if (!$items) {
            $items = $xml->item;
        }

        return $items;
    }

    /**
     * Find the feed url (always set to an empty string for this format).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->setFeedUrl('');
    }

    /**
     * Find the site url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $link = XmlParser::getXPathResult($xml, 'rss:channel/rss:link', $this->namespaces);

        if (!$link) {
            $link = XmlParser::getXPathResult($xml, 'channel/link');
        }

        if (!$link) {
            $link = $xml->channel->link;
        }

        $feed->setSiteUrl(XmlParser::getValue($link));
    }

    /**
     * Find the feed description.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'rss:channel/rss:description', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/description');
        }

        if (!$nodes) {
            $nodes = $xml->channel->description;
        }

        $feed->setDescription(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'rss:image/rss:url', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'image/url');
        }

        $feed->setLogo(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed icon (always set to an empty string for this format).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->setIcon('');
    }

    /**
     * Find the feed title; the site url is used when the title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'rss:channel/rss:title', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/title');
        }

        if (!$nodes) {
            $nodes = $xml->channel->title;
        }

        $text = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (!$text) {
            $text = $feed->getSiteUrl();
        }

        $feed->setTitle($text);
    }

    /**
     * Find the feed language.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:language', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/dc:language', $this->namespaces);
        }

        $feed->setLanguage(XmlParser::getValue($nodes));
    }

    /**
     * Find the feed id: the feed url, or the site url when the former is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $id = $feed->getFeedUrl();

        if (!$id) {
            $id = $feed->getSiteUrl();
        }

        $feed->setId($id);
    }

    /**
     * Find the feed date.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
        }

        $feed->setDate($this->getDateParser()->getDateTime(XmlParser::getValue($nodes)));
    }

    /**
     * Find the item date; the feed date is used when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);

        if (empty($nodes)) {
            $item->setDate($feed->getDate());
        } else {
            $item->setDate($this->getDateParser()->getDateTime(XmlParser::getValue($nodes)));
        }
    }

    /**
     * Find the item title; the item url is used when the title is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'rss:title', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'title');
        }

        if (!$nodes) {
            $nodes = $entry->title;
        }

        $text = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (!$text) {
            $text = $item->getUrl();
        }

        $item->setTitle($text);
    }

    /**
     * Find the item author, falling back to the channel-level creator.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:creator', $this->namespaces);
        }

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }

        $item->setAuthor(XmlParser::getValue($nodes));
    }

    /**
     * Find the item content: content:encoded first, description otherwise.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);

        if (XmlParser::getValue($nodes) === '') {
            $nodes = XmlParser::getXPathResult($entry, 'rss:description', $this->namespaces);

            if (!$nodes) {
                $nodes = XmlParser::getXPathResult($entry, 'description');
            }

            if (!$nodes) {
                $nodes = $entry->description;
            }
        }

        $item->setContent(XmlParser::getValue($nodes));
    }

    /**
     * Find the item URL (the FeedBurner original link takes precedence).
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'rss:link', $this->namespaces);
        }

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'link');
        }

        if (!$nodes) {
            $nodes = $entry->link;
        }

        $item->setUrl(XmlParser::getValue($nodes));
    }

    /**
     * Generate the item id from the item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $id = $this->generateId($item->getTitle(), $item->getUrl(), $item->getContent());

        $item->setId($id);
    }

    /**
     * Find the item enclosure (nothing is extracted for this format).
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
    }

    /**
     * Find the item language; the feed language is used when it is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $language = XmlParser::getValue($nodes);

        if (!$language) {
            $language = $feed->getLanguage();
        }

        $item->setLanguage($language);
    }
}

View file

@ -0,0 +1,289 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
 * RSS 2.0 Parser.
 *
 * @author Frederic Guillot
 */
class Rss20 extends Parser
{
    /**
     * Supported namespaces (prefix => namespace URI).
     */
    protected $namespaces = array(
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Get the path to the items XML tree.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'channel/item');

        return $items;
    }

    /**
     * Find the feed url (always set to an empty string for this format).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->setFeedUrl('');
    }

    /**
     * Find the site url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/link');
        $siteUrl = XmlParser::getValue($nodes);

        $feed->setSiteUrl($siteUrl);
    }

    /**
     * Find the feed description.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/description');
        $description = XmlParser::getValue($nodes);

        $feed->setDescription($description);
    }

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/image/url');
        $logo = XmlParser::getValue($nodes);

        $feed->setLogo($logo);
    }

    /**
     * Find the feed icon (always set to an empty string for this format).
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        $feed->setIcon('');
    }

    /**
     * Find the feed title; the site url is used when the title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/title');
        $text = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (!$text) {
            $text = $feed->getSiteUrl();
        }

        $feed->setTitle($text);
    }

    /**
     * Find the feed language.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/language');
        $language = XmlParser::getValue($nodes);

        $feed->setLanguage($language);
    }

    /**
     * Find the feed id: the feed url, or the site url when the former is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $id = $feed->getFeedUrl();

        if (!$id) {
            $id = $feed->getSiteUrl();
        }

        $feed->setId($id);
    }

    /**
     * Find the feed date from channel/pubDate and channel/lastBuildDate.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $pubNodes = XmlParser::getXPathResult($xml, 'channel/pubDate');
        $buildNodes = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');

        $published = null;
        if (!empty($pubNodes)) {
            $published = $this->getDateParser()->getDateTime(XmlParser::getValue($pubNodes));
        }

        $updated = null;
        if (!empty($buildNodes)) {
            $updated = $this->getDateParser()->getDateTime(XmlParser::getValue($buildNodes));
        }

        if ($published !== null && $updated !== null) {
            // Both dates are known: keep the most recent one
            $feed->setDate(max($published, $updated));
        } elseif ($published === null && $updated === null) {
            // No date at all in the feed: use the current date
            $feed->setDate($this->getDateParser()->getCurrentDateTime());
        } elseif ($updated) {
            $feed->setDate($updated);
        } else {
            $feed->setDate($published);
        }
    }

    /**
     * Find the item date; the feed date is used when the item has none.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'pubDate');

        if (empty($nodes)) {
            $item->setDate($feed->getDate());
        } else {
            $item->setDate($this->getDateParser()->getDateTime(XmlParser::getValue($nodes)));
        }
    }

    /**
     * Find the item title; the item url is used when the title is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'title');
        $text = Filter::stripWhiteSpace(XmlParser::getValue($nodes));

        if (!$text) {
            $text = $item->getUrl();
        }

        $item->setTitle($text);
    }

    /**
     * Find the item author, falling back to channel-level creator/editor.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'author');
        }

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/managingEditor');
        }

        $item->setAuthor(XmlParser::getValue($nodes));
    }

    /**
     * Find the item content: content:encoded first, description otherwise.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);

        if ('' === XmlParser::getValue($nodes)) {
            $nodes = XmlParser::getXPathResult($entry, 'description');
        }

        $item->setContent(XmlParser::getValue($nodes));
    }

    /**
     * Find the item URL.
     *
     * Tries feedburner:origLink, link and atom:link/@href in that order; when
     * none matches, the guid is used if it is a valid URL.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'link');
        }

        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'atom:link/@href', $this->namespaces);
        }

        if (!empty($nodes)) {
            $item->setUrl(XmlParser::getValue($nodes));

            return;
        }

        $guid = XmlParser::getValue(XmlParser::getXPathResult($entry, 'guid'));

        if (filter_var($guid, FILTER_VALIDATE_URL) !== false) {
            $item->setUrl($guid);
        }
    }

    /**
     * Generate the item id.
     *
     * The guid is hashed when present; otherwise the id is derived from the
     * item title, url and content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $guid = XmlParser::getValue(XmlParser::getXPathResult($entry, 'guid'));

        if (!$guid) {
            $item->setId($this->generateId($item->getTitle(), $item->getUrl(), $item->getContent()));

            return;
        }

        $item->setId($this->generateId($guid));
    }

    /**
     * Find the item enclosure.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        if (!isset($entry->enclosure)) {
            return;
        }

        $typeNodes = XmlParser::getXPathResult($entry, 'enclosure/@type');
        $urlNodes = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);

        if (!$urlNodes) {
            $urlNodes = XmlParser::getXPathResult($entry, 'enclosure/@url');
        }

        // The enclosure url is resolved against the site url
        $item->setEnclosureUrl(Url::resolve(XmlParser::getValue($urlNodes), $feed->getSiteUrl()));
        $item->setEnclosureType(XmlParser::getValue($typeNodes));
    }

    /**
     * Find the item language; the feed language is used when it is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $language = XmlParser::getValue($nodes);

        if (!$language) {
            $language = $feed->getLanguage();
        }

        $item->setLanguage($language);
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.91 Parser.
*
* Inherits the Rss20 implementation unchanged.
*
* @author Frederic Guillot
*/
class Rss91 extends Rss20
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.92 Parser.
*
* Inherits the Rss20 implementation unchanged.
*
* @author Frederic Guillot
*/
class Rss92 extends Rss20
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* XmlEntityException Exception.
*
* Specialisation of MalformedXmlException with no behaviour of its own; it
* only provides a distinct type to catch.
*
* @author Bernhard Posselt
*/
class XmlEntityException extends MalformedXmlException
{
}

View file

@ -0,0 +1,236 @@
<?php
namespace PicoFeed\Parser;
use DomDocument;
use SimpleXmlElement;
use ZendXml\Security;
/**
 * XML parser class.
 *
 * Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
 *
 * @author Frederic Guillot
 */
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * @static
     * @param  string $input XML content
     * @return mixed
     */
    public static function getSimpleXml($input)
    {
        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * @static
     * @param  string $input XML content
     * @return \DOMDocument|false
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into picoFeed
     * exceptions
     *
     * @param  string            $input the xml to load
     * @param  \DOMDocument|null $dom   pass in a dom document or use null/omit if simpleXml should
     *                                  be used
     * @return mixed whatever ZendXml's Security::scan() returns
     * @throws XmlEntityException when ZendXml raises a RuntimeException
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch(\ZendXml\Exception\RuntimeException $e) {
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load HTML document by using a DomDocument instance.
     *
     * An empty DomDocument is returned for empty input. libxml errors are
     * collected internally (see getErrors()) instead of being emitted.
     *
     * @static
     * @param  string $input HTML content
     * @return \DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DomDocument();

        if (empty($input)) {
            return $dom;
        }

        libxml_use_internal_errors(true);

        // The options argument of loadHTML() only exists since PHP 5.4
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * @static
     *
     * @param string $html HTML document
     *
     * @return string
     */
    public static function htmlToXml($html)
    {
        // The XML prolog is prepended so that the markup is read as UTF-8
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * @static
     * @return string Comma separated list of the collected libxml errors
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * Returns an empty string when the data has no XML declaration, when the
     * declaration is not terminated by "?>", or when it carries no quoted
     * encoding attribute. Every string offset is validated before use, so a
     * truncated declaration can no longer trigger an out-of-range strpos()
     * offset (a ValueError on PHP 8, a warning before).
     *
     * @static
     * @param  string $data Input data
     * @return string Lowercase encoding name, or an empty string
     */
    public static function getEncodingFromXmlTag($data)
    {
        if (strpos($data, '<?xml') === false) {
            return '';
        }

        $end = strrpos($data, '?>');

        if ($end === false) {
            return '';
        }

        // Normalise single quotes so that only one quote style has to be handled
        $declaration = str_replace("'", '"', substr($data, 0, $end));
        $marker = strpos($declaration, 'encoding=');

        if ($marker === false) {
            return '';
        }

        // Skip `encoding=` (9 characters) and the opening quote
        $start = $marker + 10;

        if ($start > strlen($declaration)) {
            return '';
        }

        $close = strpos($declaration, '"', $start);

        if ($close === false) {
            return '';
        }

        return strtolower(substr($declaration, $start, $close - $start));
    }

    /**
     * Get the charset from a meta tag.
     *
     * @static
     * @param  string $data Input data
     * @return string Lowercase charset name, or an empty string
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * @param  string $query XPath query
     * @param  array  $ns    Prefix to namespace URI mapping
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        return preg_replace_callback('/([A-Z0-9]+):([A-Z0-9]+)/iu', function ($matches) use ($ns) {
            // don't try to map the special prefix XML
            if (strtolower($matches[1]) === 'xml') {
                return $matches[0];
            }

            return '*[namespace-uri()="'.$ns[$matches[1]].'" and local-name()="'.$matches[2].'"]';
        },
        $query);
    }

    /**
     * Get the result elements of a XPath query.
     *
     * @param  \SimpleXMLElement $xml   XML element
     * @param  string            $query XPath query
     * @param  array             $ns    Prefix to namespace URI mapping
     * @return \SimpleXMLElement[]
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        return $xml->xpath($query);
    }

    /**
     * Get the first Xpath result or SimpleXMLElement value
     *
     * The value is trimmed whether it comes from an array of results or from
     * a single element; anything else yields an empty string.
     *
     * @static
     * @access public
     * @param  mixed $value
     * @return string
     */
    public static function getValue($value)
    {
        $result = '';

        if (is_array($value) && count($value) > 0) {
            $result = (string) $value[0];
        } elseif (is_a($value, 'SimpleXMLElement')) {
            $result = (string) $value;
        }

        return trim($result);
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed;
use Exception;
/**
* PicoFeedException Exception.
*
* Abstract exception type that adds no behaviour of its own to the built-in
* Exception class.
*
* @author Frederic Guillot
*/
abstract class PicoFeedException extends Exception
{
}

View file

@ -0,0 +1,37 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Content Filter
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentFilterProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Execute Item Processor
     *
     * Filters the item content when content filtering is enabled in the
     * config; otherwise only logs that filtering is disabled.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool Always false
     */
    public function execute(Feed $feed, Item $item)
    {
        if ($this->config->getContentFiltering(true)) {
            $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
            $filter->setConfig($this->config);
            $item->setContent($filter->execute());
        } else {
            Logger::setMessage(get_called_class().': Content filtering disabled');
        }

        // Return an explicit boolean as ItemProcessorInterface documents
        // (the method used to fall off the end and return null)
        return false;
    }
}

View file

@ -0,0 +1,49 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Content Generator
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ContentGeneratorProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Generator names, tried in this order
     *
     * @access protected
     * @var array
     */
    protected $generators = array(
        'youtube',
        'file',
    );

    /**
     * Execute Item Processor
     *
     * Instantiates each generator in turn and stops at the first one whose
     * execute() call reports success.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool true when a generator handled the item, false otherwise
     */
    public function execute(Feed $feed, Item $item)
    {
        $generated = false;

        foreach ($this->generators as $name) {
            $class = '\PicoFeed\Generator\\'.ucfirst($name).'ContentGenerator';
            $generator = new $class($this->config);

            if ($generator->execute($item)) {
                $generated = true;
                break;
            }
        }

        return $generated;
    }
}

View file

@ -0,0 +1,96 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
 * Item Post Processor
 *
 * Holds a set of item processors indexed by their class name.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ItemPostProcessor extends Base
{
    /**
     * Registered processors (class name => instance)
     *
     * @access private
     * @var array
     */
    private $processors = array();

    /**
     * Execute all processors
     *
     * Processors run in registration order; the run stops as soon as one of
     * them returns a truthy value.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool true when a processor stopped the run, false otherwise
     */
    public function execute(Feed $feed, Item $item)
    {
        $stopped = false;

        foreach ($this->processors as $current) {
            if ($current->execute($feed, $item)) {
                $stopped = true;
                break;
            }
        }

        return $stopped;
    }

    /**
     * Register a new Item post-processor
     *
     * A processor of the same class that is already registered is replaced.
     *
     * @access public
     * @param  ItemProcessorInterface $processor
     * @return ItemPostProcessor
     */
    public function register(ItemProcessorInterface $processor)
    {
        $class = get_class($processor);
        $this->processors[$class] = $processor;

        return $this;
    }

    /**
     * Remove Processor instance
     *
     * @access public
     * @param  string $class
     * @return ItemPostProcessor
     */
    public function unregister($class)
    {
        // unset() is a no-op for a key that is not present
        unset($this->processors[$class]);

        return $this;
    }

    /**
     * Checks whether a specific processor is registered or not
     *
     * @access public
     * @param  string $class
     * @return bool
     */
    public function hasProcessor($class)
    {
        $registered = isset($this->processors[$class]);

        return $registered;
    }

    /**
     * Get Processor instance
     *
     * @access public
     * @param  string $class
     * @return ItemProcessorInterface|null
     */
    public function getProcessor($class)
    {
        if (isset($this->processors[$class])) {
            return $this->processors[$class];
        }

        return null;
    }
}

View file

@ -0,0 +1,25 @@
<?php
namespace PicoFeed\Processor;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
/**
* Item Processor Interface
*
* Contract for classes that can be registered as item processors.
*
* @package PicoFeed\Processor
* @author Frederic Guillot
*/
interface ItemProcessorInterface
{
/**
* Execute Item Processor
*
* @access public
* @param Feed $feed Feed the item belongs to
* @param Item $item Item to process
* @return bool
*/
public function execute(Feed $feed, Item $item);
}

View file

@ -0,0 +1,96 @@
<?php
namespace PicoFeed\Processor;
use Closure;
use PicoFeed\Base;
use PicoFeed\Parser\Feed;
use PicoFeed\Parser\Item;
use PicoFeed\Scraper\Scraper;
/**
 * Scraper Processor
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ScraperProcessor extends Base implements ItemProcessorInterface
{
    /**
     * Item URLs that are never scraped
     *
     * @var array
     */
    private $ignoredUrls = array();

    /**
     * Scraper instance, created on first use by getScraper()
     *
     * @var Scraper
     */
    private $scraper;

    /**
     * Callback function for each scraper execution
     *
     * @var Closure
     */
    private $executionCallback;

    /**
     * Add a new execution callback
     *
     * @access public
     * @param  Closure $executionCallback
     * @return $this
     */
    public function setExecutionCallback(Closure $executionCallback)
    {
        $this->executionCallback = $executionCallback;

        return $this;
    }

    /**
     * Execute Item Processor
     *
     * Scrapes the item URL unless it is on the ignore list, calls the
     * execution callback when one is set, and replaces the item content when
     * the scraper reports relevant content.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool Always false
     */
    public function execute(Feed $feed, Item $item)
    {
        if (in_array($item->getUrl(), $this->ignoredUrls)) {
            return false;
        }

        $pageScraper = $this->getScraper();
        $pageScraper->setUrl($item->getUrl());
        $pageScraper->execute();

        $hasCallback = $this->executionCallback && is_callable($this->executionCallback);

        if ($hasCallback) {
            call_user_func($this->executionCallback, $feed, $item, $pageScraper);
        }

        if ($pageScraper->hasRelevantContent()) {
            $item->setContent($pageScraper->getFilteredContent());
        }

        return false;
    }

    /**
     * Ignore list of URLs
     *
     * @access public
     * @param  array $urls
     * @return $this
     */
    public function ignoreUrls(array $urls)
    {
        $this->ignoredUrls = $urls;

        return $this;
    }

    /**
     * Returns Scraper instance
     *
     * @access public
     * @return Scraper
     */
    public function getScraper()
    {
        if (null === $this->scraper) {
            $this->scraper = new Scraper($this->config);
        }

        return $this->scraper;
    }
}

View file

@ -0,0 +1,190 @@
<?php
namespace PicoFeed\Reader;
use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
 * Favicon class.
 *
 * https://en.wikipedia.org/wiki/Favicon
 *
 * @author Frederic Guillot
 */
class Favicon extends Base
{
    /**
     * Valid types for favicon (supported by browsers).
     *
     * @var array
     */
    private $types = array(
        'image/png',
        'image/gif',
        'image/x-icon',
        'image/jpeg',
        'image/jpg',
        'image/svg+xml'
    );

    /**
     * Icon binary content.
     *
     * @var string
     */
    private $content = '';

    /**
     * Icon content type.
     *
     * @var string
     */
    private $content_type = '';

    /**
     * Get the icon file content (available only after the download).
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get the icon file type (available only after the download).
     *
     * The downloaded content type is matched by prefix against the supported
     * types; "image/x-icon" is returned when nothing matches.
     *
     * @return string
     */
    public function getType()
    {
        foreach ($this->types as $type) {
            // MIME types are case-insensitive, so "Image/PNG" must match too
            if (stripos($this->content_type, $type) === 0) {
                return $type;
            }
        }

        return 'image/x-icon';
    }

    /**
     * Get data URI (http://en.wikipedia.org/wiki/Data_URI_scheme).
     *
     * @return string Empty string when no icon content is available
     */
    public function getDataUri()
    {
        if (empty($this->content)) {
            return '';
        }

        return sprintf(
            'data:%s;base64,%s',
            $this->getType(),
            base64_encode($this->content)
        );
    }

    /**
     * Download and check if a resource exists.
     *
     * A ClientException raised by the download is logged, not propagated.
     *
     * @param string $url URL
     *
     * @return \PicoFeed\Client\Client Client instance
     */
    public function download($url)
    {
        $client = Client::getInstance();
        $client->setConfig($this->config);

        Logger::setMessage(get_called_class().' Download => '.$url);

        try {
            $client->execute($url);
        } catch (ClientException $e) {
            Logger::setMessage(get_called_class().' Download Failed => '.$e->getMessage());
        }

        return $client;
    }

    /**
     * Check if a remote file exists.
     *
     * @param string $url URL
     *
     * @return bool
     */
    public function exists($url)
    {
        return $this->download($url)->getContent() !== '';
    }

    /**
     * Get the icon link for a website.
     *
     * Without an explicit favicon link, the candidates are the icon links
     * found in the site's base page followed by "/favicon.ico". When an
     * explicit favicon link yields no content, the lookup is restarted
     * without it.
     *
     * @param string $website_link URL
     * @param string $favicon_link optional URL
     *
     * @return string Link of the first icon with content, or an empty string
     */
    public function find($website_link, $favicon_link = '')
    {
        $website = new Url($website_link);

        if ($favicon_link !== '') {
            $icons = array($favicon_link);
        } else {
            $icons = $this->extract($this->download($website->getBaseUrl('/'))->getContent());
            $icons[] = $website->getBaseUrl('/favicon.ico');
        }

        foreach ($icons as $icon_link) {
            $icon_link = Url::resolve($icon_link, $website);
            $resource = $this->download($icon_link);
            $this->content = $resource->getContent();
            $this->content_type = $resource->getContentType();

            if ($this->content !== '') {
                return $icon_link;
            } elseif ($favicon_link !== '') {
                return $this->find($website_link);
            }
        }

        return '';
    }

    /**
     * Extract the icon links from the HTML.
     *
     * Link elements without a usable href are skipped: an empty value cannot
     * identify an icon and would otherwise be resolved and downloaded by
     * find() like any other candidate.
     *
     * @param string $html HTML
     *
     * @return array
     */
    public function extract($html)
    {
        $icons = array();

        if (empty($html)) {
            return $icons;
        }

        $dom = XmlParser::getHtmlDocument($html);
        $xpath = new DOMXpath($dom);
        $elements = $xpath->query('//link[@rel="icon" or @rel="shortcut icon" or @rel="icon shortcut"]');

        for ($i = 0; $i < $elements->length; ++$i) {
            $href = trim($elements->item($i)->getAttribute('href'));

            if ($href !== '') {
                $icons[] = $href;
            }
        }

        return $icons;
    }
}

View file

@ -0,0 +1,190 @@
<?php
namespace PicoFeed\Reader;
use DOMXPath;
use PicoFeed\Base;
use PicoFeed\Client\Client;
use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
 * Reader class.
 *
 * Downloads feeds, discovers feed links inside HTML documents, detects the
 * feed format and instantiates the matching parser.
 *
 * @author Frederic Guillot
 */
class Reader extends Base
{
    /**
     * Feed formats for detection.
     *
     * Maps the parser class name to the XPath query that identifies the
     * format; entries are tested in this order by detectFormat().
     *
     * @var array
     */
    private $formats = array(
        'Atom' => '//feed',
        'Rss20' => '//rss[@version="2.0"]',
        'Rss92' => '//rss[@version="0.92"]',
        'Rss91' => '//rss[@version="0.91"]',
        'Rss10' => '//rdf',
    );

    /**
     * Download a feed (no discovery).
     *
     * @param string $url           Feed url
     * @param string $last_modified Last modified HTTP header
     * @param string $etag          Etag HTTP header
     * @param string $username      HTTP basic auth username
     * @param string $password      HTTP basic auth password
     *
     * @return \PicoFeed\Client\Client
     */
    public function download($url, $last_modified = '', $etag = '', $username = '', $password = '')
    {
        $url = $this->prependScheme($url);

        return Client::getInstance()
            ->setConfig($this->config)
            ->setLastModified($last_modified)
            ->setEtag($etag)
            ->setUsername($username)
            ->setPassword($password)
            ->execute($url);
    }

    /**
     * Discover and download a feed.
     *
     * The url is downloaded first. If the response is unmodified or already
     * is a feed, it is returned as is; otherwise the document is searched
     * for feed links and the first one is downloaded.
     *
     * @param string $url           Feed or website url
     * @param string $last_modified Last modified HTTP header
     * @param string $etag          Etag HTTP header
     * @param string $username      HTTP basic auth username
     * @param string $password      HTTP basic auth password
     *
     * @return \PicoFeed\Client\Client
     *
     * @throws SubscriptionNotFoundException When the document contains no feed link
     */
    public function discover($url, $last_modified = '', $etag = '', $username = '', $password = '')
    {
        $client = $this->download($url, $last_modified, $etag, $username, $password);

        // It's already a feed or the feed was not modified
        if (!$client->isModified() || $this->detectFormat($client->getContent())) {
            return $client;
        }

        // Try to find a subscription
        $links = $this->find($client->getUrl(), $client->getContent());

        if (empty($links)) {
            throw new SubscriptionNotFoundException('Unable to find a subscription');
        }

        return $this->download($links[0], $last_modified, $etag, $username, $password);
    }

    /**
     * Find feed urls inside a HTML document.
     *
     * RSS links are listed before Atom links; relative links are made
     * absolute against the base url of the website.
     *
     * @param string $url  Website url
     * @param string $html HTML content
     *
     * @return array List of feed links
     */
    public function find($url, $html)
    {
        Logger::setMessage(get_called_class().': Try to discover subscriptions');

        $links = array();

        // Same guard as Favicon::extract(): an empty document has no links,
        // so the HTML parser is not invoked for it.
        if (!empty($html)) {
            $dom = XmlParser::getHtmlDocument($html);
            $xpath = new DOMXPath($dom);

            // The website url is the same for every link: build it once.
            $siteUrl = new Url($url);

            $queries = array(
                '//link[@type="application/rss+xml"]',
                '//link[@type="application/atom+xml"]',
            );

            foreach ($queries as $query) {
                $nodes = $xpath->query($query);

                foreach ($nodes as $node) {
                    $link = $node->getAttribute('href');

                    if (!empty($link)) {
                        $feedUrl = new Url($link);
                        $links[] = $feedUrl->getAbsoluteUrl($feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '');
                    }
                }
            }
        }

        Logger::setMessage(get_called_class().': '.implode(', ', $links));

        return $links;
    }

    /**
     * Get a parser instance.
     *
     * The parser class is \PicoFeed\Parser\<format>, where <format> is the
     * name returned by detectFormat().
     *
     * @param string $url      Site url
     * @param string $content  Feed content
     * @param string $encoding HTTP encoding
     *
     * @return \PicoFeed\Parser\Parser
     *
     * @throws UnsupportedFeedFormatException When the format cannot be detected
     */
    public function getParser($url, $content, $encoding)
    {
        $format = $this->detectFormat($content);

        if (empty($format)) {
            throw new UnsupportedFeedFormatException('Unable to detect feed format');
        }

        $className = '\PicoFeed\Parser\\'.$format;

        $parser = new $className($content, $encoding, $url);
        $parser->setHashAlgo($this->config->getParserHashAlgo());
        $parser->setConfig($this->config);

        return $parser;
    }

    /**
     * Detect the feed format.
     *
     * A format matches when its query selects exactly one node.
     *
     * @param string $content Feed content
     *
     * @return string Parser name (a key of $formats), or '' when nothing matches
     */
    public function detectFormat($content)
    {
        // Nothing to detect in an empty document (same guard as Favicon::extract()).
        if (empty($content)) {
            return '';
        }

        $dom = XmlParser::getHtmlDocument($content);
        $xpath = new DOMXPath($dom);

        foreach ($this->formats as $parser_name => $query) {
            $nodes = $xpath->query($query);

            if ($nodes->length === 1) {
                return $parser_name;
            }
        }

        return '';
    }

    /**
     * Add the prefix "http://" if the end-user just enter a domain name.
     *
     * URI schemes are case-insensitive, so "HTTP://" and "Https://" are
     * recognised as already having a scheme.
     *
     * @param string $url Url
     *
     * @return string
     */
    public function prependScheme($url)
    {
        if (!preg_match('%^https?://%i', $url)) {
            $url = 'http://'.$url;
        }

        return $url;
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Reader;
use PicoFeed\PicoFeedException;
/**
 * ReaderException Exception.
 *
 * Common parent of the exceptions declared in the PicoFeed\Reader namespace
 * (SubscriptionNotFoundException, UnsupportedFeedFormatException). It is
 * abstract, so only those concrete subclasses can be instantiated; catching
 * this type handles both.
 *
 * @author Frederic Guillot
 */
abstract class ReaderException extends PicoFeedException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Reader;
/**
 * SubscriptionNotFoundException Exception.
 *
 * Thrown by Reader::discover() when the downloaded document is not a feed
 * and no feed link could be found in it.
 *
 * @author Frederic Guillot
 */
class SubscriptionNotFoundException extends ReaderException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Reader;
/**
 * UnsupportedFeedFormatException Exception.
 *
 * Thrown by Reader::getParser() when Reader::detectFormat() does not
 * recognise the format of the given content.
 *
 * @author Frederic Guillot
 */
class UnsupportedFeedFormatException extends ReaderException
{
}

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
),
)
)
);

View file

@ -0,0 +1,15 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'title' => '//header/h1',
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
'body' => array(
'//div[@class="postContent"]',
),
'strip' => array(
'//*[@class="shareToolsBox"]',
),
)
)
);

View file

@ -0,0 +1,13 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031',
'body' => array(
'//div[contains(@class, "field-name-body")]'
),
'strip' => array(
),
)
)
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
'body' => array(
'//div[@class="articleBody"]',
),
)
)
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://eliascarpe.over-blog.com/2015/12/re-upload-projets-d-avenir.html',
'body' => array(
'//div[contains(concat(" ", normalize-space(@class), " "), " ob-section ")]',
),
)
)
);

View file

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1',
'body' => array(
'//div[@class="content"]',
),
'strip' => array()
)
)
);

View file

@ -0,0 +1,20 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
'body' => array(
'//div[@class="sl-art-body"]',
),
'strip' => array(
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
'//*[@id="mys_slate_logged_in"]',
'//*[@id="sl_article_tools_myslate_bottom"]',
'//*[@id="mys_myslate"]',
'//*[@class="sl-viral-container"]',
'//*[@class="sl-art-creds-cntr"]',
'//*[@class="sl-art-ad-midflex"]',
)
)
)
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business',
'body' => array(
'//div[contains(@class, "content__main-column--article")]',
),
'strip' => array(
'//div[contains(@class, "meta-container")]',
),
)
)
);

View file

@ -0,0 +1,29 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
'body' => array(
'//div[@id="bodyContent"]',
),
'strip' => array(
"//div[@id='toc']",
"//div[@id='catlinks']",
"//div[@id='jump-to-nav']",
"//div[@class='thumbcaption']//div[@class='magnify']",
"//table[@class='navbox']",
"//table[contains(@class, 'infobox')]",
"//div[@class='dablink']",
"//div[@id='contentSub']",
"//div[@id='siteSub']",
"//table[@id='persondata']",
"//table[contains(@class, 'metadata')]",
"//*[contains(@class, 'noprint')]",
"//*[contains(@class, 'printfooter')]",
"//*[contains(@class, 'editsection')]",
"//*[contains(@class, 'error')]",
"//span[@title='pronunciation:']",
),
)
)
);

View file

@ -0,0 +1,31 @@
<?php
return array(
    'grabber' => array(
        '%.*%' => array(
            'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/',
            'body' => array(
                '//div[@data-js="gallerySlides"]',
                '//article',
            ),
            'strip' => array(
                '//*[@id="linker_widget"]',
                '//*[@class="credit"]',
                '//div[@data-js="slideCount"]',
                // contains() takes two arguments (string, substring); the
                // previous contains(@class="visually-hidden") was not a valid
                // XPath expression.
                '//*[contains(@class, "visually-hidden")]',
                '//*[@data-slide-number="_endslate"]',
                '//*[@id="related"]',
                '//*[contains(@class, "bio")]',
                '//*[contains(@class, "entry-footer")]',
                '//*[contains(@class, "mobify_backtotop_link")]',
                '//*[contains(@class, "gallery-navigation")]',
                '//*[contains(@class, "gallery-thumbnail")]',
                '//img[contains(@src, "1x1")]',
                '//a[contains(@href, "creativecommons")]',
                '//a[@href="#start-of-content"]',
                '//ul[@id="article-tags"]',
            ),
        )
    )
);

View file

@ -0,0 +1,15 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
'body' => array(
'//div[@class="articlePage"]',
),
'strip' => array(
'//*[@id="articleThumbnail_2"]',
'//*[@class="socialByline"]',
)
)
)
);

View file

@ -0,0 +1,19 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/',
'body' => array(
'//div[@class="article_ventre_box"]',
),
'strip' => array(
'//link',
'//*[contains(@class, "article_navigation")]',
'//h1',
'//*[contains(@class, "article_toolbarMain")]',
'//*[contains(@class, "article_imagehaute_box")]',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
return array(
'filter' => array(
'%.*%' => array(
'%alt="(.+)" title="(.+)" */>%' => '/><br/>$1<br/>$2',
),
),
);

View file

@ -0,0 +1,15 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907',
'body' => array(
'//div[@class="news_details"]',
),
'strip' => array(
'//div[@class="news_details"]/div/div[last()]',
),
),
),
);

View file

@ -0,0 +1,22 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.aljazeera.com/news/2015/09/xi-jinping-seattle-china-150922230118373.html',
'body' => array(
'//figure[@class="article-content"]',
'//div[@class="article-body"]',
),
'strip' => array(
'//h1',
'//h3',
'//ul',
'//table[contains(@class, "in-article-item")]',
'//a[@target="_self"]',
'//div[@data-embed-type="Brightcove"]',
'//div[@class="QuoteContainer"]',
),
),
),
);

View file

@ -0,0 +1,20 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.aljazeera.com/news/2015/09/xi-jinping-seattle-china-150922230118373.html',
'body' => array(
'//div[@class="story-body"]',
),
'strip' => array(
'//p[@class="kindofstory"]',
'//cite[@class="byline"]',
'//div[contains(@class,"related-topics")]',
'//links',
'//sharebar',
'//related-topics',
),
),
),
);

View file

@ -0,0 +1,24 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm',
'body' => array(
'//div[contains(@class, "article")][1]',
),
'strip' => array(
'//read/h1',
'//*[@id="t-map"]',
'//*[contains(@class, "modules")]',
'//*[contains(@class, "adsense")]',
'//*[contains(@class, "linkbox")]',
'//*[contains(@class, "info")]',
'//*[@class="skip"]',
'//*[@class="funcs"]',
'//span[@class="nd address"]',
'//a[contains(@href, "abo-und-services")]',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
return array(
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'body' => array(
'//img[@id="comic_image"]',
'//div[@class="comment-wrapper"][position()=1]',
),
'strip' => array(),
'test_url' => 'http://www.anythingcomic.com/comics/2108929/stress-free/',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://hosted.ap.org/dynamic/stories/A/AS_CHINA_GAO_ZHISHENG?SITE=AP&SECTION=HOME&TEMPLATE=DEFAULT',
'body' => array(
'//img[@class="ap-smallphoto-img"]',
'//span[@class="entry-content"]',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.areadvd.de/news/daily-deals-angebote-bei-lautsprecher-teufel-3/',
'body' => array('//div[contains(@class,"entry")]'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,23 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://arstechnica.com/tech-policy/2015/09/judge-warners-2m-happy-birthday-copyright-is-bogus/',
'body' => array(
'//header/h2',
'//section[@id="article-guts"]',
'//div[@class="superscroll-content show"]',
'//div[@class="gallery"]',
),
'next_page' => '//span[@class="numbers"]/a',
'strip' => array(
'//figcaption',
'//div[@class="post-meta"]',
'//div[@class="gallery-image-credit"]',
'//aside',
'//div[@class="article-expander"]',
),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%/index.php.*comic=.*%' => array(
'test_url' => 'http://www.awkwardzombie.com/index.php?comic=041315',
'body' => array('//*[@id="comic"]/img'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,21 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bangkokpost.com/news/politics/704204/new-us-ambassador-arrives-in-bangkok',
'body' => array(
'//div[@class="articleContents"]',
),
'strip' => array(
'//h2',
'//h4',
'//div[@class="text-size"]',
'//div[@class="relate-story"]',
'//div[@class="text-ads"]',
'//script',
'//ul',
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://bgr.com/2015/09/27/iphone-6s-waterproof-testing/',
'body' => array(
'//img[contains(@class,"img")]',
'//div[@class="text-column"]',
),
'strip' => array(
'//strong',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
return array(
'filter' => array(
'%.*%' => array(
'%-150x150%' => '',
),
),
);

View file

@ -0,0 +1,13 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bizjournals.com/milwaukee/news/2015/09/30/bucks-will-hike-prices-on-best-seats-at-new-arena.html',
'body' => array(
'//figure/div/a/img',
'//p[@class="content__segment"]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blog.fefe.de/?ts=ad706a73',
'body' => array(
'/html/body/ul',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blog.mapillary.com/update/2015/08/26/traffic-sign-updates.html',
'body' => array(
'//div[contains(@class, "blog-post__content")]',
),
),
),
);

View file

@ -0,0 +1,18 @@
<?php
return array(
    'grabber' => array(
        '%.*%' => array(
            'test_url' => 'http://www.buenosairesherald.com/article/199344/manzur-named-next-governor-of-tucum%C3%A1n',
            'body' => array(
                '//div[@style="float:none"]',
            ),
            'strip' => array(
                // The closing parenthesis of contains() was missing, which
                // made this expression invalid XPath.
                '//div[contains(@class, "bz_alias_short_desc_container")]',
                '//td[@id="bz_show_bug_column_1"]',
                '//table[@id="attachment_table"]',
                '//table[@class="bz_comment_table"]',
            ),
        ),
    ),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bunicomic.com/comic/buni-623/',
'body' => array(
'//div[@class="comic-table"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://buttersafe.com/2015/04/21/the-incredible-flexible-man/',
'body' => array(
'//div[@id="comic"]',
'//div[@class="post-comic"]',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,13 @@
<?php
return array(
'grabber' => array(
'%/cad/.+%' => array(
'test_url' => 'http://www.cad-comic.com/cad/20150417',
'body' => array(
'//*[@id="content"]/img',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://chaoslife.findchaos.com/pets-in-the-wild',
'body' => array('//div[@id="comic"]'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
return array(
'grabber' => array(
'%/comic.*%' => array(
'test_url' => 'http://cliquerefresh.com/comic/078-stating-the-obvious/',
'body' => array('//div[@class="comicImg"]/img | //div[@class="comicImg"]/a/img'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,38 @@
<?php
return array(
    'grabber' => array(
        '%^/products.*%' => array(
            'test_url' => 'http://www.cnet.com/products/fibaro-flood-sensor/#ftag=CADf328eec',
            'body' => array(
                // Repaired expression: contains() was left unclosed, "||" is
                // not an XPath operator (node-set union is "|"), and the class
                // name has to be a quoted string literal, not (promoFigure).
                '//li[contains(@class,"slide first")] | //figure[contains(@class,"promoFigure")]',
                '//div[@class="quickInfo"]',
                '//div[@class="col-6 ratings"]',
                '//div[@id="editorReview"]',
            ),
            'strip' => array(
                '//script',
                '//a[@class="clickToEnlarge"]',
                '//div[@section="topSharebar"]',
                '//div[contains(@class,"related")]',
                '//div[contains(@class,"ad-")]',
                '//div[@section="shortcodeGallery"]',
            ),
        ),
        '%.*%' => array(
            'test_url' => 'http://cnet.com.feedsportal.com/c/34938/f/645093/s/4a340866/sc/28/l/0L0Scnet0N0Cnews0Cman0Eclaims0Eonline0Epsychic0Emade0Ehim0Ebuy0E10Emillion0Epowerball0Ewinning0Eticket0C0Tftag0FCAD590Aa51e/story01.htm',
            'body' => array(
                '//p[@itemprop="description"]',
                '//div[@itemprop="articleBody"]',
            ),
            'strip' => array(
                '//script',
                '//a[@class="clickToEnlarge"]',
                '//div[@section="topSharebar"]',
                '//div[contains(@class,"related")]',
                '//div[contains(@class,"ad-")]',
                '//div[@section="shortcodeGallery"]',
            ),
        ),
    ),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html',
'body' => array(
'//div[contains(@id, "newscontent")]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
return array(
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,19 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.csmonitor.com/USA/Politics/2015/0925/John-Boehner-steps-down-Self-sacrificing-but-will-it-lead-to-better-government',
'body' => array(
'//figure[@id="image-top-1"]',
'//div[@id="story-body"]',
),
'strip' => array(
'//script',
'//img[@title="hide caption"]',
'//*[contains(@class,"promo_link")]',
'//div[@id="story-embed-column"]',
),
),
),
);

View file

@ -0,0 +1,20 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dailyjs.com/2014/08/07/p5js/',
'body' => array(
'//div[@id="post"]',
),
'strip' => array(
'//h2[@class="post"]',
'//div[@class="meta"]',
'//*[contains(@class, "addthis_toolbox")]',
'//*[contains(@class, "addthis_default_style")]',
'//*[@class="navigation small"]',
'//*[@id="related"]',
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dailyreporter.com/2016/01/09/us-supreme-court-case-could-weaken-government-workers-unions/',
'body' => array(
'//div[contains(@class, "entry-content")]',
),
'strip' => array(
'//div[@class="dmcss_login_form"]',
'//*[contains(@class, "sharedaddy")]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.dailytech.com/Apples+First+Fixes+to+iOS+9+Land+w+iOS++901+Release/article37495.htm',
'body' => array(
'//div[@class="NewsBodyImage"]',
'//span[@id="lblSummary"]',
'//span[@id="lblBody"]',
),
),
),
);

View file

@ -0,0 +1,15 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick',
'body' => array(
'//div[@class="contenu"]',
),
'strip' => array(
'//div[contains(@class, "a2a")]',
),
),
),
);

View file

@ -0,0 +1,15 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss',
'body' => array(
'//div[@class="copytext"]',
'//ul[@id="media-list"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'body' => array(
'//img[@class="img-responsive img-comic"]',
),
'test_url' => 'http://dilbert.com/strip/2016-01-28',
),
),
);

View file

@ -0,0 +1,18 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blogs.discovermagazine.com/the-extremo-files/2015/09/11/have-scientists-found-the-worlds-deepest-fish/',
'body' => array(
'//div[@class="entry"]',
),
'strip' => array(
'//h1',
'//div[@class="meta"]',
'//div[@class="shareIcons"]',
'//div[@class="navigation"]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://distrowatch.com/?newsid=08355',
'body' => array(
'//td[@class="NewsText"][1]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/',
'body' => array(
'//div[@class="joke"]',
'//div[@class="story-cover"]',
'//div[@class="story-content"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'body' => array('//img[@id="comicimage"]'),
'strip' => array(),
'test_url' => 'http://drawingboardcomic.com/index.php?comic=208',
),
),
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,13 @@
<?php
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://encyclopedie.naheulbeuk.com/article.php3?id_article=352',
'body' => array(
'//td//h1[@class="titre-texte"]',
'//td//div[@class="surtitre"]',
'//td//div[@class="texte"]',
),
)
),
);

View file

@ -0,0 +1,9 @@
<?php
return array(
'filter' => array(
'%.*%' => array(
'%-150x150%' => '',
),
),
);

Some files were not shown because too many files have changed in this diff Show more