Switch to Composer

This commit is contained in:
mmk2410 2016-02-16 14:19:57 +01:00
parent ce709fec80
commit fda6e3d811
393 changed files with 14048 additions and 17 deletions

View file

@ -0,0 +1,668 @@
<?php
namespace PicoFeed\Client;
use LogicException;
use PicoFeed\Logging\Logger;
use PicoFeed\Config\Config;
/**
* Client class.
*
* @author Frederic Guillot
*/
abstract class Client
{
/**
* Flag that says whether the resource has been modified.
*
* @var bool
*/
private $is_modified = true;
/**
* HTTP Content-Type.
*
* @var string
*/
private $content_type = '';
/**
* HTTP encoding.
*
* @var string
*/
private $encoding = '';
/**
* HTTP request headers.
*
* @var array
*/
protected $request_headers = array();
/**
* HTTP Etag header.
*
* @var string
*/
protected $etag = '';
/**
* HTTP Last-Modified header.
*
* @var string
*/
protected $last_modified = '';
/**
* Proxy hostname.
*
* @var string
*/
protected $proxy_hostname = '';
/**
* Proxy port.
*
* @var int
*/
protected $proxy_port = 3128;
/**
* Proxy username.
*
* @var string
*/
protected $proxy_username = '';
/**
* Proxy password.
*
* @var string
*/
protected $proxy_password = '';
/**
* Basic auth username.
*
* @var string
*/
protected $username = '';
/**
* Basic auth password.
*
* @var string
*/
protected $password = '';
/**
* Client connection timeout.
*
* @var int
*/
protected $timeout = 10;
/**
* User-agent.
*
* @var string
*/
protected $user_agent = 'PicoFeed (https://github.com/fguillot/picoFeed)';
/**
* Real URL used (can be changed after a HTTP redirect).
*
* @var string
*/
protected $url = '';
/**
* Page/Feed content.
*
* @var string
*/
protected $content = '';
/**
* Maximum number of HTTP redirections (guards against infinite redirect loops).
*
* @var int
*/
protected $max_redirects = 5;
/**
* Maximum size of the HTTP body response.
*
* @var int
*/
protected $max_body_size = 2097152; // 2MB
/**
* HTTP response status code.
*
* @var int
*/
protected $status_code = 0;
/**
* Enables direct passthrough to requesting client.
*
* @var bool
*/
protected $passthrough = false;
/**
* Do the HTTP request.
*
* Implemented by each driver. Client::execute() reads the 'status' key of the
* returned array and hands the whole array to the handle*Response() methods,
* which additionally read the 'body' and 'headers' keys.
*
* @abstract
*
* @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
*/
abstract public function doRequest();
/**
 * Build the HTTP driver usable on this PHP installation.
 *
 * The cURL driver is preferred; the stream driver is the fallback when the
 * "allow_url_fopen" ini setting is enabled.
 *
 * @static
 *
 * @throws LogicException When neither cURL nor allow_url_fopen is available
 *
 * @return \PicoFeed\Client\Client
 */
public static function getInstance()
{
    if (function_exists('curl_init')) {
        return new Curl();
    }

    if (!ini_get('allow_url_fopen')) {
        throw new LogicException('You must have "allow_url_fopen=1" or curl extension installed');
    }

    return new Stream();
}
/**
 * Set the extra HTTP headers sent with the request.
 *
 * The given list replaces any headers set previously (it is not appended).
 * Both drivers merge it after their own generated headers.
 *
 * @param array $headers List of raw header lines, e.g. array('Accept: text/xml')
 *
 * @return \PicoFeed\Client\Client
 */
public function setHeaders($headers)
{
    $this->request_headers = $headers;

    // Return the instance like every other setter so calls can be chained.
    return $this;
}
/**
 * Run the HTTP request and digest the response.
 *
 * The driver's doRequest() result is stored as the status code and then
 * passed, in order, to the not-modified, not-found and normal handlers.
 *
 * @param string $url URL (when empty, the URL already set on the client is used)
 *
 * @return Client
 */
public function execute($url = '')
{
    if ($url !== '') {
        $this->url = $url;
    }

    $driver = get_called_class();
    Logger::setMessage($driver.' Fetch URL: '.$this->url);
    Logger::setMessage($driver.' Etag provided: '.$this->etag);
    Logger::setMessage($driver.' Last-Modified provided: '.$this->last_modified);

    $response = $this->doRequest();
    $this->status_code = $response['status'];

    $this->handleNotModifiedResponse($response);
    $this->handleNotFoundResponse($response);
    $this->handleNormalResponse($response);

    return $this;
}
/**
 * Update the "modified" flag and the cache validators from a response.
 *
 * A 304 marks the resource as not modified. A 200 compares the response
 * validators with the ones supplied by the caller, then stores the new ones.
 *
 * @param array $response Client response
 */
public function handleNotModifiedResponse(array $response)
{
    switch ($response['status']) {
        case 304:
            $this->is_modified = false;
            break;
        case 200:
            // Compare against the caller's validators before overwriting them.
            $this->is_modified = $this->hasBeenModified($response, $this->etag, $this->last_modified);
            $this->etag = $this->getHeader($response, 'ETag');
            $this->last_modified = $this->getHeader($response, 'Last-Modified');
            break;
    }

    if ($this->is_modified === false) {
        Logger::setMessage(get_called_class().' Resource not modified');
    }
}
/**
 * Reject a "404 Not Found" response.
 *
 * @param array $response Client response
 *
 * @throws InvalidUrlException When the response status is 404
 */
public function handleNotFoundResponse(array $response)
{
    if ($response['status'] != 404) {
        return;
    }

    throw new InvalidUrlException('Resource not found');
}
/**
 * Store body, content type and charset of a "200 OK" response.
 *
 * Any other status leaves the client untouched.
 *
 * @param array $response Client response
 */
public function handleNormalResponse(array $response)
{
    if ($response['status'] != 200) {
        return;
    }

    $this->content = $response['body'];
    $this->content_type = $this->findContentType($response);
    // findCharset() reads the content type stored on the line above.
    $this->encoding = $this->findCharset();
}
/**
 * Tell whether a response differs from the cached validators.
 *
 * Each of the Etag / Last-Modified headers present in the response is
 * compared with the cached value. Any mismatch means "modified"; when no
 * validator header is present at all, the response is also "modified".
 *
 * @param array  $response
 * @param string $etag
 * @param string $lastModified
 *
 * @return bool
 */
private function hasBeenModified($response, $etag, $lastModified)
{
    $cached = array(
        'Etag' => $etag,
        'Last-Modified' => $lastModified,
    );

    $comparedAny = false;

    foreach ($cached as $name => $expected) {
        if (!isset($response['headers'][$name])) {
            continue;
        }

        if ($response['headers'][$name] !== $expected) {
            return true;
        }

        $comparedAny = true;
    }

    // Not modified only when at least one validator was present and all matched.
    return !$comparedAny;
}
/**
 * Read the lower-cased Content-Type header of a response.
 *
 * @param array $response Client response
 *
 * @return string Empty string when the header is missing
 */
public function findContentType(array $response)
{
    $content_type = $this->getHeader($response, 'Content-Type');

    return strtolower($content_type);
}
/**
 * Find charset from the stored Content-Type value.
 *
 * Only the charset token is returned: surrounding quotes and any parameter
 * following it (after a ";") are stripped, so 'text/xml; charset="utf-8"'
 * and 'text/xml; charset=utf-8; boundary=x' both give 'utf-8'.
 *
 * @return string Charset, or an empty string when none is declared
 */
public function findCharset()
{
    $parts = explode('charset=', $this->content_type);

    if (!isset($parts[1])) {
        return '';
    }

    $charset = $parts[1];
    $end = strpos($charset, ';');

    if ($end !== false) {
        $charset = substr($charset, 0, $end);
    }

    return trim($charset, " \t\r\n\"'");
}
/**
 * Look up one header of a client response.
 *
 * @param array  $response Client response
 * @param string $header   Header name
 *
 * @return string Header value, or an empty string when it is not set
 */
public function getHeader(array $response, $header)
{
    if (isset($response['headers'][$header])) {
        return $response['headers'][$header];
    }

    return '';
}
/**
* Set the Last-Modified HTTP header.
*
* @param string $last_modified Header value
*
* @return \PicoFeed\Client\Client
*/
public function setLastModified($last_modified)
{
$this->last_modified = $last_modified;
return $this;
}
/**
* Get the value of the Last-Modified HTTP header.
*
* @return string
*/
public function getLastModified()
{
return $this->last_modified;
}
/**
* Set the value of the Etag HTTP header.
*
* @param string $etag Etag HTTP header value
*
* @return \PicoFeed\Client\Client
*/
public function setEtag($etag)
{
$this->etag = $etag;
return $this;
}
/**
* Get the Etag HTTP header value.
*
* @return string
*/
public function getEtag()
{
return $this->etag;
}
/**
* Get the final url value.
*
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
* Set the url.
*
* @param string $url URL
* @return \PicoFeed\Client\Client
*/
public function setUrl($url)
{
$this->url = $url;
return $this;
}
/**
* Get the HTTP response status code.
*
* @return int
*/
public function getStatusCode()
{
return $this->status_code;
}
/**
* Get the body of the HTTP response.
*
* @return string
*/
public function getContent()
{
return $this->content;
}
/**
* Get the content type value from HTTP headers.
*
* @return string
*/
public function getContentType()
{
return $this->content_type;
}
/**
* Get the encoding value from HTTP headers.
*
* @return string
*/
public function getEncoding()
{
return $this->encoding;
}
/**
* Return true if the remote resource has changed.
*
* @return bool
*/
public function isModified()
{
return $this->is_modified;
}
/**
* return true if passthrough mode is enabled.
*
* @return bool
*/
public function isPassthroughEnabled()
{
return $this->passthrough;
}
/**
* Set connection timeout.
*
* @param int $timeout Connection timeout
*
* @return \PicoFeed\Client\Client
*/
public function setTimeout($timeout)
{
$this->timeout = $timeout ?: $this->timeout;
return $this;
}
/**
* Set a custom user agent.
*
* @param string $user_agent User Agent
*
* @return \PicoFeed\Client\Client
*/
public function setUserAgent($user_agent)
{
$this->user_agent = $user_agent ?: $this->user_agent;
return $this;
}
/**
* Set the maximum number of HTTP redirections.
*
* @param int $max Maximum
*
* @return \PicoFeed\Client\Client
*/
public function setMaxRedirections($max)
{
$this->max_redirects = $max ?: $this->max_redirects;
return $this;
}
/**
* Set the maximum size of the HTTP body.
*
* @param int $max Maximum
*
* @return \PicoFeed\Client\Client
*/
public function setMaxBodySize($max)
{
$this->max_body_size = $max ?: $this->max_body_size;
return $this;
}
/**
* Set the proxy hostname.
*
* @param string $hostname Proxy hostname
*
* @return \PicoFeed\Client\Client
*/
public function setProxyHostname($hostname)
{
$this->proxy_hostname = $hostname ?: $this->proxy_hostname;
return $this;
}
/**
* Set the proxy port.
*
* @param int $port Proxy port
*
* @return \PicoFeed\Client\Client
*/
public function setProxyPort($port)
{
$this->proxy_port = $port ?: $this->proxy_port;
return $this;
}
/**
* Set the proxy username.
*
* @param string $username Proxy username
*
* @return \PicoFeed\Client\Client
*/
public function setProxyUsername($username)
{
$this->proxy_username = $username ?: $this->proxy_username;
return $this;
}
/**
* Set the proxy password.
*
* @param string $password Password
*
* @return \PicoFeed\Client\Client
*/
public function setProxyPassword($password)
{
$this->proxy_password = $password ?: $this->proxy_password;
return $this;
}
/**
* Set the username.
*
* @param string $username Basic Auth username
*
* @return \PicoFeed\Client\Client
*/
public function setUsername($username)
{
$this->username = $username ?: $this->username;
return $this;
}
/**
* Set the password.
*
* @param string $password Basic Auth Password
*
* @return \PicoFeed\Client\Client
*/
public function setPassword($password)
{
$this->password = $password ?: $this->password;
return $this;
}
/**
* Enable the passthrough mode.
*
* @return \PicoFeed\Client\Client
*/
public function enablePassthroughMode()
{
$this->passthrough = true;
return $this;
}
/**
* Disable the passthrough mode.
*
* @return \PicoFeed\Client\Client
*/
public function disablePassthroughMode()
{
$this->passthrough = false;
return $this;
}
/**
 * Copy the client-related settings of a config object onto this client.
 *
 * Every value goes through the matching setter, which keeps the current
 * value when the configured one is empty.
 *
 * @param \PicoFeed\Config\Config $config Config instance
 *
 * @return \PicoFeed\Client\Client
 */
public function setConfig(Config $config)
{
    // The type hint already rejects null, so no extra null check is needed.
    $this->setTimeout($config->getClientTimeout());
    $this->setUserAgent($config->getClientUserAgent());
    $this->setMaxRedirections($config->getMaxRedirections());
    $this->setMaxBodySize($config->getMaxBodySize());
    $this->setProxyHostname($config->getProxyHostname());
    $this->setProxyPort($config->getProxyPort());
    $this->setProxyUsername($config->getProxyUsername());
    $this->setProxyPassword($config->getProxyPassword());

    return $this;
}
/**
 * Return true if the HTTP status code is a redirection that carries a
 * Location header to follow (301, 302, 303, 307 or 308).
 *
 * 304 is deliberately not part of the list: it is handled as a
 * "not modified" response.
 *
 * @param int $code HTTP status code
 *
 * @return bool
 */
public function isRedirection($code)
{
    return in_array((int) $code, array(301, 302, 303, 307, 308), true);
}
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\PicoFeedException;
/**
* ClientException Exception.
*
* Abstract base class of the HTTP client exceptions of this namespace
* (InvalidCertificateException, InvalidUrlException, MaxRedirectException,
* MaxSizeException and TimeoutException all extend it).
*
* @author Frederic Guillot
*/
abstract class ClientException extends PicoFeedException
{
}

View file

@ -0,0 +1,384 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\Logging\Logger;
/**
* cURL HTTP client.
*
* @author Frederic Guillot
*/
class Curl extends Client
{
/**
* HTTP response body.
*
* @var string
*/
private $body = '';
/**
* Body size.
*
* @var int
*/
private $body_length = 0;
/**
* HTTP response headers.
*
* @var array
*/
private $response_headers = array();
/**
* Counter on the number of header received.
*
* @var int
*/
private $response_headers_count = 0;
/**
 * cURL write callback: accumulate the HTTP body.
 *
 * Returning -1 makes cURL stop reading the HTTP response; this happens as
 * soon as the total received size exceeds the configured maximum.
 *
 * @param resource $ch     cURL handler
 * @param string   $buffer Chunk of data
 *
 * @return int Length of the buffer, or -1 to abort
 */
public function readBody($ch, $buffer)
{
    $chunk_size = strlen($buffer);
    $this->body_length += $chunk_size;

    if ($this->body_length <= $this->max_body_size) {
        $this->body .= $buffer;

        return $chunk_size;
    }

    return -1;
}
/**
 * cURL header callback: collect raw header lines.
 *
 * Lines are concatenated into one string per header block; an empty line
 * closes the current block and moves on to the next one.
 *
 * @param resource $ch     cURL handler
 * @param string   $buffer Header line
 *
 * @return int Length of the buffer
 */
public function readHeaders($ch, $buffer)
{
    $is_block_end = $buffer === "\r\n" || $buffer === "\n";

    if ($is_block_end) {
        ++$this->response_headers_count;

        return strlen($buffer);
    }

    $block = $this->response_headers_count;

    if (!isset($this->response_headers[$block])) {
        $this->response_headers[$block] = '';
    }

    $this->response_headers[$block] .= $buffer;

    return strlen($buffer);
}
/**
 * cURL callback to passthrough the HTTP body to the client.
 *
 * On the first chunk of a transmission the status code and Content-Type of
 * the remote response are forwarded, then every chunk is echoed as-is.
 *
 * If the function return -1, curl stop to read the HTTP response
 *
 * @param resource $ch     cURL handler
 * @param string   $buffer Chunk of data
 *
 * @return int Length of the buffer
 */
public function passthroughBody($ch, $buffer)
{
    // do it only at the beginning of a transmission
    if ($this->body_length === 0) {
        list($status, $headers) = HttpHeaders::parse(explode("\n", $this->response_headers[$this->response_headers_count - 1]));

        if ($this->isRedirection($status)) {
            // NOTE(review): this returns the array produced by
            // handleRedirection() from a write callback documented to return
            // an int - confirm how cURL treats that value.
            return $this->handleRedirection($headers['Location']);
        }

        // $status is an integer: forward it as the response code (same call
        // as the stream driver) instead of emitting it as a raw header line.
        header(':', true, $status);

        if (isset($headers['Content-Type'])) {
            header('Content-Type:' .$headers['Content-Type']);
        }
    }

    $length = strlen($buffer);
    $this->body_length += $length;

    echo $buffer;

    return $length;
}
/**
 * Build the list of request header lines.
 *
 * Order: "Connection: close", the conditional-request headers when an Etag /
 * Last-Modified value is known, then the custom request headers.
 *
 * @return string[]
 */
private function prepareHeaders()
{
    $lines = array('Connection: close');

    if ($this->etag) {
        $lines[] = 'If-None-Match: '.$this->etag;
    }

    if ($this->last_modified) {
        $lines[] = 'If-Modified-Since: '.$this->last_modified;
    }

    return array_merge($lines, $this->request_headers);
}
/**
 * Prepare curl proxy context.
 *
 * Does nothing when no proxy hostname is configured. Credentials are only
 * sent when a proxy username is set.
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareProxyContext($ch)
{
    if ($this->proxy_hostname) {
        Logger::setMessage(get_called_class().' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);

        curl_setopt($ch, CURLOPT_PROXYPORT, $this->proxy_port);
        // CURLOPT_PROXYTYPE takes a CURLPROXY_* constant, not a string. The
        // former 'HTTP' string only selected the HTTP proxy type because it
        // happened to be cast to 0.
        curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
        curl_setopt($ch, CURLOPT_PROXY, $this->proxy_hostname);

        if ($this->proxy_username) {
            Logger::setMessage(get_called_class().' Proxy credentials: Yes');
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, $this->proxy_username.':'.$this->proxy_password);
        } else {
            Logger::setMessage(get_called_class().' Proxy credentials: No');
        }
    }

    return $ch;
}
/**
 * Add HTTP basic-auth credentials to the cURL handle.
 *
 * Credentials are applied only when both username and password are set.
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareAuthContext($ch)
{
    $has_credentials = $this->username && $this->password;

    if (!$has_credentials) {
        return $ch;
    }

    curl_setopt($ch, CURLOPT_USERPWD, $this->username.':'.$this->password);

    return $ch;
}
/**
 * Register the body and header callbacks on the cURL handle.
 *
 * The body goes to passthroughBody() in passthrough mode and to readBody()
 * otherwise; headers always go to readHeaders().
 *
 * @param resource $ch
 *
 * @return resource $ch
 */
private function prepareDownloadMode($ch)
{
    $body_callback = $this->isPassthroughEnabled() ? 'passthroughBody' : 'readBody';

    curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, $body_callback));
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeaders'));

    return $ch;
}
/**
* Prepare curl context.
*
* Creates a new cURL handle for $this->url and applies the timeout, user
* agent, request headers, download callbacks, proxy and basic-auth settings.
* cURL itself does not follow redirects (CURLOPT_FOLLOWLOCATION is false).
*
* @return resource
*/
private function prepareContext()
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
// The same timeout value is used for the whole transfer and for the connection phase.
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders());
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_ENCODING, '');
// NOTE(review): 'php://memory' is a PHP stream wrapper name; confirm cURL
// treats it as intended for the cookie jar/file.
curl_setopt($ch, CURLOPT_COOKIEJAR, 'php://memory');
curl_setopt($ch, CURLOPT_COOKIEFILE, 'php://memory');
// Disable SSLv3 by enforcing TLSv1.x for curl >= 7.34.0 and < 7.39.0.
// Versions prior to 7.34 and at least when compiled against openssl
// interpret this parameter as "limit to TLSv1.0" which fails for sites
// which enforce TLS 1.1+.
// Starting with curl 7.39.0 SSLv3 is disabled by default.
// 467456 is 0x072200 (7.34.0) and 468736 is 0x072700 (7.39.0).
$version = curl_version();
if ($version['version_number'] >= 467456 && $version['version_number'] < 468736) {
curl_setopt($ch, CURLOPT_SSLVERSION, 1);
}
$ch = $this->prepareDownloadMode($ch);
$ch = $this->prepareProxyContext($ch);
$ch = $this->prepareAuthContext($ch);
return $ch;
}
/**
 * Run the prepared cURL request.
 *
 * Logs the transfer metrics, converts a cURL error into an exception through
 * handleError() and stores the effective URL on the client.
 */
private function executeContext()
{
    $ch = $this->prepareContext();
    curl_exec($ch);

    $driver = get_called_class();
    $metrics = array(
        ' cURL total time: ' => CURLINFO_TOTAL_TIME,
        ' cURL dns lookup time: ' => CURLINFO_NAMELOOKUP_TIME,
        ' cURL connect time: ' => CURLINFO_CONNECT_TIME,
        ' cURL speed download: ' => CURLINFO_SPEED_DOWNLOAD,
        ' cURL effective url: ' => CURLINFO_EFFECTIVE_URL,
    );

    foreach ($metrics as $label => $option) {
        Logger::setMessage($driver.$label.curl_getinfo($ch, $option));
    }

    $error_code = curl_errno($ch);

    if ($error_code) {
        Logger::setMessage($driver.' cURL error: '.curl_error($ch));
        curl_close($ch);
        // handleError() always throws, nothing below runs on error.
        $this->handleError($error_code);
    }

    // Update the url if there where redirects
    $this->url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

    curl_close($ch);
}
/**
 * Do the HTTP request.
 *
 * A redirection response carrying a Location header is followed through
 * handleRedirection(); a redirection without Location is returned as-is.
 *
 * @throws MaxRedirectException When too many redirections are chained
 *
 * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
 */
public function doRequest()
{
    $this->executeContext();

    list($status, $headers) = $this->parseLastResponse();

    // Without a Location header there is nothing to follow; resolving a
    // missing location used to end in a fatal error.
    if ($this->isRedirection($status) && isset($headers['Location'])) {
        return $this->handleRedirection($headers['Location']);
    }

    return array(
        'status' => $status,
        'body' => $this->body,
        'headers' => $headers,
    );
}

/**
 * Parse the last header block received by readHeaders().
 *
 * @return array [status code, HttpHeaders]
 */
private function parseLastResponse()
{
    $index = $this->response_headers_count - 1;
    $raw = isset($this->response_headers[$index]) ? $this->response_headers[$index] : '';

    return HttpHeaders::parse(explode("\n", $raw));
}

/**
 * Handle HTTP redirects
 *
 * Follows the redirection chain iteratively so that the whole chain is
 * counted against max_redirects. The previous version went back through
 * doRequest(), which restarted the counter at every hop and never enforced
 * the limit.
 *
 * NOTE(review): passthroughBody() also calls this method from inside a
 * transfer; that nested path is not covered by the counter of an outer call.
 *
 * @param string $location Redirected URL
 *
 * @throws MaxRedirectException When max_redirects is reached
 *
 * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
 */
private function handleRedirection($location)
{
    $nb_redirects = 0;

    while (true) {
        // Point the client to the new location and drop the previous response
        $this->url = Url::resolve($location, $this->url);
        $this->body = '';
        $this->body_length = 0;
        $this->response_headers = array();
        $this->response_headers_count = 0;

        ++$nb_redirects;

        if ($nb_redirects >= $this->max_redirects) {
            throw new MaxRedirectException('Maximum number of redirections reached');
        }

        $this->executeContext();

        list($status, $headers) = $this->parseLastResponse();

        if (!$this->isRedirection($status) || !isset($headers['Location'])) {
            break;
        }

        $location = $headers['Location'];
    }

    return array(
        'status' => $status,
        'body' => $this->body,
        'headers' => $headers,
    );
}
/**
 * Convert a cURL error code into the matching client exception.
 *
 * Numeric codes are used instead of the CURLE_* constants because the
 * constants are not always available (depends on the version of libcurl
 * linked to php).
 *
 * @see http://curl.haxx.se/libcurl/c/libcurl-errors.html
 *
 * @param int $errno cURL error code
 */
private function handleError($errno)
{
    // CURLE_SSL_CONNECT_ERROR, CURLE_PEER_FAILED_VERIFICATION,
    // CURLE_SSL_CERTPROBLEM, CURLE_SSL_CACERT, CURLE_SSL_CIPHER,
    // CURLE_USE_SSL_FAILED, CURLE_SSL_ENGINE_INITFAILED,
    // CURLE_SSL_CACERT_BADFILE, CURLE_SSL_ISSUER_ERROR
    $certificate_errors = array(35, 51, 58, 60, 59, 64, 66, 77, 83);

    if (in_array($errno, $certificate_errors)) {
        throw new InvalidCertificateException('Invalid SSL certificate');
    }

    // CURLE_WRITE_ERROR, CURLE_FILESIZE_EXCEEDED
    if ($errno == 23 || $errno == 63) {
        throw new MaxSizeException('Maximum response size exceeded');
    }

    // CURLE_TOO_MANY_REDIRECTS
    if ($errno == 47) {
        throw new MaxRedirectException('Maximum number of redirections reached');
    }

    // CURLE_OPERATION_TIMEDOUT
    if ($errno == 28) {
        throw new TimeoutException('Operation timeout');
    }

    $url_errors = array(
        78 => 'Resource not found', // CURLE_REMOTE_FILE_NOT_FOUND
        6 => 'Unable to resolve hostname', // CURLE_COULDNT_RESOLVE_HOST
        7 => 'Unable to connect to the remote host', // CURLE_COULDNT_CONNECT
    );

    throw new InvalidUrlException(isset($url_errors[$errno]) ? $url_errors[$errno] : 'Unable to fetch the URL');
}
}

View file

@ -0,0 +1,79 @@
<?php
namespace PicoFeed\Client;
use ArrayAccess;
use PicoFeed\Logging\Logger;
/**
 * Class to handle HTTP headers case insensitivity.
 *
 * Header names are stored lower-cased, so $headers['ETag'] and
 * $headers['etag'] address the same entry.
 *
 * @author Bernhard Posselt
 * @author Frederic Guillot
 */
class HttpHeaders implements ArrayAccess
{
    /**
     * Header values indexed by lower-cased header name.
     *
     * @var array
     */
    private $headers = array();

    /**
     * @param array $headers Header values indexed by header name
     */
    public function __construct(array $headers)
    {
        foreach ($headers as $key => $value) {
            $this->headers[strtolower($key)] = $value;
        }
    }

    /**
     * Get a header value.
     *
     * @param string $offset Header name (any case)
     *
     * @return string|null Null when the header is not set (no notice is raised)
     */
    // The attribute below silences the PHP 8.1 ArrayAccess return-type
    // deprecation; older PHP versions read the line as a comment.
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        $key = strtolower($offset);

        return isset($this->headers[$key]) ? $this->headers[$key] : null;
    }

    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        $this->headers[strtolower($offset)] = $value;
    }

    #[\ReturnTypeWillChange]
    public function offsetExists($offset)
    {
        return isset($this->headers[strtolower($offset)]);
    }

    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        unset($this->headers[strtolower($offset)]);
    }

    /**
     * Parse HTTP headers.
     *
     * Every status line resets the collected headers, so only the headers of
     * the last response of the list are returned. A header is split on its
     * first colon only: the value may itself contain ": " and the space
     * after the colon is optional. Headers with an empty value are skipped.
     *
     * @static
     *
     * @param array $lines List of headers
     *
     * @return array [int status code (0 when no status line was found), HttpHeaders]
     */
    public static function parse(array $lines)
    {
        $status = 0;
        $headers = array();

        foreach ($lines as $line) {
            if (strpos($line, 'HTTP/') === 0) {
                $headers = array();
                $status = preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})#', $line, $matches) ? (int) $matches[1] : 0;
                continue;
            }

            $separator = strpos($line, ':');

            if ($separator === false || $separator === 0) {
                continue;
            }

            $name = trim(substr($line, 0, $separator));
            $value = trim(substr($line, $separator + 1));

            // Compare with '' so that a legitimate "0" value is kept
            if ($name !== '' && $value !== '') {
                $headers[$name] = $value;
            }
        }

        Logger::setMessage(get_called_class().' HTTP status code: '.$status);

        foreach ($headers as $name => $value) {
            Logger::setMessage(get_called_class().' HTTP header: '.$name.' => '.$value);
        }

        return array($status, new self($headers));
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
* InvalidCertificateException Exception.
*
* Thrown by Curl::handleError() for the SSL/TLS related cURL error codes.
*
* @author Frederic Guillot
*/
class InvalidCertificateException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
* InvalidUrlException Exception.
*
* Thrown when the resource cannot be fetched: 404 response
* (Client::handleNotFoundResponse()), unresolved host, refused connection or
* any unmapped cURL error (Curl::handleError()), and failed fopen()
* (Stream::doRequest()).
*
* @author Frederic Guillot
*/
class InvalidUrlException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
* MaxRedirectException Exception.
*
* Thrown by the cURL driver when the maximum number of redirections is
* reached (Curl::handleRedirection() and cURL error code 47).
*
* @author Frederic Guillot
*/
class MaxRedirectException extends ClientException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
* MaxSizeException Exception.
*
* Thrown when the response body is larger than the configured maximum body
* size (Curl::handleError() for cURL error codes 23 and 63, and
* Stream::doRequest()).
*
* @author Frederic Guillot
*/
class MaxSizeException extends ClientException
{
}

View file

@ -0,0 +1,201 @@
<?php
namespace PicoFeed\Client;
use PicoFeed\Logging\Logger;
/**
* Stream context HTTP client.
*
* @author Frederic Guillot
*/
class Stream extends Client
{
/**
 * Build the list of request header lines for the stream context.
 *
 * Unlike the cURL driver, user agent, compression, proxy and basic-auth
 * credentials are all sent as plain headers here.
 *
 * @return string[]
 */
private function prepareHeaders()
{
    $lines = array(
        'Connection: close',
        'User-Agent: '.$this->user_agent,
    );

    // disable compression in passthrough mode. It could result in double
    // compressed content which isn't decodeable by browsers
    $can_decode_gzip = function_exists('gzdecode') && !$this->isPassthroughEnabled();

    if ($can_decode_gzip) {
        $lines[] = 'Accept-Encoding: gzip';
    }

    if ($this->etag) {
        $lines[] = 'If-None-Match: '.$this->etag;
    }

    if ($this->last_modified) {
        $lines[] = 'If-Modified-Since: '.$this->last_modified;
    }

    if ($this->proxy_username) {
        $proxy_credentials = base64_encode($this->proxy_username.':'.$this->proxy_password);
        $lines[] = 'Proxy-Authorization: Basic '.$proxy_credentials;
    }

    if ($this->username && $this->password) {
        $credentials = base64_encode($this->username.':'.$this->password);
        $lines[] = 'Authorization: Basic '.$credentials;
    }

    return array_merge($lines, $this->request_headers);
}
/**
 * Construct the final URL from location headers.
 *
 * Every Location header of the list is resolved, in order, against the
 * current URL. The header is split on its first colon only, so a missing
 * space after the colon or a ": " inside the URL no longer breaks or
 * truncates the value.
 *
 * @param array $headers List of HTTP response header
 */
private function setEffectiveUrl($headers)
{
    $prefix = 'Location:';

    foreach ($headers as $header) {
        if (stripos($header, $prefix) !== 0) {
            continue;
        }

        $value = trim(substr($header, strlen($prefix)));

        if ($value !== '') {
            $this->url = Url::resolve($value, $this->url);
        }
    }
}
/**
 * Build the options array given to stream_context_create().
 *
 * @return array
 */
private function prepareContext()
{
    $http = array(
        'method' => 'GET',
        'protocol_version' => 1.1,
        'timeout' => $this->timeout,
        'max_redirects' => $this->max_redirects,
    );

    if ($this->proxy_hostname) {
        $driver = get_called_class();
        Logger::setMessage($driver.' Proxy: '.$this->proxy_hostname.':'.$this->proxy_port);

        $http['proxy'] = 'tcp://'.$this->proxy_hostname.':'.$this->proxy_port;
        $http['request_fulluri'] = true;

        // The credentials themselves are sent by prepareHeaders()
        $has_credentials = $this->proxy_username ? 'Yes' : 'No';
        Logger::setMessage($driver.' Proxy credentials: '.$has_credentials);
    }

    $http['header'] = implode("\r\n", $this->prepareHeaders());

    return array('http' => $http);
}
/**
 * Do the HTTP request.
 *
 * In passthrough mode the status code, the Content-Type and the body are
 * sent straight to the output and the returned body is empty.
 *
 * @throws InvalidUrlException When the stream cannot be opened
 * @throws MaxSizeException    When the body is larger than max_body_size
 * @throws TimeoutException    When the stream timed out while reading the body
 *
 * @return array HTTP response ['body' => ..., 'status' => ..., 'headers' => ...]
 */
public function doRequest()
{
    $body = '';
    $timed_out = false;

    // Create context
    $context = stream_context_create($this->prepareContext());

    // Make HTTP request
    $stream = @fopen($this->url, 'r', false, $context);
    if (!is_resource($stream)) {
        throw new InvalidUrlException('Unable to establish a connection');
    }

    // Get HTTP headers response
    $metadata = stream_get_meta_data($stream);
    list($status, $headers) = HttpHeaders::parse($metadata['wrapper_data']);

    if ($this->isPassthroughEnabled()) {
        header(':', true, $status);

        if (isset($headers['Content-Type'])) {
            header('Content-Type: '.$headers['Content-Type']);
        }

        fpassthru($stream);
    } else {
        // Get the entire body until the max size
        $body = stream_get_contents($stream, $this->max_body_size + 1);

        if ($body === false) {
            $body = '';
        }

        // The metadata fetched above predates the read: query it again so
        // that a timeout that happened while reading the body is seen.
        $read_metadata = stream_get_meta_data($stream);
        $timed_out = !empty($read_metadata['timed_out']);
    }

    // Close before any exception is thrown so the stream never leaks
    fclose($stream);

    // If the body size is too large abort everything
    if (strlen($body) > $this->max_body_size) {
        throw new MaxSizeException('Content size too large');
    }

    if ($timed_out) {
        throw new TimeoutException('Operation timeout');
    }

    $this->setEffectiveUrl($metadata['wrapper_data']);

    return array(
        'status' => $status,
        'body' => $this->decodeBody($body, $headers),
        'headers' => $headers,
    );
}
/**
 * Undo the transfer and content encodings announced by the response.
 *
 * A "chunked" Transfer-Encoding is decoded first, then a "gzip"
 * Content-Encoding is inflated.
 *
 * @param string      $body    Raw body
 * @param HttpHeaders $headers HTTP headers
 *
 * @return string
 */
public function decodeBody($body, HttpHeaders $headers)
{
    $is_chunked = isset($headers['Transfer-Encoding']) && $headers['Transfer-Encoding'] === 'chunked';

    if ($is_chunked) {
        $body = $this->decodeChunked($body);
    }

    $is_gzipped = isset($headers['Content-Encoding']) && $headers['Content-Encoding'] === 'gzip';

    return $is_gzipped ? gzdecode($body) : $body;
}
/**
 * Decode a chunked body.
 *
 * Each chunk is "<hex size>[;extensions]\r\n<data>\r\n". Chunk extensions
 * are ignored when reading the size. When the string does not start with a
 * valid chunk size line it is returned unchanged (presumably the body was
 * already decoded upstream - TODO confirm against the http stream wrapper);
 * a malformed size line further down stops the decoding and what was decoded
 * so far is returned.
 *
 * @param string $str Raw body
 *
 * @return string Decoded body
 */
public function decodeChunked($str)
{
    $result = '';
    $is_first_chunk = true;
    $remaining = ltrim($str);

    while (!empty($remaining)) {
        // Get the chunk length
        $pos = strpos($remaining, "\r\n");
        $size_field = $pos === false ? '' : substr($remaining, 0, $pos);

        // Drop chunk extensions: "1a;name=value" -> "1a"
        $extension = strpos($size_field, ';');
        if ($extension !== false) {
            $size_field = substr($size_field, 0, $extension);
        }

        $size_field = trim($size_field);

        if ($size_field === '' || !ctype_xdigit($size_field)) {
            if ($is_first_chunk) {
                return $str;
            }

            break;
        }

        $len = (int) hexdec($size_field);

        // Append the chunk to the result
        $result .= substr($remaining, $pos + 2, $len);

        // trim() removes the CRLF that follows the chunk data
        $remaining = trim(substr($remaining, $pos + 2 + $len));
        $is_first_chunk = false;
    }

    return $result;
}
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Client;
/**
* TimeoutException Exception.
*
* Thrown when the request times out (Curl::handleError() for cURL error
* code 28, and Stream::doRequest()).
*
* @author Frederic Guillot
*/
class TimeoutException extends ClientException
{
}

View file

@ -0,0 +1,290 @@
<?php
namespace PicoFeed\Client;
/**
 * URL class.
 *
 * Thin wrapper around parse_url() used to turn relative, absolute and
 * protocol-relative links into absolute URLs.
 *
 * @author Frederic Guillot
 */
class Url
{
    /**
     * URL.
     *
     * @var string
     */
    private $url = '';

    /**
     * URL components.
     *
     * @var array
     */
    private $components = array();

    /**
     * Constructor.
     *
     * @param string $url URL
     */
    public function __construct($url)
    {
        $this->url = $url;
        // parse_url() returns false on seriously malformed URLs
        $this->components = parse_url($url) ?: array();

        // Issue with PHP < 5.4.7 and protocol relative url
        if (version_compare(PHP_VERSION, '5.4.7', '<') && $this->isProtocolRelative()) {
            $pos = strpos($this->components['path'], '/', 2);

            if ($pos === false) {
                $pos = strlen($this->components['path']);
            }

            $this->components['host'] = substr($this->components['path'], 2, $pos - 2);
            $this->components['path'] = substr($this->components['path'], $pos);
        }
    }

    /**
     * Shortcut method to get an absolute url from relative url.
     *
     * @static
     *
     * @param mixed $item_url    Unknown url (can be relative or not), string or Url
     * @param mixed $website_url Website url, string or Url
     *
     * @return string
     */
    public static function resolve($item_url, $website_url)
    {
        $link = is_string($item_url) ? new self($item_url) : $item_url;
        $website = is_string($website_url) ? new self($website_url) : $website_url;

        if ($link->isRelativeUrl()) {
            if ($link->isRelativePath()) {
                return $link->getAbsoluteUrl($website->getBaseUrl($website->getBasePath()));
            }

            return $link->getAbsoluteUrl($website->getBaseUrl());
        } elseif ($link->isProtocolRelative()) {
            $link->setScheme($website->getScheme());
        }

        return $link->getAbsoluteUrl();
    }

    /**
     * Shortcut method to get a base url.
     *
     * @static
     *
     * @param string $url
     *
     * @return string
     */
    public static function base($url)
    {
        $link = new self($url);

        return $link->getBaseUrl();
    }

    /**
     * Get the base URL (scheme, host and port).
     *
     * @param string $suffix Add a suffix to the url
     *
     * @return string Empty string when the url has no host
     */
    public function getBaseUrl($suffix = '')
    {
        return $this->hasHost() ? $this->getScheme('://').$this->getHost().$this->getPort(':').$suffix : '';
    }

    /**
     * Get the absolute URL.
     *
     * @param string $base_url Use this url as base url
     *
     * @return string Empty string when there is neither a base url nor a host
     */
    public function getAbsoluteUrl($base_url = '')
    {
        if ($base_url) {
            $base = new self($base_url);
            // The base ends with "/" and the full path starts with one: drop the latter
            $url = $base->getAbsoluteUrl().substr($this->getFullPath(), 1);
        } else {
            $url = $this->hasHost() ? $this->getBaseUrl().$this->getFullPath() : '';
        }

        return $url;
    }

    /**
     * Return true if the url is relative.
     *
     * @return bool
     */
    public function isRelativeUrl()
    {
        return !$this->hasScheme() && !$this->isProtocolRelative();
    }

    /**
     * Return true if the path is relative (empty or not starting with "/").
     *
     * @return bool
     */
    public function isRelativePath()
    {
        $path = $this->getPath();

        // Square brackets: the curly-brace offset syntax is gone in PHP 8
        return empty($path) || $path[0] !== '/';
    }

    /**
     * Filters the path of a URI.
     *
     * Characters outside of the allowed sets, and "%" signs that do not start
     * a percent-encoded sequence, are percent-encoded.
     *
     * Imported from Guzzle library: https://github.com/guzzle/psr7/blob/master/src/Uri.php#L568-L582
     *
     * @param string $path
     * @param string $charUnreserved Character class body of the unreserved characters
     * @param string $charSubDelims  Character class body of the sub-delimiters
     *
     * @return string
     */
    public function filterPath($path, $charUnreserved = 'a-zA-Z0-9_\-\.~', $charSubDelims = '!\$&\'\(\)\*\+,;=')
    {
        return preg_replace_callback(
            '/(?:[^'.$charUnreserved.$charSubDelims.':@\/%]+|%(?![A-Fa-f0-9]{2}))/',
            function (array $matches) { return rawurlencode($matches[0]); },
            $path
        );
    }

    /**
     * Get the path.
     *
     * @return string
     */
    public function getPath()
    {
        return $this->filterPath(empty($this->components['path']) ? '' : $this->components['path']);
    }

    /**
     * Get the base path (directory part of the path, with a trailing "/").
     *
     * @return string
     */
    public function getBasePath()
    {
        $current_path = $this->getPath();

        $path = $this->isRelativePath() ? '/' : '';
        $path .= substr($current_path, -1) === '/' ? $current_path : dirname($current_path);

        // Collapse "\/" and "//" sequences into a single "/"
        return preg_replace('/\\\\\/|\/\//', '/', $path.'/');
    }

    /**
     * Get the full path (path + querystring + fragment).
     *
     * @return string Always starts with "/"
     */
    public function getFullPath()
    {
        $path = $this->isRelativePath() ? '/' : '';
        $path .= $this->getPath();
        $path .= empty($this->components['query']) ? '' : '?'.$this->components['query'];
        $path .= empty($this->components['fragment']) ? '' : '#'.$this->components['fragment'];

        return $path;
    }

    /**
     * Get the hostname.
     *
     * @return string
     */
    public function getHost()
    {
        return empty($this->components['host']) ? '' : $this->components['host'];
    }

    /**
     * Return true if the url has a hostname.
     *
     * @return bool
     */
    public function hasHost()
    {
        return !empty($this->components['host']);
    }

    /**
     * Get the scheme ("http" when the url has none).
     *
     * @param string $suffix Suffix appended to the scheme
     *
     * @return string
     */
    public function getScheme($suffix = '')
    {
        return ($this->hasScheme() ? $this->components['scheme'] : 'http').$suffix;
    }

    /**
     * Set the scheme.
     *
     * @param string $scheme Set a scheme
     */
    public function setScheme($scheme)
    {
        $this->components['scheme'] = $scheme;
    }

    /**
     * Return true if the url has a scheme.
     *
     * @return bool
     */
    public function hasScheme()
    {
        return !empty($this->components['scheme']);
    }

    /**
     * Get the port.
     *
     * @param string $prefix Prefix to add when there is a port
     *
     * @return string
     */
    public function getPort($prefix = '')
    {
        return $this->hasPort() ? $prefix.$this->components['port'] : '';
    }

    /**
     * Return true if the url has a port.
     *
     * @return bool
     */
    public function hasPort()
    {
        return !empty($this->components['port']);
    }

    /**
     * Return true if the url is protocol relative (start with //).
     *
     * @return bool
     */
    public function isProtocolRelative()
    {
        return strpos($this->url, '//') === 0;
    }
}

View file

@ -0,0 +1,96 @@
<?php
namespace PicoFeed\Config;
/**
* Config class.
*
* @author Frederic Guillot
*
* @method \PicoFeed\Config\Config setClientTimeout(integer $value)
* @method \PicoFeed\Config\Config setClientUserAgent(string $value)
* @method \PicoFeed\Config\Config setMaxRedirections(integer $value)
* @method \PicoFeed\Config\Config setMaxBodySize(integer $value)
* @method \PicoFeed\Config\Config setProxyHostname(string $value)
* @method \PicoFeed\Config\Config setProxyPort(integer $value)
* @method \PicoFeed\Config\Config setProxyUsername(string $value)
* @method \PicoFeed\Config\Config setProxyPassword(string $value)
* @method \PicoFeed\Config\Config setGrabberRulesFolder(string $value)
* @method \PicoFeed\Config\Config setGrabberTimeout(integer $value)
* @method \PicoFeed\Config\Config setGrabberUserAgent(string $value)
* @method \PicoFeed\Config\Config setParserHashAlgo(string $value)
* @method \PicoFeed\Config\Config setContentFiltering(boolean $value)
* @method \PicoFeed\Config\Config setTimezone(string $value)
* @method \PicoFeed\Config\Config setFilterIframeWhitelist(array $value)
* @method \PicoFeed\Config\Config setFilterIntegerAttributes(array $value)
* @method \PicoFeed\Config\Config setFilterAttributeOverrides(array $value)
* @method \PicoFeed\Config\Config setFilterRequiredAttributes(array $value)
* @method \PicoFeed\Config\Config setFilterMediaBlacklist(array $value)
* @method \PicoFeed\Config\Config setFilterMediaAttributes(array $value)
* @method \PicoFeed\Config\Config setFilterSchemeWhitelist(array $value)
* @method \PicoFeed\Config\Config setFilterWhitelistedTags(array $value)
* @method \PicoFeed\Config\Config setFilterBlacklistedTags(array $value)
* @method \PicoFeed\Config\Config setFilterImageProxyUrl($value)
* @method \PicoFeed\Config\Config setFilterImageProxyCallback($closure)
* @method \PicoFeed\Config\Config setFilterImageProxyProtocol($value)
* @method integer getClientTimeout()
* @method string getClientUserAgent()
* @method integer getMaxRedirections()
* @method integer getMaxBodySize()
* @method string getProxyHostname()
* @method integer getProxyPort()
* @method string getProxyUsername()
* @method string getProxyPassword()
* @method string getGrabberRulesFolder()
* @method integer getGrabberTimeout()
* @method string getGrabberUserAgent()
* @method string getParserHashAlgo()
* @method boolean getContentFiltering(bool $default_value)
* @method string getTimezone()
* @method array getFilterIframeWhitelist(array $default_value)
* @method array getFilterIntegerAttributes(array $default_value)
* @method array getFilterAttributeOverrides(array $default_value)
* @method array getFilterRequiredAttributes(array $default_value)
* @method array getFilterMediaBlacklist(array $default_value)
* @method array getFilterMediaAttributes(array $default_value)
* @method array getFilterSchemeWhitelist(array $default_value)
* @method array getFilterWhitelistedTags(array $default_value)
* @method array getFilterBlacklistedTags(array $default_value)
* @method string getFilterImageProxyUrl()
* @method \Closure getFilterImageProxyCallback()
* @method string getFilterImageProxyProtocol()
*/
class Config
{
    /**
     * Contains all parameters, indexed by lowercased parameter name.
     *
     * @var array
     */
    private $container = array();

    /**
     * Magic method to have any kind of setters or getters.
     *
     * The method name is lowercased, its first 3 characters select the
     * operation ("set" or "get") and the remainder is the parameter name.
     *
     * - setXxx($value) stores the value and returns the Config instance.
     *   A missing or null value is ignored, but the instance is still
     *   returned so that fluent chains are never broken.
     * - getXxx($default) returns the stored value, or $default (null when
     *   omitted) if the parameter is not set.
     * - Any other method name returns null.
     *
     * @param string $name      Getter/Setter name
     * @param array  $arguments Method arguments
     *
     * @return mixed
     */
    public function __call($name, array $arguments)
    {
        $name = strtolower($name);
        $prefix = substr($name, 0, 3);
        $parameter = substr($name, 3);

        if ($prefix === 'set') {
            if (isset($arguments[0])) {
                $this->container[$parameter] = $arguments[0];
            }

            // Always return the instance: the documented setters are fluent
            return $this;
        }

        if ($prefix === 'get') {
            $default_value = isset($arguments[0]) ? $arguments[0] : null;

            return isset($this->container[$parameter]) ? $this->container[$parameter] : $default_value;
        }

        return null;
    }
}

View file

@ -0,0 +1,33 @@
<?php
namespace PicoFeed\Encoding;
/**
* Encoding class.
*/
class Encoding
{
    /**
     * Convert a string to UTF-8 and strip the characters that cannot be converted.
     *
     * The input is returned untouched when the encoding is 'utf-8' or an
     * empty string, and also when iconv() fails.
     *
     * @static
     *
     * @param string $input    Data to convert
     * @param string $encoding Encoding of the input
     *
     * @return string
     */
    public static function convert($input, $encoding)
    {
        if ($encoding === 'utf-8' || $encoding === '') {
            return $input;
        }

        // iconv() reports an unsupported charset as a notice on older PHP
        // versions and as a warning on PHP 8, it isn't possible to silence
        // only that message so both levels are suppressed during the call
        set_error_handler(function () {}, E_NOTICE | E_WARNING);

        // convert input to utf-8 and strip invalid characters
        $value = iconv($encoding, 'UTF-8//IGNORE', $input);

        // stop silencing
        restore_error_handler();

        // return input if something went wrong, maybe it's usable anyway
        return $value === false ? $input : $value;
    }
}

View file

@ -0,0 +1,699 @@
<?php
namespace PicoFeed\Filter;
use PicoFeed\Client\Url;
/**
* Attribute Filter class.
*
* @author Frederic Guillot
*/
class Attribute
{
/**
* Image proxy url.
*
* @var string
*/
private $image_proxy_url = '';
/**
* Image proxy callback.
*
* @var \Closure|null
*/
private $image_proxy_callback = null;
/**
* limits the image proxy usage to this protocol.
*
* @var string
*/
private $image_proxy_limit_protocol = '';
/**
* Tags and attribute whitelist.
*
* @var array
*/
private $attribute_whitelist = array(
'audio' => array('controls', 'src'),
'video' => array('poster', 'controls', 'height', 'width', 'src'),
'source' => array('src', 'type'),
'dt' => array(),
'dd' => array(),
'dl' => array(),
'table' => array(),
'caption' => array(),
'tr' => array(),
'th' => array(),
'td' => array(),
'tbody' => array(),
'thead' => array(),
'h2' => array(),
'h3' => array(),
'h4' => array(),
'h5' => array(),
'h6' => array(),
'strong' => array(),
'em' => array(),
'code' => array(),
'pre' => array(),
'blockquote' => array(),
'p' => array(),
'ul' => array(),
'li' => array(),
'ol' => array(),
'br' => array(),
'del' => array(),
'a' => array('href'),
'img' => array('src', 'title', 'alt'),
'figure' => array(),
'figcaption' => array(),
'cite' => array(),
'time' => array('datetime'),
'abbr' => array('title'),
'iframe' => array('width', 'height', 'frameborder', 'src', 'allowfullscreen'),
'q' => array('cite'),
);
/**
* Scheme whitelist.
*
* For a complete list go to http://en.wikipedia.org/wiki/URI_scheme
*
* @var array
*/
private $scheme_whitelist = array(
'bitcoin:',
'callto:',
'ed2k://',
'facetime://',
'feed:',
'ftp://',
'geo:',
'git://',
'http://',
'https://',
'irc://',
'irc6://',
'ircs://',
'jabber:',
'magnet:',
'mailto:',
'nntp://',
'rtmp://',
'sftp://',
'sip:',
'sips:',
'skype:',
'smb://',
'sms:',
'spotify:',
'ssh:',
'steam:',
'svn://',
'tel:',
);
/**
* Iframe source whitelist, everything else is ignored.
*
* @var array
*/
private $iframe_whitelist = array(
'http://www.youtube.com',
'https://www.youtube.com',
'http://player.vimeo.com',
'https://player.vimeo.com',
'http://www.dailymotion.com',
'https://www.dailymotion.com',
'http://vk.com',
'https://vk.com',
);
/**
* Blacklisted resources.
*
* @var array
*/
private $media_blacklist = array(
'api.flattr.com',
'feeds.feedburner.com',
'share.feedsportal.com',
'da.feedsportal.com',
'rc.feedsportal.com',
'rss.feedsportal.com',
'res.feedsportal.com',
'res1.feedsportal.com',
'res2.feedsportal.com',
'res3.feedsportal.com',
'pi.feedsportal.com',
'rss.nytimes.com',
'feeds.wordpress.com',
'stats.wordpress.com',
'rss.cnn.com',
'twitter.com/home?status=',
'twitter.com/share',
'twitter_icon_large.png',
'www.facebook.com/sharer.php',
'facebook_icon_large.png',
'plus.google.com/share',
'www.gstatic.com/images/icons/gplus-16.png',
'www.gstatic.com/images/icons/gplus-32.png',
'www.gstatic.com/images/icons/gplus-64.png',
);
/**
* Attributes used for external resources.
*
* @var array
*/
private $media_attributes = array(
'src',
'href',
'poster',
);
/**
* Attributes that must be integer.
*
* @var array
*/
private $integer_attributes = array(
'width',
'height',
'frameborder',
);
/**
* Mandatory attributes for specified tags.
*
* @var array
*/
private $required_attributes = array(
'a' => array('href'),
'img' => array('src'),
'iframe' => array('src'),
'audio' => array('src'),
'source' => array('src'),
);
/**
* Add attributes to specified tags.
*
* @var array
*/
private $add_attributes = array(
'a' => array('rel' => 'noreferrer', 'target' => '_blank'),
'video' => array('controls' => 'true'),
);
/**
* List of filters to apply.
*
* @var array
*/
private $filters = array(
'filterAllowedAttribute',
'filterIntegerAttribute',
'rewriteAbsoluteUrl',
'filterIframeAttribute',
'filterBlacklistResourceAttribute',
'filterProtocolUrlAttribute',
'rewriteImageProxyUrl',
'secureIframeSrc',
'removeYouTubeAutoplay',
);
/**
* Website url instance (given to Url::resolve() when resource urls are rewritten).
*
* @var \PicoFeed\Client\Url
*/
private $website;
/**
* Constructor.
*
* Keeps the website url instance, rewriteAbsoluteUrl() passes it to Url::resolve().
*
* @param \PicoFeed\Client\Url $website Website url instance
*/
public function __construct(Url $website)
{
$this->website = $website;
}
/**
 * Apply filters to the attributes list.
 *
 * Every filter listed in $this->filters is called in order for each
 * attribute. The attribute is removed as soon as one filter returns false,
 * the remaining filters are then skipped for this attribute. Filters that
 * take the value by reference may rewrite it.
 *
 * @param string $tag        Tag name
 * @param array  $attributes Attributes dictionary
 *
 * @return array Filtered attributes
 */
public function filter($tag, array $attributes)
{
    foreach (array_keys($attributes) as $name) {
        foreach ($this->filters as $method) {
            // The array element itself is handed over so rewrites are kept
            if (!$this->$method($tag, $name, $attributes[$name])) {
                unset($attributes[$name]);
                break;
            }
        }
    }

    return $attributes;
}
/**
 * Return true if the attribute is whitelisted for the tag (remove not allowed attributes).
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (not used)
 *
 * @return bool
 */
public function filterAllowedAttribute($tag, $attribute, $value)
{
    if (!isset($this->attribute_whitelist[$tag])) {
        return false;
    }

    return in_array($attribute, $this->attribute_whitelist[$tag]);
}
/**
 * Return false if an integer attribute has a value that is not made of digits only
 * (remove attributes that should have an integer value).
 *
 * Attributes that are not listed as integer attributes are always accepted.
 *
 * @param string $tag       Tag name (not used)
 * @param string $attribute Attribute name
 * @param string $value     Attribute value
 *
 * @return bool
 */
public function filterIntegerAttribute($tag, $attribute, $value)
{
    if (!in_array($attribute, $this->integer_attributes)) {
        return true;
    }

    return ctype_digit($value);
}
/**
 * Return true if the iframe source is allowed (remove not allowed iframe).
 *
 * The source must start with one of the whitelisted urls. When the
 * whitelisted url has no path (by example "https://www.youtube.com"), the
 * character that follows it must be a delimiter ("/", "?", "#") or the end
 * of the value. Otherwise a source like "https://www.youtube.com.evil.tld/"
 * or "https://www.youtube.com@evil.tld/" would be accepted even though it
 * points to another host.
 *
 * Everything that is not the "src" attribute of an iframe is accepted.
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value
 *
 * @return bool
 */
public function filterIframeAttribute($tag, $attribute, $value)
{
    if ($tag !== 'iframe' || $attribute !== 'src') {
        return true;
    }

    foreach ($this->iframe_whitelist as $url) {
        if (strpos($value, $url) !== 0) {
            continue;
        }

        // The whitelisted url already contains a path, the host cannot be extended
        if (parse_url($url, PHP_URL_PATH) !== null) {
            return true;
        }

        // Cast because substr() returns false instead of '' on old PHP versions
        $next = (string) substr($value, strlen($url), 1);

        if ($next === '' || $next === '/' || $next === '?' || $next === '#') {
            return true;
        }
    }

    return false;
}
/**
 * Return false if the attribute is a resource attribute with a blacklisted value
 * (remove blacklisted resource attributes).
 *
 * @param string $tag       Tag name (not used)
 * @param string $attribute Attribute name
 * @param string $value     Attribute value
 *
 * @return bool
 */
public function filterBlacklistResourceAttribute($tag, $attribute, $value)
{
    return !($this->isResource($attribute) && $this->isBlacklistedMedia($value));
}
/**
 * Pass the value of resource attributes through Url::resolve() with the website url
 * (convert relative links to absolute url).
 *
 * The value is modified in place, other attributes are left untouched.
 *
 * @param string $tag       Tag name (not used)
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (by reference)
 *
 * @return bool Always true
 */
public function rewriteAbsoluteUrl($tag, $attribute, &$value)
{
    if (!$this->isResource($attribute)) {
        return true;
    }

    $value = Url::resolve($value, $this->website);

    return true;
}
/**
 * Turns iframes' src attribute from http to https to prevent
 * mixed active content.
 *
 * The value is modified in place.
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (by reference)
 *
 * @return bool Always true
 */
public function secureIframeSrc($tag, $attribute, &$value)
{
    $is_iframe_source = $tag === 'iframe' && $attribute === 'src';

    if ($is_iframe_source && strpos($value, 'http://') === 0) {
        // Keep everything after "http" and put "https" in front of it
        $value = 'https'.substr($value, 4);
    }

    return true;
}
/**
 * Removes YouTube autoplay from iframes.
 *
 * When the src of an iframe matches the YouTube pattern, "autoplay=1" is
 * rewritten to "autoplay=0". The value is modified in place.
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (by reference)
 *
 * @return bool Always true
 */
public function removeYouTubeAutoplay($tag, $attribute, &$value)
{
    if ($tag !== 'iframe' || $attribute !== 'src') {
        return true;
    }

    $regex = '%^(https://(?:www\.)?youtube.com/.*\?.*autoplay=)(1)(.*)%i';

    if (preg_match($regex, $value)) {
        $value = preg_replace($regex, '${1}0$3', $value);
    }

    return true;
}
/**
 * Rewrite image url to use with a proxy.
 *
 * Only the src of img tags is handled. When a protocol restriction is set,
 * values that do not start with "<protocol>:" (case-insensitive) are left
 * untouched. The proxy url (sprintf format fed with the raw-url-encoded
 * value) has priority over the proxy callback. The value is modified in place.
 *
 * @param string $tag       Tag name
 * @param string $attribute Attribute name
 * @param string $value     Attribute value (by reference)
 *
 * @return bool Always true
 */
public function rewriteImageProxyUrl($tag, $attribute, &$value)
{
    if ($tag !== 'img' || $attribute !== 'src') {
        return true;
    }

    $protocol = $this->image_proxy_limit_protocol;

    if ($protocol !== '' && stripos($value, $protocol.':') !== 0) {
        return true;
    }

    if ($this->image_proxy_url) {
        $value = sprintf($this->image_proxy_url, rawurlencode($value));
    } elseif (is_callable($this->image_proxy_callback)) {
        $value = call_user_func($this->image_proxy_callback, $value);
    }

    return true;
}
/**
 * Return true if the scheme is authorized.
 *
 * Only resource attributes are checked, other attributes are always accepted.
 *
 * @param string $tag       Tag name (not used)
 * @param string $attribute Attribute name
 * @param string $value     Attribute value
 *
 * @return bool
 */
public function filterProtocolUrlAttribute($tag, $attribute, $value)
{
    return !$this->isResource($attribute) || $this->isAllowedProtocol($value);
}
/**
 * Automatically add some attributes for specific tags.
 *
 * The attributes are merged with the array union operator: an attribute
 * that is already present keeps its value, only missing ones are added.
 *
 * @param string $tag        Tag name
 * @param array  $attributes Attributes list
 *
 * @return array
 */
public function addAttributes($tag, array $attributes)
{
    if (!isset($this->add_attributes[$tag])) {
        return $attributes;
    }

    return $attributes + $this->add_attributes[$tag];
}
/**
 * Return true if all required attributes are present.
 *
 * Tags without required attributes are always accepted.
 *
 * @param string $tag        Tag name
 * @param array  $attributes Attributes list
 *
 * @return bool
 */
public function hasRequiredAttributes($tag, array $attributes)
{
    if (!isset($this->required_attributes[$tag])) {
        return true;
    }

    foreach ($this->required_attributes[$tag] as $required) {
        if (!isset($attributes[$required])) {
            return false;
        }
    }

    return true;
}
/**
* Check if an attribute name is an external resource.
*
* The name is looked up in the media attributes list ('src', 'href' and
* 'poster' by default, see setMediaAttributes()).
*
* @param string $attribute Attribute name
*
* @return bool
*/
public function isResource($attribute)
{
return in_array($attribute, $this->media_attributes);
}
/**
 * Detect if the protocol is allowed or not.
 *
 * The value must start with one of the whitelisted schemes (case-sensitive).
 *
 * @param string $value Attribute value
 *
 * @return bool
 */
public function isAllowedProtocol($value)
{
    $allowed = false;

    foreach ($this->scheme_whitelist as $scheme) {
        if (strpos($value, $scheme) === 0) {
            $allowed = true;
            break;
        }
    }

    return $allowed;
}
/**
 * Detect if an url is blacklisted.
 *
 * The url is blacklisted when it contains one of the blacklist entries, at any position.
 *
 * @param string $resource Attribute value (URL)
 *
 * @return bool
 */
public function isBlacklistedMedia($resource)
{
    $blacklisted = false;

    foreach ($this->media_blacklist as $entry) {
        if (strpos($resource, $entry) !== false) {
            $blacklisted = true;
            break;
        }
    }

    return $blacklisted;
}
/**
 * Convert the attribute list to html.
 *
 * Each attribute is rendered as name="value", the value goes through
 * Filter::escape() and the pairs are separated by a space.
 *
 * @param array $attributes Attributes
 *
 * @return string
 */
public function toHtml(array $attributes)
{
    $pairs = array();

    foreach ($attributes as $name => $raw_value) {
        $pairs[] = $name.'="'.Filter::escape($raw_value).'"';
    }

    return implode(' ', $pairs);
}
/**
 * Set whitelisted tags and attributes for each tag.
 *
 * An empty array is ignored and the current whitelist is kept.
 *
 * @param array $values List of tags: ['video' => ['src', 'cover'], 'img' => ['src']]
 *
 * @return Attribute
 */
public function setWhitelistedAttributes(array $values)
{
    if (!empty($values)) {
        $this->attribute_whitelist = $values;
    }

    return $this;
}
/**
 * Set scheme whitelist.
 *
 * An empty array is ignored and the current whitelist is kept.
 *
 * @param array $values List of scheme: ['http://', 'ftp://']
 *
 * @return Attribute
 */
public function setSchemeWhitelist(array $values)
{
    if (!empty($values)) {
        $this->scheme_whitelist = $values;
    }

    return $this;
}
/**
 * Set media attributes (used to load external resources).
 *
 * An empty array is ignored and the current list is kept.
 *
 * @param array $values List of values: ['src', 'href']
 *
 * @return Attribute
 */
public function setMediaAttributes(array $values)
{
    if (!empty($values)) {
        $this->media_attributes = $values;
    }

    return $this;
}
/**
 * Set blacklisted external resources.
 *
 * An empty array is ignored and the current blacklist is kept.
 *
 * @param array $values List of url parts: ['http://google.com/', '...']
 *
 * @return Attribute
 */
public function setMediaBlacklist(array $values)
{
    if (!empty($values)) {
        $this->media_blacklist = $values;
    }

    return $this;
}
/**
 * Set mandatory attributes for whitelisted tags.
 *
 * An empty array is ignored and the current list is kept.
 *
 * @param array $values List of tags: ['img' => ['src']]
 *
 * @return Attribute
 */
public function setRequiredAttributes(array $values)
{
    if (!empty($values)) {
        $this->required_attributes = $values;
    }

    return $this;
}
/**
 * Set attributes to add automatically to specific tags.
 *
 * An empty array is ignored and the current list is kept.
 *
 * @param array $values List of tags: ['a' => ['target' => '_blank']]
 *
 * @return Attribute
 */
public function setAttributeOverrides(array $values)
{
    if (!empty($values)) {
        $this->add_attributes = $values;
    }

    return $this;
}
/**
 * Set attributes that must be an integer.
 *
 * An empty array is ignored and the current list is kept.
 *
 * @param array $values List of attributes: ['width', 'height']
 *
 * @return Attribute
 */
public function setIntegerAttributes(array $values)
{
    if (!empty($values)) {
        $this->integer_attributes = $values;
    }

    return $this;
}
/**
 * Set allowed iframe resources.
 *
 * An empty array is ignored and the current whitelist is kept.
 *
 * @param array $values List of urls: ['http://www.youtube.com']
 *
 * @return Attribute
 */
public function setIframeWhitelist(array $values)
{
    if (!empty($values)) {
        $this->iframe_whitelist = $values;
    }

    return $this;
}
/**
 * Set image proxy URL.
 *
 * The url is used as a sprintf format, the raw-url-encoded image url is
 * its only argument. An empty value is ignored and the current url is kept.
 *
 * @param string $url Proxy URL
 *
 * @return Attribute
 */
public function setImageProxyUrl($url)
{
    if ($url) {
        $this->image_proxy_url = $url;
    }

    return $this;
}
/**
 * Set image proxy callback.
 *
 * The callback receives the image url and returns the rewritten url. An
 * empty value is ignored and the current callback is kept.
 *
 * @param \Closure $callback
 *
 * @return Attribute
 */
public function setImageProxyCallback($callback)
{
    if ($callback) {
        $this->image_proxy_callback = $callback;
    }

    return $this;
}
/**
 * Set image proxy protocol restriction.
 *
 * When set, only image urls that start with "<protocol>:" are sent to the
 * proxy. An empty value is ignored and the current restriction is kept.
 *
 * @param string $value Protocol name, by example 'http'
 *
 * @return Attribute
 */
public function setImageProxyProtocol($value)
{
    if ($value) {
        $this->image_proxy_limit_protocol = $value;
    }

    return $this;
}
}

View file

@ -0,0 +1,155 @@
<?php
namespace PicoFeed\Filter;
/**
* Filter class.
*
* @author Frederic Guillot
*/
class Filter
{
    /**
     * Get the Html filter instance.
     *
     * @static
     *
     * @param string $html    HTML content
     * @param string $website Site URL (used to build absolute URL)
     *
     * @return Html
     */
    public static function html($html, $website)
    {
        return new Html($html, $website);
    }

    /**
     * Escape HTML content.
     *
     * Quotes are converted too and existing entities are not encoded twice.
     *
     * @static
     *
     * @param string $content Raw content
     *
     * @return string
     */
    public static function escape($content)
    {
        return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the doctype and the opening/closing html, head and body tags
     * (with the whitespace that follows them).
     *
     * Unlike the other helpers of this class, this is an instance method.
     *
     * @param string $data Input data
     *
     * @return string
     */
    public function removeHTMLTags($data)
    {
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML tag from a document.
     *
     * Everything up to the end of the XML declaration is dropped, then the
     * same is done for the xml-stylesheet instructions, as long as they are
     * found in the first 200 bytes of the remaining data.
     *
     * An instruction without its closing "?>" is left untouched.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripXmlTag($data)
    {
        $start = strpos($data, '<?xml');

        if ($start !== false) {
            // Look for the terminator after the declaration, not before it
            $end = strpos($data, '?>', $start);

            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        }

        do {
            $stripped = false;
            $pos = strpos($data, '<?xml-stylesheet ');

            if ($pos !== false) {
                $end = strpos($data, '?>', $pos);

                // Without terminator there is nothing safe to remove
                if ($end !== false) {
                    $data = ltrim(substr($data, $end + 2));
                    $stripped = true;
                }
            }
        } while ($stripped && $pos < 200);

        return $data;
    }

    /**
     * Strip head tag from the HTML content.
     *
     * The input is returned untouched when the regex engine fails (by
     * example when the data is not valid utf-8).
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function stripHeadTags($data)
    {
        $result = preg_replace('@<head[^>]*?>.*?</head>@siu', '', $data);

        return $result === null ? $data : $result;
    }

    /**
     * Replace carriage returns, tabs and new lines by a space, then trim
     * the string. Only single bytes are replaced so utf-8 strings are not broken.
     *
     * @static
     *
     * @param string $value Raw data
     *
     * @return string Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        // a regex like /\s+/ is not used here because it breaks utf-8
        return trim(str_replace(array("\r", "\t", "\n"), ' ', $value));
    }

    /**
     * Fixes before XML parsing.
     *
     * Removes the numeric entities and the characters that are not valid in XML 1.0.
     *
     * @static
     *
     * @param string $data Raw data
     *
     * @return string Normalized data
     */
    public static function normalizeData($data)
    {
        $entities = array(
            '/(&#)(\d+);/m', // decimal encoded
            '/(&#x)([a-f0-9]+);/mi', // hex encoded
        );

        // strip invalid XML 1.0 characters which are encoded as entities
        $data = preg_replace_callback($entities, function ($matches) {
            $code_point = $matches[2];

            // convert hex entity to decimal
            if (strtolower($matches[1]) === '&#x') {
                $code_point = hexdec($code_point);
            }

            $code_point = (int) $code_point;

            // replace invalid characters
            if ($code_point < 9
                || ($code_point > 10 && $code_point < 13)
                || ($code_point > 13 && $code_point < 32)
                || ($code_point > 55295 && $code_point < 57344)
                || ($code_point > 65533 && $code_point < 65536)
                || $code_point > 1114111
            ) {
                return '';
            }

            return $matches[0];
        }, $data);

        // strip every utf-8 character than isn't in the range of valid XML 1.0 characters
        return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
    }
}

View file

@ -0,0 +1,243 @@
<?php
namespace PicoFeed\Filter;
use PicoFeed\Config\Config;
use PicoFeed\Client\Url;
use PicoFeed\Scraper\RuleLoader;
use PicoFeed\Parser\XmlParser;
/**
* HTML Filter class.
*
* @author Frederic Guillot
*/
class Html
{
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Unfiltered XML data.
*
* @var string
*/
private $input = '';
/**
* Filtered XML data.
*
* @var string
*/
private $output = '';
/**
* List of empty tags.
*
* @var array
*/
private $empty_tags = array();
/**
* Empty flag.
*
* @var bool
*/
private $empty = true;
/**
* Tag instance.
*
* @var \PicoFeed\Filter\Tag
*/
public $tag = '';
/**
* Attribute instance.
*
* @var \PicoFeed\Filter\Attribute
*/
public $attribute = '';
/**
* The website to filter.
*
* @var string
*/
private $website;
/**
* Initialize the filter, all inputs data must be encoded in UTF-8 before.
*
* A default Config is created here, it can be replaced with setConfig().
*
* @param string $html HTML content
* @param string $website Site URL (used to build absolute URL)
*/
public function __construct($html, $website)
{
$this->config = new Config();
// The HTML is converted by XmlParser::htmlToXml() before being stored as input
$this->input = XmlParser::htmlToXml($html);
$this->output = '';
// The tag filter receives the default config created above
$this->tag = new Tag($this->config);
$this->website = $website;
$this->attribute = new Attribute(new Url($website));
}
/**
 * Set config object.
 *
 * The filter settings of the config are copied to the attribute filter and
 * the whitelisted tag names to the tag filter. Nothing is copied when the
 * config is null. The tag filter keeps the Config instance it received in
 * the constructor.
 *
 * @param \PicoFeed\Config\Config $config Config instance
 *
 * @return \PicoFeed\Filter\Html
 */
public function setConfig($config)
{
    $this->config = $config;

    if ($config === null) {
        return $this;
    }

    $attribute = $this->attribute;

    $attribute->setImageProxyCallback($config->getFilterImageProxyCallback());
    $attribute->setImageProxyUrl($config->getFilterImageProxyUrl());
    $attribute->setImageProxyProtocol($config->getFilterImageProxyProtocol());
    $attribute->setIframeWhitelist($config->getFilterIframeWhitelist(array()));
    $attribute->setIntegerAttributes($config->getFilterIntegerAttributes(array()));
    $attribute->setAttributeOverrides($config->getFilterAttributeOverrides(array()));
    $attribute->setRequiredAttributes($config->getFilterRequiredAttributes(array()));
    $attribute->setMediaBlacklist($config->getFilterMediaBlacklist(array()));
    $attribute->setMediaAttributes($config->getFilterMediaAttributes(array()));
    $attribute->setSchemeWhitelist($config->getFilterSchemeWhitelist(array()));

    // The same setting feeds both filters: tag => attributes for the
    // attribute filter, tag names only for the tag filter
    $whitelisted_tags = $config->getFilterWhitelistedTags(array());
    $attribute->setWhitelistedAttributes($whitelisted_tags);
    $this->tag->setWhitelistedTags(array_keys($whitelisted_tags));

    return $this;
}
/**
* Run tags/attributes filtering.
*
* Calls preFilter(), parses the input with an XML parser whose callbacks
* are startTag(), endTag() and dataTag(), then calls postFilter().
*
* @return string Content of the output buffer after postFilter()
*/
public function execute()
{
$this->preFilter();
$parser = xml_parser_create();
// The handlers below are methods of this object
xml_set_object($parser, $this);
xml_set_element_handler($parser, 'startTag', 'endTag');
xml_set_character_data_handler($parser, 'dataTag');
// Keep the tag names as written instead of upper-casing them
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
// The whole input is given in one call (third argument), the result of the parsing is not checked
xml_parse($parser, $this->input, true);
xml_parser_free($parser);
$this->postFilter();
return $this->output;
}
/**
* Called before XML parsing.
*
* Replaces the input by the result of Tag::removeBlacklistedTags().
*/
public function preFilter()
{
$this->input = $this->tag->removeBlacklistedTags($this->input);
}
/**
 * Called after XML parsing.
 *
 * Cleans the output in this order: empty tags removal, website filter
 * rules, multiple line breaks removal, trim.
 */
public function postFilter()
{
    $html = $this->tag->removeEmptyTags($this->output);
    $html = $this->filterRules($html);
    $html = $this->tag->removeMultipleBreakTags($html);

    $this->output = trim($html);
}
/**
 * Called after XML parsing.
 *
 * Loads the rules of the website with RuleLoader. Each entry of the
 * 'filter' rules maps a regex, tested against the full path of the website
 * url, to a list of search => replace regexes applied to the content.
 *
 * @param string $content the content that should be filtered
 *
 * @return string Filtered content
 */
public function filterRules($content)
{
    // the constructor should require a config, then this fallback can be removed
    $config = $this->config === null ? new Config() : $this->config;

    $loader = new RuleLoader($config);
    $rules = $loader->getRules($this->website);

    $url = new Url($this->website);
    $sub_url = $url->getFullPath();

    if (!isset($rules['filter'])) {
        return $content;
    }

    foreach ($rules['filter'] as $pattern => $replacements) {
        if (!preg_match($pattern, $sub_url)) {
            continue;
        }

        foreach ($replacements as $search => $replace) {
            $content = preg_replace($search, $replace, $content);
        }
    }

    return $content;
}
/**
 * Parse opening tag.
 *
 * The tag is written to the output only if it is allowed and still has its
 * required attributes once they are filtered. The "empty" flag (true when
 * nothing was written) is pushed on the stack read by endTag().
 *
 * @param resource $parser     XML parser
 * @param string   $tag        Tag name
 * @param array    $attributes Tag attributes
 */
public function startTag($parser, $tag, array $attributes)
{
    $this->empty = true;

    if ($this->tag->isAllowed($tag, $attributes)) {
        $filtered = $this->attribute->filter($tag, $attributes);

        if ($this->attribute->hasRequiredAttributes($tag, $filtered)) {
            $completed = $this->attribute->addAttributes($tag, $filtered);
            $html = $this->attribute->toHtml($completed);

            $this->output .= $this->tag->openHtmlTag($tag, $html);
            $this->empty = false;
        }
    }

    $this->empty_tags[] = $this->empty;
}
/**
 * Parse closing tag.
 *
 * Pops the flag pushed by startTag(): the closing tag is written only if
 * the opening tag was written and the tag is allowed.
 *
 * @param resource $parser XML parser
 * @param string   $tag    Tag name
 */
public function endTag($parser, $tag)
{
    $was_skipped = array_pop($this->empty_tags);

    if ($was_skipped) {
        return;
    }

    if ($this->tag->isAllowedTag($tag)) {
        $this->output .= $this->tag->closeHtmlTag($tag);
    }
}
/**
 * Parse tag content.
 *
 * The text is escaped and appended to the output.
 *
 * @param resource $parser  XML parser
 * @param string   $content Tag content
 */
public function dataTag($parser, $content)
{
    // "\xc2\xa0" is the utf-8 non-breaking space (&nbsp;), a normal space is used instead
    $text = str_replace("\xc2\xa0", ' ', $content);

    $this->output .= Filter::escape($text);
}
}

View file

@ -0,0 +1,227 @@
<?php
namespace PicoFeed\Filter;
use DOMXpath;
use PicoFeed\Parser\XmlParser;
use PicoFeed\Config\Config;
/**
* Tag Filter class.
*
* @author Frederic Guillot
*/
class Tag
{
/**
* Config object.
*
* @var \PicoFeed\Config\Config
*/
private $config;
/**
* Tags blacklist (Xpath expressions).
*
* @var array
*/
private $tag_blacklist = array(
'//script',
'//style',
);
/**
* Tags whitelist.
*
* @var array
*/
private $tag_whitelist = array(
'audio',
'video',
'source',
'dt',
'dd',
'dl',
'table',
'caption',
'tr',
'th',
'td',
'tbody',
'thead',
'h2',
'h3',
'h4',
'h5',
'h6',
'strong',
'em',
'code',
'pre',
'blockquote',
'p',
'ul',
'li',
'ol',
'br',
'del',
'a',
'img',
'figure',
'figcaption',
'cite',
'time',
'abbr',
'iframe',
'q',
);
/**
* Constructor.
*
* @param \PicoFeed\Config\Config $config Config instance, isAllowedTag() reads its whitelisted tags
*/
public function __construct(Config $config)
{
$this->config = $config;
}
/**
 * Check if the tag is allowed and is not a pixel tracker.
 *
 * @param string $tag        Tag name
 * @param array  $attributes Attributes dictionary
 *
 * @return bool
 */
public function isAllowed($tag, array $attributes)
{
    if (!$this->isAllowedTag($tag)) {
        return false;
    }

    return !$this->isPixelTracker($tag, $attributes);
}
/**
 * Return the HTML opening tag.
 *
 * Self-closing tags end with "/>", the others with ">".
 *
 * @param string $tag        Tag name
 * @param string $attributes Attributes converted in html
 *
 * @return string
 */
public function openHtmlTag($tag, $attributes = '')
{
    $html = '<'.$tag;

    if (!empty($attributes)) {
        $html .= ' '.$attributes;
    }

    $html .= $this->isSelfClosingTag($tag) ? '/>' : '>';

    return $html;
}
/**
 * Return the HTML closing tag.
 *
 * @param string $tag Tag name
 *
 * @return string Closing tag, or an empty string for self-closing tags
 */
public function closeHtmlTag($tag)
{
    if ($this->isSelfClosingTag($tag)) {
        return '';
    }

    return '</'.$tag.'>';
}
/**
 * Return true if the tag is self-closing ("br" and "img" only).
 *
 * @param string $tag Tag name
 *
 * @return bool
 */
public function isSelfClosingTag($tag)
{
    return in_array($tag, array('br', 'img'), true);
}
/**
 * Check if a tag is on the whitelist.
 *
 * The whitelist of this instance is completed with the tag names found in
 * the whitelisted tags of the config.
 *
 * @param string $tag Tag name
 *
 * @return bool
 */
public function isAllowedTag($tag)
{
    $configured_tags = array_keys($this->config->getFilterWhitelistedTags(array()));
    $allowed_tags = array_merge($this->tag_whitelist, $configured_tags);

    return in_array($tag, $allowed_tags);
}
/**
 * Detect if an image tag is a pixel tracker.
 *
 * An img tag is a pixel tracker when both its height and width are equal
 * to 1 (loose comparison, the attributes are strings).
 *
 * @param string $tag        Tag name
 * @param array  $attributes Tag attributes
 *
 * @return bool
 */
public function isPixelTracker($tag, array $attributes)
{
    if ($tag !== 'img') {
        return false;
    }

    if (!isset($attributes['height'], $attributes['width'])) {
        return false;
    }

    return $attributes['height'] == 1 && $attributes['width'] == 1;
}
/**
 * Remove the blacklisted tags (script and style by default).
 *
 * The data is loaded with XmlParser::getDomDocument(), every node matched
 * by one of the blacklist Xpath expressions is removed from its parent and
 * the document is serialized back.
 *
 * @param string $data Input data
 *
 * @return string Serialized document, or an empty string if the data cannot be loaded
 */
public function removeBlacklistedTags($data)
{
    $dom = XmlParser::getDomDocument($data);

    if ($dom === false) {
        return '';
    }

    // All the expressions are combined in a single union query
    $expression = implode(' | ', $this->tag_blacklist);
    $xpath = new DOMXpath($dom);

    foreach ($xpath->query($expression) as $blacklisted_node) {
        $blacklisted_node->parentNode->removeChild($blacklisted_node);
    }

    return $dom->saveXML();
}
/**
 * Remove empty tags.
 *
 * A tag is empty when it contains only whitespace or other empty tags
 * (the pattern is recursive).
 *
 * The data is returned untouched when the regex engine fails (by example
 * when the backtrack limit is reached), instead of the null returned by
 * preg_replace() that would wipe the whole content.
 *
 * @param string $data Input data
 *
 * @return string
 */
public function removeEmptyTags($data)
{
    $result = preg_replace('/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', '', $data);

    return $result === null ? $data : $result;
}
/**
 * Replace every run of br tags (and the whitespace that follows each of
 * them) by a single <br/>.
 *
 * @param string $data Input data
 *
 * @return string
 */
public function removeMultipleBreakTags($data)
{
    $consecutive_breaks = "/(<br\s*\/?>\s*)+/";

    return preg_replace($consecutive_breaks, '<br/>', $data);
}
/**
 * Set whitelisted tags.
 *
 * An empty array is ignored and the current whitelist is kept.
 *
 * @param array $values List of tag names: ['video', 'img']
 *
 * @return Tag
 */
public function setWhitelistedTags(array $values)
{
    if (!empty($values)) {
        $this->tag_whitelist = $values;
    }

    return $this;
}
}

View file

@ -0,0 +1,114 @@
<?php
namespace PicoFeed\Logging;
use DateTime;
use DateTimeZone;
/**
* Logging class.
*
* @author Frederic Guillot
*/
class Logger
{
    /**
     * Logged messages, each one is prefixed by its date.
     *
     * @static
     *
     * @var array
     */
    private static $messages = array();

    /**
     * Timezone used for the date of the messages.
     *
     * @static
     *
     * @var string
     */
    private static $timezone = 'UTC';

    /**
     * Logging flag, messages are dropped while it is false.
     *
     * @static
     *
     * @var bool
     */
    public static $enable = false;

    /**
     * Turn the logging on.
     *
     * @static
     */
    public static function enable()
    {
        self::$enable = true;
    }

    /**
     * Store a message prefixed by "[Y-m-d H:i:s] " (current time in the
     * configured timezone). Nothing is stored while logging is disabled.
     *
     * @static
     *
     * @param string $message Message
     */
    public static function setMessage($message)
    {
        if (!self::$enable) {
            return;
        }

        $now = new DateTime('now', new DateTimeZone(self::$timezone));
        $timestamp = $now->format('Y-m-d H:i:s');

        self::$messages[] = '['.$timestamp.'] '.$message;
    }

    /**
     * Return the list of logged messages.
     *
     * @static
     *
     * @return array
     */
    public static function getMessages()
    {
        return self::$messages;
    }

    /**
     * Empty the list of logged messages.
     *
     * @static
     */
    public static function deleteMessages()
    {
        self::$messages = array();
    }

    /**
     * Change the timezone, an empty value keeps the current one.
     *
     * @static
     *
     * @see http://php.net/manual/en/timezones.php
     *
     * @param string $timezone Timezone
     */
    public static function setTimeZone($timezone)
    {
        if ($timezone) {
            self::$timezone = $timezone;
        }
    }

    /**
     * Join the messages with the platform end of line, one more end of line
     * is always appended.
     *
     * @static
     *
     * @return string
     */
    public static function toString()
    {
        $lines = implode(PHP_EOL, self::$messages);

        return $lines.PHP_EOL;
    }
}

View file

@ -0,0 +1,366 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
* Atom parser.
*
* @author Frederic Guillot
*/
class Atom extends Parser
{
/**
* Supported namespaces.
*/
protected $namespaces = array(
'atom' => 'http://www.w3.org/2005/Atom',
);
/**
 * Get the path to the items XML tree.
 *
 * The namespaced "atom:entry" query is tried first, "entry" is the fallback.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement
 */
public function getItemsTree(SimpleXMLElement $xml)
{
    $entries = XmlParser::getXPathResult($xml, 'atom:entry', $this->namespaces);

    if (!$entries) {
        $entries = XmlParser::getXPathResult($xml, 'entry');
    }

    return $entries;
}
/**
 * Find the feed url (url returned by getUrl() for the "self" relation).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
    $self_url = $this->getUrl($xml, 'self');

    $feed->feed_url = $self_url;
}
/**
 * Find the site url (url returned by getUrl() for the "alternate" relation).
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
{
    $alternate_url = $this->getUrl($xml, 'alternate', true);

    $feed->site_url = $alternate_url;
}
/**
 * Find the feed description.
 *
 * The namespaced "atom:subtitle" query is tried first, "subtitle" is the fallback.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'atom:subtitle', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'subtitle');
    }

    $feed->description = (string) current($nodes);
}
/**
* Find the feed logo url.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
{
$logo = XmlParser::getXPathResult($xml, 'atom:logo', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'logo');
$feed->logo = (string) current($logo);
}
/**
* Find the feed icon.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
{
$icon = XmlParser::getXPathResult($xml, 'atom:icon', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'icon');
$feed->icon = (string) current($icon);
}
/**
* Find the feed title.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
$title = XmlParser::getXPathResult($xml, 'atom:title', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'title');
$feed->title = Filter::stripWhiteSpace((string) current($title)) ?: $feed->getSiteUrl();
}
/**
* Find the feed language.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
$language = XmlParser::getXPathResult($xml, '*[not(self::atom:entry)]/@xml:lang', $this->namespaces)
?: XmlParser::getXPathResult($xml, '@xml:lang');
$feed->language = (string) current($language);
}
/**
* Find the feed id.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
$id = XmlParser::getXPathResult($xml, 'atom:id', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'id');
$feed->id = (string) current($id);
}
/**
* Find the feed date.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
$updated = XmlParser::getXPathResult($xml, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'updated');
$feed->date = $this->date->getDateTime((string) current($updated));
}
/**
* Find the item date.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$published = XmlParser::getXPathResult($entry, 'atom:published', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'published');
$updated = XmlParser::getXPathResult($entry, 'atom:updated', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'updated');
$published = !empty($published) ? $this->date->getDateTime((string) current($published)) : null;
$updated = !empty($updated) ? $this->date->getDateTime((string) current($updated)) : null;
if ($published === null && $updated === null) {
$item->date = $feed->getDate(); // We use the feed date if there is no date for the item
} elseif ($published !== null && $updated !== null) {
$item->date = max($published, $updated); // We use the most recent date between published and updated
} else {
$item->date = $updated ?: $published;
}
}
/**
* Find the item title.
*
* @param SimpleXMLElement $entry Feed item
* @param Item $item Item object
*/
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
$title = XmlParser::getXPathResult($entry, 'atom:title', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'title');
$item->title = Filter::stripWhiteSpace((string) current($title)) ?: $item->url;
}
/**
* Find the item author.
*
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
$author = XmlParser::getXPathResult($entry, 'atom:author/atom:name', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'author/name')
?: XmlParser::getXPathResult($xml, 'atom:author/atom:name', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'author/name');
$item->author = (string) current($author);
}
/**
* Find the item content.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
$item->content = $this->getContent($entry);
}
/**
* Find the item URL.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
$item->url = $this->getUrl($entry, 'alternate', true);
}
/**
* Genereate the item id.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$id = XmlParser::getXPathResult($entry, 'atom:id', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'id');
if (!empty($id)) {
$item->id = $this->generateId((string) current($id));
} else {
$item->id = $this->generateId(
$item->getTitle(), $item->getUrl(), $item->getContent()
);
}
}
/**
* Find the item enclosure.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$enclosure = $this->findLink($entry, 'enclosure');
if ($enclosure) {
$item->enclosure_url = Url::resolve((string) $enclosure['href'], $feed->getSiteUrl());
$item->enclosure_type = (string) $enclosure['type'];
}
}
/**
* Find the item language.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
$language = XmlParser::getXPathResult($entry, './/@xml:lang');
$item->language = (string) current($language) ?: $feed->language;
}
/**
* Get the URL from a link tag.
*
* @param SimpleXMLElement $xml XML tag
* @param string $rel Link relationship: alternate, enclosure, related, self, via
*
* @return string
*/
private function getUrl(SimpleXMLElement $xml, $rel, $fallback = false)
{
$link = $this->findLink($xml, $rel);
if ($link) {
return (string) $link['href'];
}
if ($fallback) {
$link = $this->findLink($xml, '');
return $link ? (string) $link['href'] : '';
}
return '';
}
/**
* Get a link tag that match a relationship.
*
* @param SimpleXMLElement $xml XML tag
* @param string $rel Link relationship: alternate, enclosure, related, self, via
*
* @return SimpleXMLElement|null
*/
private function findLink(SimpleXMLElement $xml, $rel)
{
$links = XmlParser::getXPathResult($xml, 'atom:link', $this->namespaces)
?: XmlParser::getXPathResult($xml, 'link');
foreach ($links as $link) {
if ($rel === (string) $link['rel']) {
return $link;
}
}
return;
}
/**
* Get the entry content.
*
* @param SimpleXMLElement $entry XML Entry
*
* @return string
*/
private function getContent(SimpleXMLElement $entry)
{
$content = current(
XmlParser::getXPathResult($entry, 'atom:content', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'content')
);
if (!empty($content) && count($content->children())) {
$xml_string = '';
foreach ($content->children() as $child) {
$xml_string .= $child->asXML();
}
return $xml_string;
} elseif (trim((string) $content) !== '') {
return (string) $content;
}
$summary = XmlParser::getXPathResult($entry, 'atom:summary', $this->namespaces)
?: XmlParser::getXPathResult($entry, 'summary');
return (string) current($summary);
}
}

View file

@ -0,0 +1,113 @@
<?php
namespace PicoFeed\Parser;
use DateTime;
use DateTimeZone;
/**
 * Date Parser.
 *
 * Tries a list of known date formats against a raw date string and returns
 * the first strict match, falling back to the current date/time.
 *
 * @author Frederic Guillot
 */
class DateParser
{
    /**
     * Timezone used to parse feed dates.
     *
     * @var string
     */
    public $timezone = 'UTC';

    /**
     * Supported formats [ 'format' => length ].
     *
     * When the length is not null the input value is truncated to that many
     * bytes before being matched against the format.
     *
     * @var array
     */
    public $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'j M Y H:i:s' => 20,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    /**
     * Try to parse all date format for broken feeds.
     *
     * Formats are tried in declaration order; the current date/time is
     * returned when none of them matches.
     *
     * @param string $value Original date format
     *
     * @return DateTime
     */
    public function getDateTime($value)
    {
        $value = trim($value);

        foreach ($this->formats as $format => $length) {
            $truncated_value = $value;

            if ($length !== null) {
                $truncated_value = substr($truncated_value, 0, $length);
            }

            $date = $this->getValidDate($format, $truncated_value);

            if ($date !== false) {
                return $date;
            }
        }

        return $this->getCurrentDateTime();
    }

    /**
     * Get a valid date from a given format.
     *
     * The date is accepted only when parsing produced neither errors nor
     * warnings (for example an out-of-range day is rejected).
     *
     * @param string $format Date format
     * @param string $value  Original date value
     *
     * @return DateTime|bool DateTime on success, false otherwise
     */
    public function getValidDate($format, $value)
    {
        $date = DateTime::createFromFormat($format, $value, new DateTimeZone($this->timezone));

        if ($date === false) {
            return false;
        }

        $errors = DateTime::getLastErrors();

        // Since PHP 8.2 getLastErrors() returns false instead of an array of
        // zero counts when the last parsing raised no error and no warning.
        if ($errors === false) {
            return $date;
        }

        if ($errors['error_count'] === 0 && $errors['warning_count'] === 0) {
            return $date;
        }

        return false;
    }

    /**
     * Get the current datetime.
     *
     * @return DateTime Current date/time in the configured timezone
     */
    public function getCurrentDateTime()
    {
        return new DateTime('now', new DateTimeZone($this->timezone));
    }
}

View file

@ -0,0 +1,194 @@
<?php
namespace PicoFeed\Parser;
/**
 * Feed.
 *
 * Plain container for the data of a parsed feed; the public properties are
 * filled in by the parser classes.
 *
 * @author Frederic Guillot
 */
class Feed
{
    /**
     * Feed items.
     *
     * @var array
     */
    public $items = array();

    /**
     * Feed id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Feed title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Feed description.
     *
     * @var string
     */
    public $description = '';

    /**
     * Feed url.
     *
     * @var string
     */
    public $feed_url = '';

    /**
     * Site url.
     *
     * @var string
     */
    public $site_url = '';

    /**
     * Feed date (null until a date has been assigned).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Feed language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Feed logo URL.
     *
     * @var string
     */
    public $logo = '';

    /**
     * Feed icon URL.
     *
     * @var string
     */
    public $icon = '';

    /**
     * Return feed information.
     *
     * One "Feed::property = value" line per property, followed by the string
     * representation of every item.
     *
     * @return string
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'feed_url', 'site_url', 'language', 'description', 'logo') as $property) {
            $output .= 'Feed::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date defaults to null: do not call format() before it is set
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Feed::date = '.$date.PHP_EOL;
        $output .= 'Feed::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Feed::items = '.count($this->items).' items'.PHP_EOL;

        foreach ($this->items as $item) {
            $output .= '----'.PHP_EOL;
            $output .= $item;
        }

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get description.
     *
     * @return string
     */
    public function getDescription()
    {
        return $this->description;
    }

    /**
     * Get the logo url.
     *
     * @return string
     */
    public function getLogo()
    {
        return $this->logo;
    }

    /**
     * Get the icon url.
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->icon;
    }

    /**
     * Get feed url.
     *
     * @return string
     */
    public function getFeedUrl()
    {
        return $this->feed_url;
    }

    /**
     * Get site url.
     *
     * @return string
     */
    public function getSiteUrl()
    {
        return $this->site_url;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items.
     *
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }

    /**
     * Return true if the feed is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}

View file

@ -0,0 +1,230 @@
<?php
namespace PicoFeed\Parser;
/**
 * Feed Item.
 *
 * Plain container for the data of one feed entry; the public properties are
 * filled in by the parser classes.
 *
 * @author Frederic Guillot
 */
class Item
{
    /**
     * List of known RTL languages.
     *
     * Not read by any method of this class: isRTL() delegates to
     * Parser::isLanguageRTL().
     *
     * @var array
     */
    public $rtl = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    /**
     * Item id.
     *
     * @var string
     */
    public $id = '';

    /**
     * Item title.
     *
     * @var string
     */
    public $title = '';

    /**
     * Item url.
     *
     * @var string
     */
    public $url = '';

    /**
     * Item author.
     *
     * @var string
     */
    public $author = '';

    /**
     * Item date (null until a date has been assigned).
     *
     * @var \DateTime|null
     */
    public $date = null;

    /**
     * Item content.
     *
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url.
     *
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type.
     *
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language.
     *
     * @var string
     */
    public $language = '';

    /**
     * Raw XML.
     *
     * @var \SimpleXMLElement
     */
    public $xml;

    /**
     * List of namespaces.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * Get specific XML tag or attribute value.
     *
     * The lookup is a descendant search (".//tag" or ".//tag/@attribute")
     * evaluated on the raw XML of the item.
     *
     * @param string $tag       Tag name (examples: guid, media:content)
     * @param string $attribute Tag attribute
     *
     * @return array|false Tag values or error
     */
    public function getTag($tag, $attribute = '')
    {
        // convert to xPath attribute query
        if ($attribute !== '') {
            $attribute = '/@'.$attribute;
        }

        // construct query
        $query = './/'.$tag.$attribute;
        $elements = XmlParser::getXPathResult($this->xml, $query, $this->namespaces);

        if ($elements === false) { // xPath error
            return false;
        }

        return array_map(function ($element) { return (string) $element; }, $elements);
    }

    /**
     * Return item information.
     *
     * @return string
     */
    public function __toString()
    {
        $output = '';

        foreach (array('id', 'title', 'url', 'language', 'author', 'enclosure_url', 'enclosure_type') as $property) {
            $output .= 'Item::'.$property.' = '.$this->$property.PHP_EOL;
        }

        // The date defaults to null: do not call format() before it is set
        $date = $this->date !== null ? $this->date->format(DATE_RFC822) : '';

        $output .= 'Item::date = '.$date.PHP_EOL;
        $output .= 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false').PHP_EOL;
        $output .= 'Item::content = '.strlen($this->content).' bytes'.PHP_EOL;

        return $output;
    }

    /**
     * Get title.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url.
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id.
     *
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date.
     *
     * @return \DateTime|null
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content.
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url.
     *
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type.
     *
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language.
     *
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author.
     *
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Return true if the item is "Right to Left".
     *
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* MalformedXmlException Exception.
*
* Thrown by Parser::execute() when the feed content still cannot be loaded
* as XML after the normalization workaround has been applied.
*
* @author Frederic Guillot
*/
class MalformedXmlException extends ParserException
{
}

View file

@ -0,0 +1,576 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
use PicoFeed\Scraper\Scraper;
/**
 * Base parser class.
 *
 * execute() drives the whole parsing process; format-specific subclasses
 * implement the abstract find*() methods that extract each piece of data.
 *
 * @author Frederic Guillot
 */
abstract class Parser
{
    /**
     * Config object.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * DateParser object.
     *
     * @var \PicoFeed\Parser\DateParser
     */
    protected $date;

    /**
     * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
     *
     * @var string
     */
    private $hash_algo = 'sha256';

    /**
     * Feed content (XML data).
     *
     * @var string
     */
    protected $content = '';

    /**
     * Fallback url.
     *
     * @var string
     */
    protected $fallback_url = '';

    /**
     * XML namespaces supported by parser.
     *
     * @var array
     */
    protected $namespaces = array();

    /**
     * XML namespaces used in document.
     *
     * @var array
     */
    protected $used_namespaces = array();

    /**
     * Enable the content filtering.
     *
     * @var bool
     */
    private $enable_filter = true;

    /**
     * Enable the content grabber.
     *
     * @var bool
     */
    private $enable_grabber = false;

    /**
     * When true, the candidate parser of the grabber is disabled so that only
     * pages with a rule file are scraped.
     *
     * @var bool
     */
    private $grabber_needs_rule_file = false;

    /**
     * Ignore those urls for the content scraper.
     *
     * @var array
     */
    private $grabber_ignore_urls = array();

    /**
     * Constructor.
     *
     * @param string $content       Feed content
     * @param string $http_encoding HTTP encoding (headers)
     * @param string $fallback_url  Fallback url when the feed provide relative or broken url
     */
    public function __construct($content, $http_encoding = '', $fallback_url = '')
    {
        $this->date = new DateParser();
        $this->fallback_url = $fallback_url;
        $xml_encoding = XmlParser::getEncodingFromXmlTag($content);

        // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
        $this->content = Filter::stripXmlTag($content);

        // Encode everything in UTF-8
        Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
        $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
    }

    /**
     * Parse the document.
     *
     * @throws MalformedXmlException When the content cannot be loaded as XML, even after the workarounds
     *
     * @return \PicoFeed\Parser\Feed
     */
    public function execute()
    {
        Logger::setMessage(get_called_class().': begin parsing');

        $xml = XmlParser::getSimpleXml($this->content);

        if ($xml === false) {
            // Second chance: normalize the data and try to load it again
            Logger::setMessage(get_called_class().': Applying XML workarounds');
            $this->content = Filter::normalizeData($this->content);
            $xml = XmlParser::getSimpleXml($this->content);

            if ($xml === false) {
                Logger::setMessage(get_called_class().': XML parsing error');
                Logger::setMessage(XmlParser::getErrors());
                throw new MalformedXmlException('XML parsing error');
            }
        }

        $this->used_namespaces = $xml->getNamespaces(true);
        $xml = $this->registerSupportedNamespaces($xml);

        $feed = new Feed();

        $this->findFeedUrl($xml, $feed);
        $this->checkFeedUrl($feed);

        $this->findSiteUrl($xml, $feed);
        $this->checkSiteUrl($feed);

        $this->findFeedTitle($xml, $feed);
        $this->findFeedDescription($xml, $feed);
        $this->findFeedLanguage($xml, $feed);
        $this->findFeedId($xml, $feed);
        $this->findFeedDate($xml, $feed);
        $this->findFeedLogo($xml, $feed);
        $this->findFeedIcon($xml, $feed);

        foreach ($this->getItemsTree($xml) as $entry) {
            $entry = $this->registerSupportedNamespaces($entry);

            $item = new Item();
            $item->xml = $entry;
            $item->namespaces = $this->used_namespaces;

            $this->findItemAuthor($xml, $entry, $item);

            $this->findItemUrl($entry, $item);
            $this->checkItemUrl($feed, $item);

            $this->findItemTitle($entry, $item);
            $this->findItemContent($entry, $item);

            // Id generation can use the item url/title/content (order is important)
            $this->findItemId($entry, $item, $feed);
            $this->findItemDate($entry, $item, $feed);
            $this->findItemEnclosure($entry, $item, $feed);
            $this->findItemLanguage($entry, $item, $feed);

            // Order is important (avoid double filtering)
            $this->filterItemContent($feed, $item);
            $this->scrapWebsite($item);

            $feed->items[] = $item;
        }

        Logger::setMessage(get_called_class().PHP_EOL.$feed);

        return $feed;
    }

    /**
     * Check if the feed url is correct.
     *
     * An empty feed url is replaced by the fallback url, any other value is
     * resolved against the fallback url.
     *
     * @param Feed $feed Feed object
     */
    public function checkFeedUrl(Feed $feed)
    {
        if ($feed->getFeedUrl() === '') {
            $feed->feed_url = $this->fallback_url;
        } else {
            $feed->feed_url = Url::resolve($feed->getFeedUrl(), $this->fallback_url);
        }
    }

    /**
     * Check if the site url is correct.
     *
     * An empty site url is replaced by the base of the feed url, any other
     * value is resolved against the fallback url.
     *
     * @param Feed $feed Feed object
     */
    public function checkSiteUrl(Feed $feed)
    {
        if ($feed->getSiteUrl() === '') {
            $feed->site_url = Url::base($feed->getFeedUrl());
        } else {
            $feed->site_url = Url::resolve($feed->getSiteUrl(), $this->fallback_url);
        }
    }

    /**
     * Check if the item url is correct (resolved against the site url).
     *
     * @param Feed $feed Feed object
     * @param Item $item Item object
     */
    public function checkItemUrl(Feed $feed, Item $item)
    {
        $item->url = Url::resolve($item->getUrl(), $feed->getSiteUrl());
    }

    /**
     * Fetch item content with the content grabber.
     *
     * Nothing happens unless the grabber is enabled and the item url is not
     * in the ignore list; the item content is replaced only when the grabber
     * reports relevant content.
     *
     * @param Item $item Item object
     */
    public function scrapWebsite(Item $item)
    {
        if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
            $grabber = new Scraper($this->config);
            $grabber->setUrl($item->getUrl());

            if ($this->grabber_needs_rule_file) {
                $grabber->disableCandidateParser();
            }

            $grabber->execute();

            if ($grabber->hasRelevantContent()) {
                $item->content = $grabber->getFilteredContent();
            }
        }
    }

    /**
     * Filter HTML for entry content.
     *
     * @param Feed $feed Feed object
     * @param Item $item Item object
     */
    public function filterItemContent(Feed $feed, Item $item)
    {
        if ($this->isFilteringEnabled()) {
            $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
            $filter->setConfig($this->config);
            $item->content = $filter->execute();
        } else {
            Logger::setMessage(get_called_class().': Content filtering disabled');
        }
    }

    /**
     * Generate a unique id for an entry (hash all arguments).
     *
     * All arguments are concatenated, without separator, before being hashed
     * with the configured algorithm.
     *
     * @return string
     */
    public function generateId()
    {
        return hash($this->hash_algo, implode(func_get_args()));
    }

    /**
     * Return true if the given language is "Right to Left".
     *
     * The check is a case-insensitive prefix match against a fixed list of
     * language codes.
     *
     * @static
     *
     * @param string $language Language: fr-FR, en-US
     *
     * @return bool
     */
    public static function isLanguageRTL($language)
    {
        $language = strtolower($language);

        $rtl_languages = array(
            'ar',  // Arabic (ar-**)
            'fa',  // Farsi (fa-**)
            'ur',  // Urdu (ur-**)
            'ps',  // Pashtu (ps-**)
            'syr', // Syriac (syr-**)
            'dv',  // Divehi (dv-**)
            'he',  // Hebrew (he-**)
            'yi',  // Yiddish (yi-**)
        );

        foreach ($rtl_languages as $prefix) {
            if (strpos($language, $prefix) === 0) {
                return true;
            }
        }

        return false;
    }

    /**
     * Set Hash algorithm used for id generation.
     *
     * A falsy value keeps the current algorithm.
     *
     * @param string $algo Algorithm name
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setHashAlgo($algo)
    {
        $this->hash_algo = $algo ?: $this->hash_algo;

        return $this;
    }

    /**
     * Set a different timezone.
     *
     * A falsy value keeps the current timezone.
     *
     * @see http://php.net/manual/en/timezones.php
     *
     * @param string $timezone Timezone
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setTimezone($timezone)
    {
        if ($timezone) {
            $this->date->timezone = $timezone;
        }

        return $this;
    }

    /**
     * Set config object.
     *
     * @param \PicoFeed\Config\Config $config Config instance
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setConfig($config)
    {
        $this->config = $config;

        return $this;
    }

    /**
     * Disable the content filtering.
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function disableContentFiltering()
    {
        $this->enable_filter = false;

        // Fluent interface, as documented and as done by the other setters
        return $this;
    }

    /**
     * Return true if the content filtering is enabled.
     *
     * Without config object the local flag is returned; otherwise the flag is
     * passed to the config object, which makes the decision.
     *
     * @return bool
     */
    public function isFilteringEnabled()
    {
        if ($this->config === null) {
            return $this->enable_filter;
        }

        return $this->config->getContentFiltering($this->enable_filter);
    }

    /**
     * Enable the content grabber.
     *
     * @param bool $needs_rule_file true if only pages with rule files should be
     *                              scraped
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function enableContentGrabber($needs_rule_file = false)
    {
        $this->enable_grabber = true;
        $this->grabber_needs_rule_file = $needs_rule_file;

        // Fluent interface, as documented and as done by the other setters
        return $this;
    }

    /**
     * Set ignored URLs for the content grabber.
     *
     * @param array $urls URLs
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function setGrabberIgnoreUrls(array $urls)
    {
        $this->grabber_ignore_urls = $urls;

        // Fluent interface, as documented and as done by the other setters
        return $this;
    }

    /**
     * Register all supported namespaces to be used within an xpath query.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement The same element that was passed in
     */
    public function registerSupportedNamespaces(SimpleXMLElement $xml)
    {
        foreach ($this->namespaces as $prefix => $ns) {
            $xml->registerXPathNamespace($prefix, $ns);
        }

        return $xml;
    }

    /**
     * Find the feed url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedUrl(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the site url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findSiteUrl(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed title.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedTitle(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed description.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedDescription(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed language.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed id.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedId(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed date.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedDate(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedLogo(SimpleXMLElement $xml, Feed $feed);

    /**
     * Find the feed icon.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    abstract public function findFeedIcon(SimpleXMLElement $xml, Feed $feed);

    /**
     * Get the path to the items XML tree.
     *
     * execute() iterates over the returned value with foreach and treats
     * every element as one item node.
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return SimpleXMLElement
     */
    abstract public function getItemsTree(SimpleXMLElement $xml);

    /**
     * Find the item author.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item);

    /**
     * Find the item URL.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemUrl(SimpleXMLElement $entry, Item $item);

    /**
     * Find the item title.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemTitle(SimpleXMLElement $entry, Item $item);

    /**
     * Generate the item id.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item date.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item content.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    abstract public function findItemContent(SimpleXMLElement $entry, Item $item);

    /**
     * Find the item enclosure.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed);

    /**
     * Find the item language.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    abstract public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed);
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Parser;
use PicoFeed\PicoFeedException;
/**
* ParserException Exception.
*
* Abstract base class for the exceptions of the parser package; in this
* package MalformedXmlException extends it.
*
* @author Frederic Guillot
*/
abstract class ParserException extends PicoFeedException
{
}

View file

@ -0,0 +1,270 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
/**
* RSS 1.0 parser.
*
* @author Frederic Guillot
*/
class Rss10 extends Parser
{
/**
* Supported namespaces.
*/
protected $namespaces = array(
'rss' => 'http://purl.org/rss/1.0/',
'dc' => 'http://purl.org/dc/elements/1.1/',
'content' => 'http://purl.org/rss/1.0/modules/content/',
'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
);
/**
 * Locate the item nodes of the feed.
 *
 * The "rss" prefixed query is tried first, the unprefixed one is used
 * when it yields nothing.
 *
 * @param SimpleXMLElement $xml Feed xml
 *
 * @return SimpleXMLElement
 */
public function getItemsTree(SimpleXMLElement $xml)
{
    $items = XmlParser::getXPathResult($xml, 'rss:item', $this->namespaces);

    if ($items) {
        return $items;
    }

    return XmlParser::getXPathResult($xml, 'item');
}
/**
* Find the feed url.
*
* This parser reads nothing from the document here: the feed url is always
* set to an empty string.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
{
$feed->feed_url = '';
}
/**
 * Read the site url from the channel link.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
{
    $links = XmlParser::getXPathResult($xml, 'rss:channel/rss:link', $this->namespaces);

    if (!$links) {
        $links = XmlParser::getXPathResult($xml, 'channel/link');
    }

    $feed->site_url = (string) current($links);
}
/**
 * Read the feed description from the channel description.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'rss:channel/rss:description', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'channel/description');
    }

    $feed->description = (string) current($nodes);
}
/**
 * Read the feed logo url from the image element.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'rss:image/rss:url', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'image/url');
    }

    $feed->logo = (string) current($nodes);
}
/**
* Find the feed icon.
*
* This parser reads nothing from the document here: the icon is always set
* to an empty string.
*
* @param SimpleXMLElement $xml Feed xml
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
{
$feed->icon = '';
}
/**
 * Read the feed title; the site url is used when the title is empty.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'rss:channel/rss:title', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'channel/title');
    }

    $label = Filter::stripWhiteSpace((string) current($nodes));

    if (!$label) {
        $label = $feed->getSiteUrl();
    }

    $feed->title = $label;
}
/**
 * Read the feed language from the dc:language element of the channel.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:language', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'channel/dc:language', $this->namespaces);
    }

    $feed->language = (string) current($nodes);
}
/**
 * Set the feed id: the feed url is used, or the site url when the feed
 * url is empty.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedId(SimpleXMLElement $xml, Feed $feed)
{
    $identifier = $feed->getFeedUrl();

    if (!$identifier) {
        $identifier = $feed->getSiteUrl();
    }

    $feed->id = $identifier;
}
/**
 * Read the feed date from the dc:date element of the channel.
 *
 * @param SimpleXMLElement      $xml  Feed xml
 * @param \PicoFeed\Parser\Feed $feed Feed object
 */
public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:date', $this->namespaces);

    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'channel/dc:date', $this->namespaces);
    }

    $raw_date = (string) current($nodes);
    $feed->date = $this->date->getDateTime($raw_date);
}
/**
 * Read the item date from dc:date; the feed date is used when the item
 * has none.
 *
 * @param SimpleXMLElement      $entry Feed item
 * @param Item                  $item  Item object
 * @param \PicoFeed\Parser\Feed $feed  Feed object
 */
public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($entry, 'dc:date', $this->namespaces);

    if (empty($nodes)) {
        $item->date = $feed->getDate();

        return;
    }

    $item->date = $this->date->getDateTime((string) current($nodes));
}
/**
* Find the item title.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemTitle(SimpleXMLElement $entry, Item $item)
{
    // Namespaced title first, un-prefixed title as second choice.
    $nodes = XmlParser::getXPathResult($entry, 'rss:title', $this->namespaces);
    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($entry, 'title');
    }

    // When the stripped title is empty the item URL is used as title.
    $cleaned = Filter::stripWhiteSpace((string) current($nodes));
    $item->title = $cleaned ?: $item->url;
}
/**
* Find the item author.
*
* @param SimpleXMLElement $xml Feed
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
{
    // Lookup order: dc:creator of the item, then of the namespaced channel,
    // then of the un-prefixed channel. Later queries run only when the
    // earlier ones produced nothing.
    $nodes = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);
    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'rss:channel/dc:creator', $this->namespaces);
    }
    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
    }

    $first = current($nodes);
    $item->author = (string) $first;
}
/**
* Find the item content.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemContent(SimpleXMLElement $entry, Item $item)
{
    $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);
    $encoded = trim((string) current($nodes));

    // A missing or blank content:encoded is replaced by the description
    // (namespaced first, un-prefixed second).
    if ($encoded === '') {
        $nodes = XmlParser::getXPathResult($entry, 'rss:description', $this->namespaces);
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'description');
        }
    }

    // The stored content is the untrimmed text of the first selected node.
    $item->content = (string) current($nodes);
}
/**
* Find the item URL.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
*/
public function findItemUrl(SimpleXMLElement $entry, Item $item)
{
    // Lookup order: feedburner:origLink, namespaced link, un-prefixed link.
    $nodes = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);
    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($entry, 'rss:link', $this->namespaces);
    }
    if (!$nodes) {
        $nodes = XmlParser::getXPathResult($entry, 'link');
    }

    $raw = (string) current($nodes);
    $item->url = trim($raw);
}
/**
* Generate the item id.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    // The id is derived from title, URL and content of the item, in that
    // order; $entry and $feed are not read.
    $title = $item->getTitle();
    $url = $item->getUrl();
    $content = $item->getContent();

    $item->id = $this->generateId($title, $url, $content);
}
/**
* Find the item enclosure.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
{
// No-op: nothing is read from $entry and $item is left untouched.
// NOTE(review): presumably this feed format has no enclosure element to extract - confirm.
}
/**
* Find the item language.
*
* @param SimpleXMLElement $entry Feed item
* @param \PicoFeed\Parser\Item $item Item object
* @param \PicoFeed\Parser\Feed $feed Feed object
*/
public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
{
    $nodes = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);

    // An item without its own (non-empty) dc:language takes the feed language.
    $own = (string) current($nodes);
    $item->language = $own ?: $feed->language;
}
}

View file

@ -0,0 +1,291 @@
<?php
namespace PicoFeed\Parser;
use SimpleXMLElement;
use PicoFeed\Filter\Filter;
use PicoFeed\Client\Url;
/**
* RSS 2.0 Parser.
*
* @author Frederic Guillot
*/
class Rss20 extends Parser
{
    /**
     * Namespace prefixes (with their URIs) that may appear in the XPath
     * queries issued by this parser.
     */
    protected $namespaces = array(
        'dc' => 'http://purl.org/dc/elements/1.1/',
        'content' => 'http://purl.org/rss/1.0/modules/content/',
        'feedburner' => 'http://rssnamespace.org/feedburner/ext/1.0',
        'atom' => 'http://www.w3.org/2005/Atom',
    );

    /**
     * Select the item nodes of the feed (XPath "channel/item").
     *
     * @param SimpleXMLElement $xml Feed xml
     *
     * @return mixed Whatever XmlParser::getXPathResult() yields for the query
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $items = XmlParser::getXPathResult($xml, 'channel/item');

        return $items;
    }

    /**
     * Set the feed url; this parser always stores an empty string.
     *
     * @param SimpleXMLElement      $xml  Feed xml (not read)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        // Nothing is extracted from the document here.
        $feed->feed_url = '';
    }

    /**
     * Store the text of the first channel/link node as site url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findSiteUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/link');
        $first = current($nodes);

        $feed->site_url = (string) $first;
    }

    /**
     * Store the text of the first channel/description node.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/description');
        $first = current($nodes);

        $feed->description = (string) $first;
    }

    /**
     * Store the text of the first channel/image/url node as logo url.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/image/url');
        $first = current($nodes);

        $feed->logo = (string) $first;
    }

    /**
     * Set the feed icon; this parser always stores an empty string.
     *
     * @param SimpleXMLElement      $xml  Feed xml (not read)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedIcon(SimpleXMLElement $xml, Feed $feed)
    {
        // Nothing is extracted from the document here.
        $feed->icon = '';
    }

    /**
     * Store the whitespace-stripped channel title, or the site url when the
     * title is empty.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/title');
        $cleaned = Filter::stripWhiteSpace((string) current($nodes));

        $feed->title = $cleaned ?: $feed->getSiteUrl();
    }

    /**
     * Store the text of the first channel/language node.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($xml, 'channel/language');
        $first = current($nodes);

        $feed->language = (string) $first;
    }

    /**
     * Use the feed url as id, or the site url when the feed url is falsy.
     *
     * @param SimpleXMLElement      $xml  Feed xml (not read)
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $identifier = $feed->getFeedUrl();
        if (!$identifier) {
            $identifier = $feed->getSiteUrl();
        }

        $feed->id = $identifier;
    }

    /**
     * Determine the feed date from channel/pubDate and channel/lastBuildDate.
     *
     * @param SimpleXMLElement      $xml  Feed xml
     * @param \PicoFeed\Parser\Feed $feed Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $pub_nodes = XmlParser::getXPathResult($xml, 'channel/pubDate');
        $build_nodes = XmlParser::getXPathResult($xml, 'channel/lastBuildDate');

        // A date stays null when its element is absent from the channel.
        $published = null;
        if (!empty($pub_nodes)) {
            $published = $this->date->getDateTime((string) current($pub_nodes));
        }

        $updated = null;
        if (!empty($build_nodes)) {
            $updated = $this->date->getDateTime((string) current($build_nodes));
        }

        // Neither date present: the current date is used.
        if ($published === null && $updated === null) {
            $feed->date = $this->date->getCurrentDateTime();

            return;
        }

        // Both present: keep the most recent of the two.
        if ($published !== null && $updated !== null) {
            $feed->date = max($published, $updated);

            return;
        }

        // Exactly one present.
        $feed->date = $updated ?: $published;
    }

    /**
     * Store the item date taken from pubDate, or the feed date when the item
     * has no pubDate.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param Item                  $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'pubDate');

        if (empty($nodes)) {
            $item->date = $feed->getDate();
        } else {
            $item->date = $this->date->getDateTime((string) current($nodes));
        }
    }

    /**
     * Store the whitespace-stripped item title, or the item url when the
     * title is empty.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'title');
        $cleaned = Filter::stripWhiteSpace((string) current($nodes));

        $item->title = $cleaned ?: $item->url;
    }

    /**
     * Store the item author.
     *
     * Lookup order: item dc:creator, item author, channel dc:creator,
     * channel managingEditor. A later query only runs when all earlier ones
     * returned nothing.
     *
     * @param SimpleXMLElement      $xml   Feed
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:creator', $this->namespaces);
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'author');
        }
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/dc:creator', $this->namespaces);
        }
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($xml, 'channel/managingEditor');
        }

        $first = current($nodes);
        $item->author = (string) $first;
    }

    /**
     * Store the item content: content:encoded when it is not blank,
     * otherwise the description.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'content:encoded', $this->namespaces);
        $encoded = trim((string) current($nodes));

        if ($encoded === '') {
            $nodes = XmlParser::getXPathResult($entry, 'description');
        }

        // The stored value is the untrimmed text of the first selected node.
        $item->content = (string) current($nodes);
    }

    /**
     * Store the item url.
     *
     * Lookup order: feedburner:origLink, link, href of atom:link. When none
     * of them exists the guid is used, but only if it validates as a URL;
     * otherwise the url of the item is left unchanged.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $nodes = XmlParser::getXPathResult($entry, 'feedburner:origLink', $this->namespaces);
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'link');
        }
        if (!$nodes) {
            $nodes = XmlParser::getXPathResult($entry, 'atom:link/@href', $this->namespaces);
        }

        if (!empty($nodes)) {
            $item->url = trim((string) current($nodes));

            return;
        }

        // Last resort: a guid that happens to be a valid URL.
        $guid_nodes = XmlParser::getXPathResult($entry, 'guid');
        $candidate = trim((string) current($guid_nodes));

        if (filter_var($candidate, FILTER_VALIDATE_URL) !== false) {
            $item->url = $candidate;
        }
    }

    /**
     * Generate the item id from the guid, or from title, url and content
     * when the guid is missing or falsy.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object (not read)
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $guid = (string) current(XmlParser::getXPathResult($entry, 'guid'));

        if ($guid) {
            $item->id = $this->generateId($guid);

            return;
        }

        $item->id = $this->generateId(
            $item->getTitle(), $item->getUrl(), $item->getContent()
        );
    }

    /**
     * Store url and type of the item enclosure, if the item has one.
     *
     * The url comes from feedburner:origEnclosureLink when present, otherwise
     * from the url attribute of the enclosure, and is resolved against the
     * site url of the feed.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        // Items without an enclosure element are left untouched.
        if (!isset($entry->enclosure)) {
            return;
        }

        $url_nodes = XmlParser::getXPathResult($entry, 'feedburner:origEnclosureLink', $this->namespaces);
        if (!$url_nodes) {
            $url_nodes = XmlParser::getXPathResult($entry, 'enclosure/@url');
        }

        $type_nodes = XmlParser::getXPathResult($entry, 'enclosure/@type');

        $item->enclosure_url = Url::resolve((string) current($url_nodes), $feed->getSiteUrl());
        $item->enclosure_type = (string) current($type_nodes);
    }

    /**
     * Store the item language taken from dc:language, or the feed language
     * when the item does not declare a non-empty one.
     *
     * @param SimpleXMLElement      $entry Feed item
     * @param \PicoFeed\Parser\Item $item  Item object
     * @param \PicoFeed\Parser\Feed $feed  Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $nodes = XmlParser::getXPathResult($entry, 'dc:language', $this->namespaces);
        $own = (string) current($nodes);

        $item->language = $own ?: $feed->language;
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.91 Parser.
*
* Empty subclass: it declares no members of its own, so every method and
* property comes unchanged from Rss20.
*
* @author Frederic Guillot
*/
class Rss91 extends Rss20
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* RSS 0.92 Parser.
*
* Empty subclass: it declares no members of its own, so every method and
* property comes unchanged from Rss20.
*
* @author Frederic Guillot
*/
class Rss92 extends Rss20
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Parser;
/**
* XmlEntityException Exception.
*
* Marker subclass of MalformedXmlException with no members of its own.
* XmlParser::scan() throws it in place of a \ZendXml\Exception\RuntimeException,
* carrying over the original message.
*
* @author Bernhard Posselt
*/
class XmlEntityException extends MalformedXmlException
{
}

View file

@ -0,0 +1,229 @@
<?php
namespace PicoFeed\Parser;
use DomDocument;
use SimpleXmlElement;
use Exception;
use ZendXml\Security;
/**
* XML parser class.
*
* Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
*
* @author Frederic Guillot
*/
class XmlParser
{
    /**
     * Get a SimpleXmlElement instance or return false.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @throws XmlEntityException when the security scan rejects the document
     *
     * @return mixed
     */
    public static function getSimpleXml($input)
    {
        return self::scan($input);
    }

    /**
     * Get a DomDocument instance or return false.
     *
     * False is returned for empty input and for documents that end up
     * without any child node after loading.
     *
     * @static
     *
     * @param string $input XML content
     *
     * @throws XmlEntityException when the security scan rejects the document
     *
     * @return \DOMDocument|false
     */
    public static function getDomDocument($input)
    {
        if (empty($input)) {
            return false;
        }

        $dom = self::scan($input, new DOMDocument());

        // The document is empty, there is probably some parsing errors
        if ($dom && $dom->childNodes->length === 0) {
            return false;
        }

        return $dom;
    }

    /**
     * Small wrapper around ZendXml to turn their exceptions into picoFeed
     * exceptions.
     *
     * @param string            $input the xml to load
     * @param \DOMDocument|null $dom   pass in a dom document or use null/omit
     *                                 if simpleXml should be used
     *
     * @throws XmlEntityException in place of \ZendXml\Exception\RuntimeException
     *
     * @return mixed Result of Security::scan()
     */
    private static function scan($input, $dom = null)
    {
        try {
            return Security::scan($input, $dom);
        } catch (\ZendXml\Exception\RuntimeException $e) {
            throw new XmlEntityException($e->getMessage());
        }
    }

    /**
     * Load HTML document by using a DomDocument instance.
     *
     * Empty input yields an empty document. libxml internal error handling
     * is switched on and left on, so parse problems are collected instead of
     * being emitted as warnings.
     *
     * @static
     *
     * @param string $input HTML content
     *
     * @return \DOMDocument
     */
    public static function getHtmlDocument($input)
    {
        $dom = new DomDocument();

        if (empty($input)) {
            return $dom;
        }

        libxml_use_internal_errors(true);

        // The options argument of loadHTML() only exists since PHP 5.4.
        if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            $dom->loadHTML($input, LIBXML_NONET);
        } else {
            $dom->loadHTML($input);
        }

        return $dom;
    }

    /**
     * Convert a HTML document to XML.
     *
     * Only the serialized body element of the parsed document is returned.
     *
     * @static
     *
     * @param string $html HTML document
     *
     * @return string
     */
    public static function htmlToXml($html)
    {
        // The prepended declaration announces UTF-8 to the HTML loader.
        $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);

        return $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    }

    /**
     * Get XML parser errors.
     *
     * @static
     *
     * @return string All collected libxml errors, joined with ", "
     */
    public static function getErrors()
    {
        $errors = array();

        foreach (libxml_get_errors() as $error) {
            $errors[] = sprintf('XML error: %s (Line: %d - Column: %d - Code: %d)',
                $error->message,
                $error->line,
                $error->column,
                $error->code
            );
        }

        return implode(', ', $errors);
    }

    /**
     * Get the encoding from a xml tag.
     *
     * The text in front of the last "?>" is searched for the first
     * encoding="..." (or encoding='...') and the quoted value is returned in
     * lower case. An empty string is returned when there is no "<?xml", no
     * "?>", no encoding attribute or no closing quote.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string
     */
    public static function getEncodingFromXmlTag($data)
    {
        if (strpos($data, '<?xml') === false) {
            return '';
        }

        // Without a closing "?>" there is nothing to inspect.
        $end = strrpos($data, '?>');
        if ($end === false) {
            return '';
        }

        // Normalise quotes so that only '"' has to be searched for.
        $data = str_replace("'", '"', substr($data, 0, $end));

        $p1 = strpos($data, 'encoding=');
        if ($p1 === false) {
            return '';
        }

        // Skip 'encoding=' (9 characters) plus the opening quote.
        $start = $p1 + 10;

        // strpos() rejects an offset beyond the end of the string (warning on
        // PHP 7, ValueError on PHP 8), so it must be checked first.
        if ($start > strlen($data)) {
            return '';
        }

        $p2 = strpos($data, '"', $start);
        if ($p2 === false) {
            return '';
        }

        return strtolower(substr($data, $start, $p2 - $start));
    }

    /**
     * Get the charset from a meta tag.
     *
     * @static
     *
     * @param string $data Input data
     *
     * @return string Lower-cased charset, or an empty string when none is found
     */
    public static function getEncodingFromMetaTag($data)
    {
        $encoding = '';

        if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
            $encoding = strtolower($match[1]);
        }

        return $encoding;
    }

    /**
     * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
     *
     * Every "prefix:name" pair in the query is replaced, except for the
     * special prefix "xml". The prefix is looked up in $ns as written.
     *
     * @param string $query XPath query
     * @param array  $ns    Prefix to namespace URI mapping
     *
     * @return string
     */
    public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
    {
        return preg_replace_callback('/([A-Z0-9]+):([A-Z0-9]+)/iu', function ($matches) use ($ns) {
            // don't try to map the special prefix XML
            if (strtolower($matches[1]) === 'xml') {
                return $matches[0];
            }

            return '*[namespace-uri()="'.$ns[$matches[1]].'" and local-name()="'.$matches[2].'"]';
        },
        $query);
    }

    /**
     * Get the result elements of a XPath query.
     *
     * With a non-empty $ns the prefixes of the query are first rewritten by
     * replaceXPathPrefixWithNamespaceURI(). A query that cannot be evaluated
     * yields an empty array, so the result can always be passed to current(),
     * empty() or foreach.
     *
     * @param \SimpleXMLElement $xml   XML element
     * @param string            $query XPath query
     * @param array             $ns    Prefix to namespace URI mapping
     *
     * @return \SimpleXMLElement[]
     */
    public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
    {
        if (!empty($ns)) {
            $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
        }

        $result = $xml->xpath($query);

        // xpath() does not return an array when the query fails.
        return is_array($result) ? $result : array();
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed;
use Exception;
/**
* PicoFeedException Exception.
*
* Abstract base class without members of its own; it cannot be instantiated
* and only serves as a common parent (ReaderException extends it).
*
* @author Frederic Guillot
*/
abstract class PicoFeedException extends Exception
{
}

View file

@ -0,0 +1,207 @@
<?php
namespace PicoFeed\Reader;
use DOMXpath;
use PicoFeed\Client\Client;
use PicoFeed\Client\ClientException;
use PicoFeed\Client\Url;
use PicoFeed\Config\Config;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
* Favicon class.
*
* https://en.wikipedia.org/wiki/Favicon
*
* @author Frederic Guillot
*/
class Favicon
{
    /**
     * Valid types for favicon (supported by browsers).
     *
     * @var array
     */
    private $types = array(
        'image/png',
        'image/gif',
        'image/x-icon',
        'image/jpeg',
        'image/jpg',
        'image/svg+xml'
    );

    /**
     * Config class instance.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Icon binary content.
     *
     * @var string
     */
    private $content = '';

    /**
     * Icon content type.
     *
     * @var string
     */
    private $content_type = '';

    /**
     * Constructor.
     *
     * @param \PicoFeed\Config\Config $config Config class instance; a default
     *                                        Config is created when omitted
     */
    public function __construct(Config $config = null)
    {
        $this->config = $config ?: new Config();
    }

    /**
     * Get the icon file content (available only after the download).
     *
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get the icon file type (available only after the download).
     *
     * The stored content type is compared against the supported types
     * without regard to letter case, and anything following the type (such
     * as parameters) is ignored. Unknown types yield 'image/x-icon'.
     *
     * @return string One of the supported types, in lower case
     */
    public function getType()
    {
        foreach ($this->types as $type) {
            // Media types are case-insensitive, hence stripos().
            if (stripos($this->content_type, $type) === 0) {
                return $type;
            }
        }

        return 'image/x-icon';
    }

    /**
     * Get data URI (http://en.wikipedia.org/wiki/Data_URI_scheme).
     *
     * @return string Empty string when no icon content is available
     */
    public function getDataUri()
    {
        if (empty($this->content)) {
            return '';
        }

        return sprintf(
            'data:%s;base64,%s',
            $this->getType(),
            base64_encode($this->content)
        );
    }

    /**
     * Download and check if a resource exists.
     *
     * A ClientException raised during the download is logged and swallowed;
     * the client is returned in either case.
     *
     * @param string $url URL
     *
     * @return \PicoFeed\Client\Client Client instance
     */
    public function download($url)
    {
        $client = Client::getInstance();
        $client->setConfig($this->config);

        Logger::setMessage(get_called_class().' Download => '.$url);

        try {
            $client->execute($url);
        } catch (ClientException $e) {
            Logger::setMessage(get_called_class().' Download Failed => '.$e->getMessage());
        }

        return $client;
    }

    /**
     * Check if a remote file exists.
     *
     * @param string $url URL
     *
     * @return bool True when the download produced a non-empty content
     */
    public function exists($url)
    {
        return $this->download($url)->getContent() !== '';
    }

    /**
     * Get the icon link for a website.
     *
     * With an explicit $favicon_link only that link is tried; when its
     * download is empty the search starts over without it. Otherwise the
     * candidates are the icon links of the website's base page followed by
     * "/favicon.ico". Content and content type of the last downloaded
     * candidate are kept on the instance.
     *
     * @param string $website_link URL
     * @param string $favicon_link optional URL
     *
     * @return string Link of the first candidate with content, or '' if none
     */
    public function find($website_link, $favicon_link = '')
    {
        $website = new Url($website_link);

        if ($favicon_link !== '') {
            $icons = array($favicon_link);
        } else {
            $icons = $this->extract($this->download($website->getBaseUrl('/'))->getContent());
            $icons[] = $website->getBaseUrl('/favicon.ico');
        }

        foreach ($icons as $icon_link) {
            $icon_link = Url::resolve($icon_link, $website);
            $resource = $this->download($icon_link);
            $this->content = $resource->getContent();
            $this->content_type = $resource->getContentType();

            if ($this->content !== '') {
                return $icon_link;
            } elseif ($favicon_link !== '') {
                // The explicit link failed: fall back to discovery.
                return $this->find($website_link);
            }
        }

        return '';
    }

    /**
     * Extract the icon links from the HTML.
     *
     * Link elements whose rel is "icon", "shortcut icon" or "icon shortcut"
     * are considered; elements with a missing or blank href are skipped.
     *
     * @param string $html HTML
     *
     * @return array
     */
    public function extract($html)
    {
        $icons = array();

        if (empty($html)) {
            return $icons;
        }

        $dom = XmlParser::getHtmlDocument($html);
        $xpath = new DOMXpath($dom);
        $elements = $xpath->query('//link[@rel="icon" or @rel="shortcut icon" or @rel="icon shortcut"]');

        for ($i = 0; $i < $elements->length; ++$i) {
            $href = $elements->item($i)->getAttribute('href');

            // A blank href carries no icon location.
            if (trim($href) !== '') {
                $icons[] = $href;
            }
        }

        return $icons;
    }
}

View file

@ -0,0 +1,209 @@
<?php
namespace PicoFeed\Reader;
use DOMXPath;
use PicoFeed\Config\Config;
use PicoFeed\Client\Client;
use PicoFeed\Client\Url;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;
/**
* Reader class.
*
* @author Frederic Guillot
*/
class Reader
{
    /**
     * Feed formats for detection.
     *
     * Maps a parser class name to the XPath query that identifies the
     * format; the entries are tried in this order.
     *
     * @var array
     */
    private $formats = array(
        'Atom' => '//feed',
        'Rss20' => '//rss[@version="2.0"]',
        'Rss92' => '//rss[@version="0.92"]',
        'Rss91' => '//rss[@version="0.91"]',
        'Rss10' => '//rdf',
    );

    /**
     * Config class instance.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor.
     *
     * Also hands the configured timezone to the Logger.
     *
     * @param \PicoFeed\Config\Config $config Config class instance; a default
     *                                        Config is created when omitted
     */
    public function __construct(Config $config = null)
    {
        $this->config = $config ?: new Config();
        Logger::setTimezone($this->config->getTimezone());
    }

    /**
     * Download a feed (no discovery).
     *
     * @param string $url           Feed url
     * @param string $last_modified Last modified HTTP header
     * @param string $etag          Etag HTTP header
     * @param string $username      HTTP basic auth username
     * @param string $password      HTTP basic auth password
     *
     * @return \PicoFeed\Client\Client
     */
    public function download($url, $last_modified = '', $etag = '', $username = '', $password = '')
    {
        $url = $this->prependScheme($url);

        return Client::getInstance()
            ->setConfig($this->config)
            ->setLastModified($last_modified)
            ->setEtag($etag)
            ->setUsername($username)
            ->setPassword($password)
            ->execute($url);
    }

    /**
     * Discover and download a feed.
     *
     * @param string $url           Feed or website url
     * @param string $last_modified Last modified HTTP header
     * @param string $etag          Etag HTTP header
     * @param string $username      HTTP basic auth username
     * @param string $password      HTTP basic auth password
     *
     * @throws SubscriptionNotFoundException when the page is not a feed and
     *                                       links to none
     *
     * @return \PicoFeed\Client\Client
     */
    public function discover($url, $last_modified = '', $etag = '', $username = '', $password = '')
    {
        $client = $this->download($url, $last_modified, $etag, $username, $password);

        // It's already a feed or the feed was not modified
        if (!$client->isModified() || $this->detectFormat($client->getContent())) {
            return $client;
        }

        // Try to find a subscription
        $links = $this->find($client->getUrl(), $client->getContent());

        if (empty($links)) {
            throw new SubscriptionNotFoundException('Unable to find a subscription');
        }

        // Only the first discovered link is downloaded.
        return $this->download($links[0], $last_modified, $etag, $username, $password);
    }

    /**
     * Find feed urls inside a HTML document.
     *
     * Link elements typed application/rss+xml are collected before those
     * typed application/atom+xml; elements with an empty href are skipped.
     *
     * @param string $url  Website url
     * @param string $html HTML content
     *
     * @return array List of feed links
     */
    public function find($url, $html)
    {
        Logger::setMessage(get_called_class().': Try to discover subscriptions');

        $dom = XmlParser::getHtmlDocument($html);
        $xpath = new DOMXPath($dom);
        $links = array();

        $queries = array(
            '//link[@type="application/rss+xml"]',
            '//link[@type="application/atom+xml"]',
        );

        foreach ($queries as $query) {
            $nodes = $xpath->query($query);

            foreach ($nodes as $node) {
                $link = $node->getAttribute('href');

                if (!empty($link)) {
                    $feedUrl = new Url($link);
                    $siteUrl = new Url($url);

                    // Relative links are completed with the base url of the site.
                    $links[] = $feedUrl->getAbsoluteUrl($feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '');
                }
            }
        }

        Logger::setMessage(get_called_class().': '.implode(', ', $links));

        return $links;
    }

    /**
     * Get a parser instance.
     *
     * @param string $url      Site url
     * @param string $content  Feed content
     * @param string $encoding HTTP encoding
     *
     * @throws UnsupportedFeedFormatException when no format is detected
     *
     * @return \PicoFeed\Parser\Parser
     */
    public function getParser($url, $content, $encoding)
    {
        $format = $this->detectFormat($content);

        if (empty($format)) {
            throw new UnsupportedFeedFormatException('Unable to detect feed format');
        }

        // The detected format name is also the parser class name.
        $className = '\PicoFeed\Parser\\'.$format;

        $parser = new $className($content, $encoding, $url);
        $parser->setHashAlgo($this->config->getParserHashAlgo());
        $parser->setTimezone($this->config->getTimezone());
        $parser->setConfig($this->config);

        return $parser;
    }

    /**
     * Detect the feed format.
     *
     * @param string $content Feed content
     *
     * @return string Name of the first format whose query matches exactly
     *                one node, or '' when none does
     */
    public function detectFormat($content)
    {
        $dom = XmlParser::getHtmlDocument($content);
        $xpath = new DOMXPath($dom);

        foreach ($this->formats as $parser_name => $query) {
            $nodes = $xpath->query($query);

            if ($nodes->length === 1) {
                return $parser_name;
            }
        }

        return '';
    }

    /**
     * Add the prefix "http://" if the end-user just enter a domain name.
     *
     * An existing http or https scheme is recognised regardless of letter
     * case, so "HTTPS://example.org" is returned unchanged.
     *
     * @param string $url Url
     *
     * @return string
     */
    public function prependScheme($url)
    {
        // URI schemes are case-insensitive, hence the "i" modifier.
        if (!preg_match('%^https?://%i', $url)) {
            $url = 'http://'.$url;
        }

        return $url;
    }
}

View file

@ -0,0 +1,14 @@
<?php
namespace PicoFeed\Reader;
use PicoFeed\PicoFeedException;
/**
* ReaderException Exception.
*
* Abstract parent, without members of its own, of the exceptions declared in
* the PicoFeed\Reader namespace (SubscriptionNotFoundException and
* UnsupportedFeedFormatException extend it).
*
* @author Frederic Guillot
*/
abstract class ReaderException extends PicoFeedException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Reader;
/**
* SubscriptionNotFoundException Exception.
*
* Thrown by Reader::discover() when the downloaded page is not a feed and
* Reader::find() returns no feed link for it.
*
* @author Frederic Guillot
*/
class SubscriptionNotFoundException extends ReaderException
{
}

View file

@ -0,0 +1,12 @@
<?php
namespace PicoFeed\Reader;
/**
* UnsupportedFeedFormatException Exception.
*
* Thrown by Reader::getParser() when Reader::detectFormat() returns an empty
* format name for the content.
*
* @author Frederic Guillot
*/
class UnsupportedFeedFormatException extends ReaderException
{
}

View file

@ -0,0 +1,14 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://combat.blog.lemonde.fr/2013/08/31/teddy-riner-le-rookie-devenu-rambo/#xtor=RSS-3208',
'body' => array(
'//div[@class="entry-content"]',
),
'strip' => array(
// Any element whose class attribute contains "fb-like" or "social".
'//*[contains(@class, "fb-like") or contains(@class, "social")]'
),
)
)
);

View file

@ -0,0 +1,15 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// Besides the 'body' and 'strip' XPath lists this rule carries a 'title' XPath.
// NOTE(review): presumably 'body' is kept and 'strip' removed by the consumer - confirm.
return array(
'grabber' => array(
'%.*%' => array(
'title' => '//header/h1',
'test_url' => 'http://bits.blogs.nytimes.com/2012/01/16/wikipedia-plans-to-go-dark-on-wednesday-to-protest-sopa/',
'body' => array(
'//div[@class="postContent"]',
),
'strip' => array(
'//*[@class="shareToolsBox"]',
),
)
)
);

View file

@ -0,0 +1,13 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' holds one XPath expression; the 'strip' list is present but empty.
// NOTE(review): presumably 'body' selects the content to keep - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.igen.fr/ailleurs/2014/05/nvidia-va-delaisser-les-smartphones-grand-public-86031',
'body' => array(
// Any div whose class attribute contains "field-name-body".
'//div[contains(@class, "field-name-body")]'
),
'strip' => array(
),
)
)
);

View file

@ -0,0 +1,11 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// Only 'test_url' and a 'body' XPath list are given; there is no 'strip' key.
// NOTE(review): verify that the consumer tolerates a rule without 'strip'.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.nytimes.com/2011/05/15/world/middleeast/15prince.html',
'body' => array(
'//div[@class="articleBody"]',
),
)
)
);

View file

@ -0,0 +1,12 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' holds one XPath expression; the 'strip' list is present but empty.
// NOTE(review): presumably 'body' selects the content to keep - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.phoronix.com/scan.php?page=article&item=amazon_ec2_bare&num=1',
'body' => array(
'//div[@class="content"]',
),
'strip' => array()
)
)
);

View file

@ -0,0 +1,20 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.slate.com/articles/business/moneybox/2013/08/microsoft_ceo_steve_ballmer_retires_a_firsthand_account_of_the_company_s.html',
'body' => array(
'//div[@class="sl-art-body"]',
),
'strip' => array(
// Substring matches on the class attribute ...
'//*[contains(@class, "social") or contains(@class, "comments") or contains(@class, "sl-article-floatin-tools") or contains(@class, "sl-art-pag")]',
// ... followed by exact id / class matches.
'//*[@id="mys_slate_logged_in"]',
'//*[@id="sl_article_tools_myslate_bottom"]',
'//*[@id="mys_myslate"]',
'//*[@class="sl-viral-container"]',
'//*[@class="sl-art-creds-cntr"]',
'//*[@class="sl-art-ad-midflex"]',
)
)
)
);

View file

@ -0,0 +1,14 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// Both XPath lists match divs by a substring of their class attribute.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.theguardian.com/sustainable-business/2015/feb/02/2015-hyper-transparency-global-business',
'body' => array(
'//div[contains(@class, "content__main-column--article")]',
),
'strip' => array(
'//div[contains(@class, "meta-container")]',
),
)
)
);

View file

@ -0,0 +1,29 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'https://en.wikipedia.org/wiki/Grace_Hopper',
'body' => array(
'//div[@id="bodyContent"]',
),
'strip' => array(
// These expressions sit in double-quoted PHP strings; none contains a "$",
// so no variable interpolation takes place.
"//div[@id='toc']",
"//div[@id='catlinks']",
"//div[@id='jump-to-nav']",
"//div[@class='thumbcaption']//div[@class='magnify']",
"//table[@class='navbox']",
"//table[contains(@class, 'infobox')]",
"//div[@class='dablink']",
"//div[@id='contentSub']",
"//div[@id='siteSub']",
"//table[@id='persondata']",
"//table[contains(@class, 'metadata')]",
"//*[contains(@class, 'noprint')]",
"//*[contains(@class, 'printfooter')]",
"//*[contains(@class, 'editsection')]",
"//*[contains(@class, 'error')]",
"//span[@title='pronunciation:']",
),
)
)
);

View file

@ -0,0 +1,31 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
    'grabber' => array(
        '%.*%' => array(
            'test_url' => 'http://www.wired.com/gamelife/2013/09/ouya-free-the-games/',
            'body' => array(
                '//div[@data-js="gallerySlides"]',
                '//article',
            ),
            'strip' => array(
                '//*[@id="linker_widget"]',
                '//*[@class="credit"]',
                '//div[@data-js="slideCount"]',
                // contains() takes two arguments (haystack, needle); the former
                // single-argument form contains(@class="...") is not valid XPath.
                '//*[contains(@class, "visually-hidden")]',
                '//*[@data-slide-number="_endslate"]',
                '//*[@id="related"]',
                '//*[contains(@class, "bio")]',
                '//*[contains(@class, "entry-footer")]',
                '//*[contains(@class, "mobify_backtotop_link")]',
                '//*[contains(@class, "gallery-navigation")]',
                '//*[contains(@class, "gallery-thumbnail")]',
                '//img[contains(@src, "1x1")]',
                '//a[contains(@href, "creativecommons")]',
                '//a[@href="#start-of-content"]',
                '//ul[@id="article-tags"]',
            ),
        )
    )
);

View file

@ -0,0 +1,15 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions using exact attribute matches.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://online.wsj.com/article/SB10001424127887324108204579023143974408428.html',
'body' => array(
'//div[@class="articlePage"]',
),
'strip' => array(
'//*[@id="articleThumbnail_2"]',
'//*[@class="socialByline"]',
)
)
)
);

View file

@ -0,0 +1,19 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.01net.com/editorial/624550/twitter-rachete-madbits-un-specialiste-francais-de-lanalyse-dimages/',
'body' => array(
'//div[@class="article_ventre_box"]',
),
'strip' => array(
// '//link' and '//h1' match every element of that name, at any depth.
'//link',
'//*[contains(@class, "article_navigation")]',
'//h1',
'//*[contains(@class, "article_toolbarMain")]',
'//*[contains(@class, "article_imagehaute_box")]',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all pattern '%.*%' a regex is mapped to a replacement string.
// The regex captures the alt and title attribute values in front of a closing "/>"; the
// replacement closes the tag and appends both captures ($1, $2), each preceded by <br/>.
// The matched alt/title attributes themselves are not part of the replacement.
// NOTE(review): presumably applied to the item content with preg_replace - confirm in the consumer.
return array(
'filter' => array(
'%.*%' => array(
'%alt="(.+)" title="(.+)" */>%' => '/><br/>$1<br/>$2',
),
),
);

View file

@ -0,0 +1,15 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.alainonline.net/news_details.php?lang=arabic&sid=18907',
'body' => array(
'//div[@class="news_details"]',
),
'strip' => array(
// The last div child of each div directly below the body container.
'//div[@class="news_details"]/div/div[last()]',
),
),
),
);

View file

@ -0,0 +1,22 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' lists two XPath expressions, 'strip' seven.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.aljazeera.com/news/2015/09/xi-jinping-seattle-china-150922230118373.html',
'body' => array(
'//figure[@class="article-content"]',
'//div[@class="article-body"]',
),
'strip' => array(
// '//h1', '//h3' and '//ul' match every element of that name, at any depth.
'//h1',
'//h3',
'//ul',
'//table[contains(@class, "in-article-item")]',
'//a[@target="_self"]',
'//div[@data-embed-type="Brightcove"]',
'//div[@class="QuoteContainer"]',
),
),
),
);

View file

@ -0,0 +1,20 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
// NOTE(review): this test_url is identical to the one of another rule file in this
// commit although the selectors differ - possibly a copy/paste leftover; confirm.
'test_url' => 'http://www.aljazeera.com/news/2015/09/xi-jinping-seattle-china-150922230118373.html',
'body' => array(
'//div[@class="story-body"]',
),
'strip' => array(
'//p[@class="kindofstory"]',
'//cite[@class="byline"]',
'//div[contains(@class,"related-topics")]',
// The next three match elements *named* links / sharebar / related-topics,
// not elements carrying such a class.
'//links',
'//sharebar',
'//related-topics',
),
),
),
);

View file

@ -0,0 +1,24 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' and 'strip' are lists of XPath expressions.
// NOTE(review): presumably 'body' selects the content to keep and 'strip' the nodes to drop - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.allgemeine-zeitung.de/lokales/polizei/mainz-gonsenheim-unbekannte-rauben-esso-tankstelle-in-kurt-schumacher-strasse-aus_14913147.htm',
'body' => array(
// In XPath the [1] applies per parent: the first matching div among its siblings.
'//div[contains(@class, "article")][1]',
),
'strip' => array(
// h1 children of an element named "read".
'//read/h1',
'//*[@id="t-map"]',
'//*[contains(@class, "modules")]',
'//*[contains(@class, "adsense")]',
'//*[contains(@class, "linkbox")]',
'//*[contains(@class, "info")]',
'//*[@class="skip"]',
'//*[@class="funcs"]',
'//span[@class="nd address"]',
'//a[contains(@href, "abo-und-services")]',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all pattern '%.*%' a regex is mapped to a replacement string.
// The regex captures a title attribute value in front of a closing "/>"; the replacement
// closes the tag and appends <br/> followed by the capture ($1). The matched title
// attribute itself is not part of the replacement.
// NOTE(review): presumably applied to the item content with preg_replace - confirm in the consumer.
return array(
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' lists two XPath expressions; the 'strip' list is present but empty.
// NOTE(review): presumably 'body' selects the content to keep - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'body' => array(
'//img[@id="comic_image"]',
// Only a comment-wrapper div that is in first position among the matches of its parent.
'//div[@class="comment-wrapper"][position()=1]',
),
'strip' => array(),
'test_url' => 'http://www.anythingcomic.com/comics/2108929/stress-free/',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' lists two XPath expressions (an img and a span); the 'strip' list is present but empty.
// NOTE(review): presumably 'body' selects the content to keep - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://hosted.ap.org/dynamic/stories/A/AS_CHINA_GAO_ZHISHENG?SITE=AP&SECTION=HOME&TEMPLATE=DEFAULT',
'body' => array(
'//img[@class="ap-smallphoto-img"]',
'//span[@class="entry-content"]',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Grabber rule set with a single rule under the catch-all pattern '%.*%'.
// 'body' holds one XPath expression (divs whose class contains "entry"); 'strip' is empty.
// NOTE(review): presumably 'body' selects the content to keep - confirm in the consumer.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.areadvd.de/news/daily-deals-angebote-bei-lautsprecher-teufel-3/',
'body' => array('//div[contains(@class,"entry")]'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,23 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:      header h2, the article-guts section, the superscroll content div and galleries.
// next_page: XPath for the anchors inside span.numbers.
//            NOTE(review): presumably used to follow multi-page articles — confirm in the consumer.
// strip:     captions, post meta, gallery credits, asides and the article expander.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://arstechnica.com/tech-policy/2015/09/judge-warners-2m-happy-birthday-copyright-is-bogus/',
'body' => array(
'//header/h2',
'//section[@id="article-guts"]',
'//div[@class="superscroll-content show"]',
'//div[@class="gallery"]',
),
'next_page' => '//span[@class="numbers"]/a',
'strip' => array(
'//figcaption',
'//div[@class="post-meta"]',
'//div[@class="gallery-image-credit"]',
'//aside',
'//div[@class="article-expander"]',
),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Content-grabber rule set. The key regex only matches strings containing
// "/index.php" followed later by "comic=" (the unescaped "." matches any character).
// body:  img children of the element with id "comic".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%/index.php.*comic=.*%' => array(
'test_url' => 'http://www.awkwardzombie.com/index.php?comic=041315',
'body' => array('//*[@id="comic"]/img'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,21 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "articleContents".
// strip: every h2, h4, script and ul, plus the text-size, relate-story and text-ads divs.
// NOTE(review): '//ul' matches all unordered lists, including any inside the article text.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bangkokpost.com/news/politics/704204/new-us-ambassador-arrives-in-bangkok',
'body' => array(
'//div[@class="articleContents"]',
),
'strip' => array(
'//h2',
'//h4',
'//div[@class="text-size"]',
'//div[@class="relate-story"]',
'//div[@class="text-ads"]',
'//script',
'//ul',
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  images whose class contains "img" and the div with class "text-column".
// strip: every <strong> element.
// NOTE(review): stripping all <strong> nodes also drops bold text inside the article — confirm intent.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://bgr.com/2015/09/27/iphone-6s-waterproof-testing/',
'body' => array(
'//img[contains(@class,"img")]',
'//div[@class="text-column"]',
),
'strip' => array(
'//strong',
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all '%.*%' key, one regex => replacement pair that
// replaces the literal text "-150x150" with the empty string.
// NOTE(review): presumably turns thumbnail image URLs into full-size ones — confirm.
return array(
'filter' => array(
'%.*%' => array(
'%-150x150%' => '',
),
),
);

View file

@ -0,0 +1,13 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body: the image at figure/div/a/img and paragraphs with class "content__segment".
// No 'strip' key is defined for this rule.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bizjournals.com/milwaukee/news/2015/09/30/bucks-will-hike-prices-on-best-seats-at-new-arena.html',
'body' => array(
'//figure/div/a/img',
'//p[@class="content__segment"]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  absolute path /html/body/ul, i.e. only <ul> elements that are direct children of <body>.
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blog.fefe.de/?ts=ad706a73',
'body' => array(
'/html/body/ul',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body: any div whose class contains "blog-post__content".
// No 'strip' key is defined for this rule.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blog.mapillary.com/update/2015/08/26/traffic-sign-updates.html',
'body' => array(
'//div[contains(@class, "blog-post__content")]',
),
),
),
);

View file

@ -0,0 +1,18 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  div elements whose style attribute is exactly "float:none".
// strip: four selectors, listed below.
// The first strip expression previously lacked the closing parenthesis of contains(),
// which made it a malformed XPath expression; it is now well-formed.
// NOTE(review): the strip selectors (bz_* classes/ids, attachment_table) look like
// Bugzilla markup rather than newspaper markup — confirm they are wanted here.
return array(
    'grabber' => array(
        '%.*%' => array(
            'test_url' => 'http://www.buenosairesherald.com/article/199344/manzur-named-next-governor-of-tucum%C3%A1n',
            'body' => array(
                '//div[@style="float:none"]',
            ),
            'strip' => array(
                '//div[contains(@class, "bz_alias_short_desc_container")]',
                '//td[@id="bz_show_bug_column_1"]',
                '//table[@id="attachment_table"]',
                '//table[@class="bz_comment_table"]',
            ),
        ),
    ),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "comic-table".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.bunicomic.com/comic/buni-623/',
'body' => array(
'//div[@class="comic-table"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with id "comic" and the div with class "post-comic".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://buttersafe.com/2015/04/21/the-incredible-flexible-man/',
'body' => array(
'//div[@id="comic"]',
'//div[@class="post-comic"]',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,13 @@
<?php
// Content-grabber rule set. The key regex requires "/cad/" followed by at least one character.
// body:  img children of the element with id "content".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%/cad/.+%' => array(
'test_url' => 'http://www.cad-comic.com/cad/20150417',
'body' => array(
'//*[@id="content"]/img',
),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with id "comic".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://chaoslife.findchaos.com/pets-in-the-wild',
'body' => array('//div[@id="comic"]'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Content-grabber rule set. The key regex matches strings containing "/comic".
// body:  a single XPath union (|) selecting the image inside div.comicImg, whether it is a
//        direct child or wrapped in an anchor.
// strip: nothing (empty list).
return array(
'grabber' => array(
'%/comic.*%' => array(
'test_url' => 'http://cliquerefresh.com/comic/078-stating-the-obvious/',
'body' => array('//div[@class="comicImg"]/img | //div[@class="comicImg"]/a/img'),
'strip' => array(),
),
),
);

View file

@ -0,0 +1,38 @@
<?php
// Content-grabber rule set with two key regexes: one anchored on a leading "/products",
// and the catch-all '%.*%'.
// NOTE(review): the specific pattern is listed first, presumably because the first
// matching pattern wins — confirm in the rule loader.
//
// The first 'body' expression of the products rule was not valid XPath 1.0:
//   - contains( was never closed,
//   - "||" was used where the XPath union operator is a single "|",
//   - (promoFigure) was a parenthesised node-set, not the string "promoFigure".
// It is now a well-formed union of the two intended selectors.
return array(
    'grabber' => array(
        '%^/products.*%' => array(
            'test_url' => 'http://www.cnet.com/products/fibaro-flood-sensor/#ftag=CADf328eec',
            'body' => array(
                '//li[contains(@class,"slide first")] | //figure[contains(@class,"promoFigure")]',
                '//div[@class="quickInfo"]',
                '//div[@class="col-6 ratings"]',
                '//div[@id="editorReview"]',
            ),
            // Same strip list as the catch-all rule below.
            'strip' => array(
                '//script',
                '//a[@class="clickToEnlarge"]',
                '//div[@section="topSharebar"]',
                '//div[contains(@class,"related")]',
                '//div[contains(@class,"ad-")]',
                '//div[@section="shortcodeGallery"]',
            ),
        ),
        '%.*%' => array(
            'test_url' => 'http://cnet.com.feedsportal.com/c/34938/f/645093/s/4a340866/sc/28/l/0L0Scnet0N0Cnews0Cman0Eclaims0Eonline0Epsychic0Emade0Ehim0Ebuy0E10Emillion0Epowerball0Ewinning0Eticket0C0Tftag0FCAD590Aa51e/story01.htm',
            'body' => array(
                '//p[@itemprop="description"]',
                '//div[@itemprop="articleBody"]',
            ),
            'strip' => array(
                '//script',
                '//a[@class="clickToEnlarge"]',
                '//div[@section="topSharebar"]',
                '//div[contains(@class,"related")]',
                '//div[contains(@class,"ad-")]',
                '//div[@section="shortcodeGallery"]',
            ),
        ),
    ),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  any div whose id contains "newscontent".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://consomac.fr/news-2430-l-iphone-6-toujours-un-secret-bien-garde.html',
'body' => array(
'//div[contains(@id, "newscontent")]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all '%.*%' key, one regex => replacement pair.
// The regex matches a title="..." attribute followed by optional spaces and the closing
// slash-bracket of a tag; the replacement closes the tag, then emits <br/> and the
// captured title text ($1).
// NOTE(review): (.+) is greedy, so several title attributes in one string collapse into a
// single match. Presumably meant to show image hover text as a caption — confirm.
return array(
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,19 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the figure with id "image-top-1" and the div with id "story-body".
// strip: scripts, the "hide caption" image, promo links and the embed column.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.csmonitor.com/USA/Politics/2015/0925/John-Boehner-steps-down-Self-sacrificing-but-will-it-lead-to-better-government',
'body' => array(
'//figure[@id="image-top-1"]',
'//div[@id="story-body"]',
),
'strip' => array(
'//script',
'//img[@title="hide caption"]',
'//*[contains(@class,"promo_link")]',
'//div[@id="story-embed-column"]',
),
),
),
);

View file

@ -0,0 +1,20 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with id "post".
// strip: the post heading, meta block, AddThis sharing widgets, navigation and related section.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dailyjs.com/2014/08/07/p5js/',
'body' => array(
'//div[@id="post"]',
),
'strip' => array(
'//h2[@class="post"]',
'//div[@class="meta"]',
'//*[contains(@class, "addthis_toolbox")]',
'//*[contains(@class, "addthis_default_style")]',
'//*[@class="navigation small"]',
'//*[@id="related"]',
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  any div whose class contains "entry-content".
// strip: the div with class "dmcss_login_form" and any element whose class contains "sharedaddy".
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dailyreporter.com/2016/01/09/us-supreme-court-case-could-weaken-government-workers-unions/',
'body' => array(
'//div[contains(@class, "entry-content")]',
),
'strip' => array(
'//div[@class="dmcss_login_form"]',
'//*[contains(@class, "sharedaddy")]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body: the NewsBodyImage div plus the spans with ids "lblSummary" and "lblBody".
// No 'strip' key is defined for this rule.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.dailytech.com/Apples+First+Fixes+to+iOS+9+Land+w+iOS++901+Release/article37495.htm',
'body' => array(
'//div[@class="NewsBodyImage"]',
'//span[@id="lblSummary"]',
'//span[@id="lblBody"]',
),
),
),
);

View file

@ -0,0 +1,15 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "contenu".
// strip: any div whose class contains "a2a".
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.degroupnews.com/medias/vodsvod/amazon-concurrence-la-chromecast-de-google-avec-fire-tv-stick',
'body' => array(
'//div[@class="contenu"]',
),
'strip' => array(
'//div[contains(@class, "a2a")]',
),
),
),
);

View file

@ -0,0 +1,15 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "copytext" and the ul with id "media-list".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://derstandard.at/2000010267354/The-Witcher-3-Hohe-Hardware-Anforderungen-fuer-PC-Spieler?ref=rss',
'body' => array(
'//div[@class="copytext"]',
'//ul[@id="media-list"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body: the image whose class attribute is exactly "img-responsive img-comic".
// No 'strip' key is defined for this rule.
return array(
'grabber' => array(
'%.*%' => array(
'body' => array(
'//img[@class="img-responsive img-comic"]',
),
'test_url' => 'http://dilbert.com/strip/2016-01-28',
),
),
);

View file

@ -0,0 +1,18 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "entry".
// strip: every h1, plus the meta, shareIcons and navigation divs.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://blogs.discovermagazine.com/the-extremo-files/2015/09/11/have-scientists-found-the-worlds-deepest-fish/',
'body' => array(
'//div[@class="entry"]',
),
'strip' => array(
'//h1',
'//div[@class="meta"]',
'//div[@class="shareIcons"]',
'//div[@class="navigation"]',
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  td elements with class "NewsText", restricted by the [1] position predicate.
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://distrowatch.com/?newsid=08355',
'body' => array(
'//td[@class="NewsText"][1]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the divs with class "joke", "story-cover" and "story-content".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://dozodomo.com/bento/2014/03/04/lart-des-maki-de-takayo-kiyota/',
'body' => array(
'//div[@class="joke"]',
'//div[@class="story-cover"]',
'//div[@class="story-content"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
// Combined rule set with both a 'grabber' and a 'filter' section, each under the catch-all '%.*%' key.
// grabber: body is the image with id "comicimage"; strip is empty.
// filter:  regex => replacement pair that matches title="..." before a tag's closing
//          slash-bracket, closes the tag, then emits <br/> and the captured title text ($1).
// NOTE(review): presumably the filter shows the comic's hover text as a caption — confirm.
return array(
'grabber' => array(
'%.*%' => array(
'body' => array('//img[@id="comicimage"]'),
'strip' => array(),
'test_url' => 'http://drawingboardcomic.com/index.php?comic=208',
),
),
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all '%.*%' key, one regex => replacement pair that
// replaces the literal text "-150x150" with the empty string.
// NOTE(review): presumably turns thumbnail image URLs into full-size ones — confirm.
return array(
'filter' => array(
'%.*%' => array(
'%-150x150%' => '',
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  div children of div#page_body whose class attribute is exactly "container@m-".
//        The "@" there is inside a quoted string, so it is a literal character of the
//        class name, not an XPath attribute reference.
// strip: aside elements with role "banner".
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.engadget.com/2015/04/20/dark-matter-discovery/?ncid=rss_truncated',
'body' => array('//div[@id="page_body"]/div[@class="container@m-"]'),
'strip' => array('//aside[@role="banner"]'),
),
),
);

View file

@ -0,0 +1,46 @@
<?php
// Content-grabber rule set with one key regex per section path; there is no catch-all key.
// Seven rules cover comic series under /articles/view/comicsandcosplay, each with its own
// image XPath; the last rule covers /articles/view/video-games/columns/. Every rule has
// an empty 'strip' list.
// NOTE(review): the comic test_url values contain "&amp;" (HTML-escaped ampersands) inside
// a plain PHP string — confirm whether a bare "&" was intended.
return array(
'grabber' => array(
'%/articles/view/comicsandcosplay/comics/critical-miss.*%' => array(
'body' => array('//*[@class="body"]/span/img | //div[@class="folder_nav_links"]/following::p'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/critical-miss/13776-Critical-Miss-on-Framerates?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/comicsandcosplay/comics/namegame.*%' => array(
// Excludes images whose height attribute equals "120".
'body' => array('//*[@class="body"]/span/p/img[@height != "120"]'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/namegame/9759-Leaving-the-Nest?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/comicsandcosplay/comics/stolen-pixels.*%' => array(
'body' => array('//*[@class="body"]/span/p[2]/img'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/stolen-pixels/8866-Stolen-Pixels-258-Where-the-Boys-Are?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/comicsandcosplay/comics/bumhugparade.*%' => array(
'body' => array('//*[@class="body"]/span/p[2]/img'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/bumhugparade/8262-Bumhug-Parade-13?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
// Unlike its siblings, this pattern allows arbitrary text between "comicsandcosplay"
// and "/comics/" and has no trailing ".*".
'%/articles/view/comicsandcosplay.*/comics/escapistradiotheater%' => array(
'body' => array('//*[@class="body"]/span/p[2]/img'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/escapistradiotheater/8265-The-Escapist-Radio-Theater-13?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/comicsandcosplay/comics/paused.*%' => array(
'body' => array('//*[@class="body"]/span/p[2]/img | //*[@class="body"]/span/div/img'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/paused/8263-Paused-16?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/comicsandcosplay/comics/fraughtwithperil.*%' => array(
'body' => array('//*[@class="body"]'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/comicsandcosplay/comics/fraughtwithperil/12166-The-Escapist-Presents-Escapist-Comics-Critical-Miss-B-lyeh-Fhlop?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=articles',
'strip' => array(),
),
'%/articles/view/video-games/columns/.*%' => array(
'body' => array('//*[@id="article_content"]'),
'test_url' => 'http://www.escapistmagazine.com/articles/view/video-games/columns/experienced-points/13971-What-50-Shades-and-Batman-Have-in-Common.2',
'strip' => array(),
),
),
);

View file

@ -0,0 +1,12 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body: every <p> element in the document.
// No 'strip' key is defined for this rule.
// NOTE(review): '//p' is very broad and will include paragraphs outside the article — confirm intent.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://espn.go.com/nfl/story/_/id/13388208/jason-whitlock-chip-kelly-controversy',
'body' => array(
'//p',
),
),
),
);

View file

@ -0,0 +1,16 @@
<?php
// Combined rule set with both a 'grabber' and a 'filter' section, each under the catch-all '%.*%' key.
// grabber: body is the img child of the anchor with class "comic"; strip is empty.
// filter:  regex => replacement pair that matches title="..." before a tag's closing
//          slash-bracket, closes the tag, then emits <br/> and the captured title text ($1).
// NOTE(review): presumably the filter shows the comic's hover text as a caption — confirm.
return array(
'grabber' => array(
'%.*%' => array(
'body' => array('//a[@class="comic"]/img'),
'strip' => array(),
'test_url' => 'http://www.exocomics.com/379',
),
),
'filter' => array(
'%.*%' => array(
'%title="(.+)" */>%' => '/><br/>$1',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with id "comic-container".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://explosm.net/comics/3803/',
'body' => array(
'//div[@id="comic-container"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,9 @@
<?php
// Filter rule set: under the catch-all '%.*%' key, one regex => replacement pair that
// replaces the literal text "-150x150" with the empty string.
// NOTE(review): presumably turns thumbnail image URLs into full-size ones — confirm.
return array(
'filter' => array(
'%.*%' => array(
'%-150x150%' => '',
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  article elements whose class contains the substring "body prose"
//        (both words, in that order, separated by one space).
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.fastcodesign.com/3026548/exposure/peek-inside-the-worlds-forbidden-subway-tunnels',
'body' => array(
'//article[contains(@class, "body prose")]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  article elements whose class contains the substring "body prose"
//        (both words, in that order, separated by one space).
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.fastcoexist.com/3026114/take-a-seat-on-this-gates-funded-future-toilet-that-will-change-how-we-think-about-poop',
'body' => array(
'//article[contains(@class, "body prose")]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  article elements whose class contains the substring "body prose"
//        (both words, in that order, separated by one space).
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.fastcompany.com/3026712/fast-feed/elon-musk-an-apple-tesla-merger-is-very-unlikely',
'body' => array(
'//article[contains(@class, "body prose")]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,14 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "news_body".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.ffworld.com/?rub=news&page=voir&id=2709',
'body' => array(
'//div[@class="news_body"]',
),
'strip' => array(
),
),
),
);

View file

@ -0,0 +1,22 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  every <article> element.
// strip: category and right-column divs, every h1, article meta, side panel, any element
//        whose class contains "share-" or whose id contains "taboola-", and the comments div.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://foreignpolicy.com/2016/01/09/networking-giant-pulls-nsa-linked-code-exploited-by-hackers/',
'body' => array(
'//article',
),
'strip' => array(
'//div[@id="post-category"]',
'//div[@id="desktop-right"]',
'//h1',
'//section[@class="article-meta"]',
'//div[@class="side-panel-wrapper"]',
'//*[contains(@class, "share-")]',
'//*[contains(@id, "taboola-")]',
'//div[@class="comments"]',
),
),
),
);

View file

@ -0,0 +1,19 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the div with class "entry-inner".
// strip: three AddThis toolbars and two "code-block" containers, each matched by its
//        complete class attribute value.
// NOTE(review): @class="..." is an exact string comparison, so these selectors stop
// matching if the site adds, removes or reorders any class name.
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://fossbytes.com/fbi-hacked-1000-computers-to-shut-down-largest-child-pornography-site-on-the-dark-web/',
'body' => array(
'//div[@class="entry-inner"]',
),
'strip' => array(
'//*[@class="at-above-post addthis_default_style addthis_toolbox at-wordpress-hide"]',
'//*[@class="at-below-post addthis_default_style addthis_toolbox at-wordpress-hide"]',
'//*[@class="at-below-post-recommended addthis_default_style addthis_toolbox at-wordpress-hide"]',
'//*[@class="code-block code-block-12 ai-desktop"]',
'//*[@class="code-block code-block-13 ai-tablet-phone"]',
),
),
),
);

View file

@ -0,0 +1,11 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  a single XPath union (|) of the element with id "comic" and elements with class "post-image".
// strip: nothing (empty list).
return array(
'grabber' => array(
'%.*%' => array(
'body' => array('//*[@id="comic"] | //*[@class="post-image"]'),
'strip' => array(),
'test_url' => 'http://www.fowllanguagecomics.com/comic/working-out/',
),
),
);

View file

@ -0,0 +1,17 @@
<?php
// Content-grabber rule set; the '%.*%' key is a catch-all regex.
// body:  the figure and article children of div.articleinfo, plus spans with class "by".
// strip: spans with class "red".
return array(
'grabber' => array(
'%.*%' => array(
'test_url' => 'http://www.geek.com/news/the-11-best-ways-to-eat-eggs-1634076/',
'body' => array(
'//div[@class="articleinfo"]/figure',
'//div[@class="articleinfo"]/article',
'//span[@class="by"]',
),
'strip' => array(
'//span[@class="red"]',
),
),
),
);

Some files were not shown because too many files have changed in this diff Show more