Cleanup

2016-04-18 09:56:52 +02:00 · 2016-04-18 09:56:52 +02:00 · 9947a5f033
commit 9947a5f033
parent cb491341df
391 changed files with 0 additions and 15712 deletions
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/CandidateParser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/CandidateParser.php
@ -1,273 +0,0 @@
-<?php
-
-namespace PicoFeed\Scraper;
-
-use DomDocument;
-use DOMXPath;
-use PicoFeed\Logging\Logger;
-use PicoFeed\Parser\XmlParser;
-
-/**
- * Candidate Parser.
- *
- * @author  Frederic Guillot
- */
-class CandidateParser implements ParserInterface
-{
-    /**
-     * Document returned by XmlParser::getHtmlDocument() for the HTML given to the constructor.
-     */
-    private $dom;
-
-    /**
-     * XPath evaluator bound to $dom.
-     *
-     * @var DOMXPath
-     */
-    private $xpath;
-
-    /**
-     * Class/id values probed to locate the article container.
-     *
-     * The list is walked top to bottom and the first hit wins, so specific
-     * names come first and generic ones last.
-     *
-     * @var array
-     */
-    private $candidatesAttributes = array(
-        'articleBody',
-        'articlebody',
-        'article-body',
-        'articleContent',
-        'articlecontent',
-        'article-content',
-        'articlePage',
-        'post-content',
-        'post_content',
-        'entry-content',
-        'entry-body',
-        'main-content',
-        'story_content',
-        'storycontent',
-        'entryBox',
-        'entrytext',
-        'comic',
-        'post',
-        'article',
-        'content',
-        'main',
-    );
-
-    /**
-     * Substrings looked for in class/id values; matching elements are stripped.
-     *
-     * @var array
-     */
-    private $stripAttributes = array(
-        'comment',
-        'share',
-        'links',
-        'toolbar',
-        'fb',
-        'footer',
-        'credit',
-        'bottom',
-        'nav',
-        'header',
-        'social',
-        'tag',
-        'metadata',
-        'entry-utility',
-        'related-posts',
-        'tweet',
-        'categories',
-        'post_title',
-        'by_line',
-        'byline',
-        'sponsors',
-    );
-
-    /**
-     * Element names that are always stripped.
-     *
-     * @var array
-     */
-    private $stripTags = array(
-        'nav',
-        'header',
-        'footer',
-        'aside',
-        'form',
-    );
-
-    /**
-     * Build the DOM and the XPath evaluator for the given markup.
-     *
-     * @param string $html
-     */
-    public function __construct($html)
-    {
-        $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
-
-        $this->dom = $document;
-        $this->xpath = new DOMXPath($document);
-    }
-
-    /**
-     * Extract the relevant content.
-     *
-     * Tries the candidate attributes first, falls back to the <article/> tag
-     * when that yields fewer than 200 bytes, then to <body/> when the result
-     * is still under 50 bytes. The winner is cleaned by stripGarbage().
-     *
-     * @return string
-     */
-    public function execute()
-    {
-        $extracted = $this->findContentWithCandidates();
-
-        if (strlen($extracted) < 200) {
-            $extracted = $this->findContentWithArticle();
-        }
-
-        if (strlen($extracted) < 50) {
-            $extracted = $this->findContentWithBody();
-        }
-
-        return $this->stripGarbage($extracted);
-    }
-
-    /**
-     * Serialize the first element matching one of the candidate attributes.
-     *
-     * An element matches when its class contains the candidate or its id equals
-     * it, and its class contains neither "nav" nor "page".
-     *
-     * @return string Serialized element, or an empty string when nothing matches
-     */
-    public function findContentWithCandidates()
-    {
-        foreach ($this->candidatesAttributes as $candidate) {
-            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
-
-            $match = $this->firstNode('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
-
-            if ($match === null) {
-                continue;
-            }
-
-            Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"');
-
-            return $this->dom->saveXML($match);
-        }
-
-        return '';
-    }
-
-    /**
-     * Serialize the first <article/> element of the document.
-     *
-     * @return string Serialized element, or an empty string when there is none
-     */
-    public function findContentWithArticle()
-    {
-        $article = $this->firstNode('//article');
-
-        if ($article === null) {
-            return '';
-        }
-
-        Logger::setMessage(get_called_class().': Find <article/> tag');
-
-        return $this->dom->saveXML($article);
-    }
-
-    /**
-     * Serialize the first <body/> element of the document.
-     *
-     * @return string Serialized element, or an empty string when there is none
-     */
-    public function findContentWithBody()
-    {
-        $body = $this->firstNode('//body');
-
-        if ($body === null) {
-            return '';
-        }
-
-        Logger::setMessage(get_called_class().' Find <body/>');
-
-        return $this->dom->saveXML($body);
-    }
-
-    /**
-     * Remove blacklisted tags and attributes from an extracted fragment.
-     *
-     * The fragment is returned untouched when XmlParser::getDomDocument()
-     * reports failure by returning false.
-     *
-     * @param string $content
-     *
-     * @return string
-     */
-    public function stripGarbage($content)
-    {
-        $fragment = XmlParser::getDomDocument($content);
-
-        if ($fragment === false) {
-            return $content;
-        }
-
-        $fragmentXpath = new DOMXPath($fragment);
-
-        $this->stripTags($fragmentXpath);
-        $this->stripAttributes($fragment, $fragmentXpath);
-
-        return $fragment->saveXML($fragment->documentElement);
-    }
-
-    /**
-     * Remove every element whose name is listed in $stripTags.
-     *
-     * @param DOMXPath $xpath
-     */
-    public function stripTags(DOMXPath $xpath)
-    {
-        foreach ($this->stripTags as $tagName) {
-            $elements = $xpath->query('//'.$tagName);
-
-            if ($elements === false || $elements->length < 1) {
-                continue;
-            }
-
-            Logger::setMessage(get_called_class().': Strip tag: "'.$tagName.'"');
-
-            foreach ($elements as $element) {
-                $element->parentNode->removeChild($element);
-            }
-        }
-    }
-
-    /**
-     * Remove elements whose class or id contains an entry of $stripAttributes.
-     *
-     * Each match is removed only if shouldRemove() agrees.
-     *
-     * @param DomDocument $dom
-     * @param DOMXPath    $xpath
-     */
-    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
-    {
-        foreach ($this->stripAttributes as $needle) {
-            $elements = $xpath->query('//*[contains(@class, "'.$needle.'") or contains(@id, "'.$needle.'")]');
-
-            if ($elements === false || $elements->length < 1) {
-                continue;
-            }
-
-            Logger::setMessage(get_called_class().': Strip attribute: "'.$needle.'"');
-
-            foreach ($elements as $element) {
-                if (!$this->shouldRemove($dom, $element)) {
-                    continue;
-                }
-
-                $element->parentNode->removeChild($element);
-            }
-        }
-    }
-
-    /**
-     * Tell whether a node may be removed from the document.
-     *
-     * A node is kept (false) when its text accounts for 90% or more of the
-     * document's text, measured in bytes. An empty document always answers true.
-     *
-     * @param DomDocument $dom
-     * @param DomNode     $node
-     *
-     * @return bool
-     */
-    public function shouldRemove(DomDocument $dom, $node)
-    {
-        $totalLength = strlen($dom->textContent);
-
-        if ($totalLength === 0) {
-            return true;
-        }
-
-        $ratio = strlen($node->textContent) * 100 / $totalLength;
-
-        if ($ratio < 90) {
-            return true;
-        }
-
-        Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
-
-        return false;
-    }
-
-    /**
-     * Run an XPath expression on the source document and return its first match.
-     *
-     * @param string $expression
-     *
-     * @return mixed First matching node, or null when the query fails or is empty
-     */
-    private function firstNode($expression)
-    {
-        $nodes = $this->xpath->query($expression);
-
-        if ($nodes === false || $nodes->length < 1) {
-            return null;
-        }
-
-        return $nodes->item(0);
-    }
-}
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/ParserInterface.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/ParserInterface.php
@ -1,13 +0,0 @@
-<?php
-
-namespace PicoFeed\Scraper;
-
-/**
- * Contract shared by the scraper's content parsers.
- *
- * An implementation is built around an HTML document and exposes a single
- * entry point that returns the extracted content.
- */
-interface ParserInterface
-{
-    /**
-     * Execute the parser and return the contents.
-     *
-     * @return string Extracted content
-     */
-    public function execute();
-}
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
@ -1,122 +0,0 @@
-<?php
-
-namespace PicoFeed\Scraper;
-
-use PicoFeed\Logging\Logger;
-use PicoFeed\Config\Config;
-
-/**
- * RuleLoader class.
- *
- * @author  Frederic Guillot
- * @author  Bernhard Posselt
- */
-class RuleLoader
-{
-    /**
-     * Config object.
-     *
-     * @var \PicoFeed\Config\Config
-     */
-    private $config;
-
-    /**
-     * Constructor.
-     *
-     * @param \PicoFeed\Config\Config $config Config class instance
-     */
-    public function __construct(Config $config)
-    {
-        $this->config = $config;
-    }
-
-    /**
-     * Get the rules for an URL.
-     *
-     * The hostname of the URL is turned into a list of candidate file names,
-     * then every rules folder is searched in order; the first non-empty rule
-     * set wins.
-     *
-     * @param string $url the URL that should be looked up
-     *
-     * @return array the array containing the rules, empty when the URL has no
-     *               usable hostname or no rule file matches
-     */
-    public function getRules($url)
-    {
-        $hostname = parse_url($url, PHP_URL_HOST);
-
-        // parse_url() answers false for a malformed URL and null when the URL
-        // has no host component; neither can be mapped to a rule file.
-        if (!is_string($hostname) || $hostname === '') {
-            return array();
-        }
-
-        $files = $this->getRulesFileList($hostname);
-
-        foreach ($this->getRulesFolders() as $folder) {
-            $rule = $this->loadRuleFile($folder, $files);
-
-            if (!empty($rule)) {
-                return $rule;
-            }
-        }
-
-        return array();
-    }
-
-    /**
-     * Get the list of possible rules file names for a given hostname.
-     *
-     * The full hostname always comes first. With three or more labels the
-     * first label is treated as the subdomain; with exactly two labels the
-     * first label is the bare domain name.
-     *
-     * @param string $hostname Hostname
-     *
-     * @return array
-     */
-    public function getRulesFileList($hostname)
-    {
-        $files = array($hostname);                 // subdomain.domain.tld
-        $parts = explode('.', $hostname);
-        $len = count($parts);
-
-        if ($len > 2) {
-            $subdomain = array_shift($parts);
-            $files[] = implode('.', $parts);       // domain.tld
-            $files[] = '.'.implode('.', $parts);   // .domain.tld
-            $files[] = $subdomain;                 // subdomain
-        } elseif ($len === 2) {
-            $files[] = '.'.implode('.', $parts);    // .domain.tld
-            $files[] = $parts[0];                   // domain
-        }
-
-        return $files;
-    }
-
-    /**
-     * Load a rule file from the defined folder.
-     *
-     * The first candidate that exists as "<folder>/<name>.php" is included and
-     * whatever that file returns is handed back to the caller.
-     *
-     * @param string $folder Rule directory
-     * @param array  $files  List of possible file names
-     *
-     * @return array Value returned by the included file, or an empty array
-     *               when none of the candidates exists
-     */
-    public function loadRuleFile($folder, array $files)
-    {
-        foreach ($files as $file) {
-            $filename = $folder.'/'.$file.'.php';
-            if (file_exists($filename)) {
-                Logger::setMessage(get_called_class().' Load rule: '.$file);
-
-                return include $filename;
-            }
-        }
-
-        return array();
-    }
-
-    /**
-     * Get the list of folders that contains rules.
-     *
-     * The bundled "../Rules" folder is always first; the folder configured
-     * through getGrabberRulesFolder() is appended when it is set.
-     *
-     * @return array
-     */
-    public function getRulesFolders()
-    {
-        $folders = array(__DIR__.'/../Rules');
-
-        if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
-            $folders[] = $this->config->getGrabberRulesFolder();
-        }
-
-        return $folders;
-    }
-}
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleParser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/RuleParser.php
@ -1,83 +0,0 @@
-<?php
-
-namespace PicoFeed\Scraper;
-
-use DOMXPath;
-use PicoFeed\Parser\XmlParser;
-
-/**
- * Rule Parser.
- *
- * @author  Frederic Guillot
- */
-class RuleParser implements ParserInterface
-{
-    /**
-     * Document returned by XmlParser::getHtmlDocument() for the HTML given to the constructor.
-     */
-    private $dom;
-
-    /**
-     * XPath evaluator bound to $dom.
-     *
-     * @var DOMXPath
-     */
-    private $xpath;
-
-    /**
-     * Rule set; the "strip" and "body" entries are read as lists of XPath expressions.
-     *
-     * @var array
-     */
-    private $rules = array();
-
-    /**
-     * Store the rules and build the DOM and XPath evaluator for the markup.
-     *
-     * @param string $html
-     * @param array  $rules
-     */
-    public function __construct($html, array $rules)
-    {
-        $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
-
-        $this->rules = $rules;
-        $this->dom = $document;
-        $this->xpath = new DOMXPath($document);
-    }
-
-    /**
-     * Strip the unwanted nodes, then return the content selected by the rules.
-     *
-     * @return string
-     */
-    public function execute()
-    {
-        $this->stripTags();
-
-        return $this->findContent();
-    }
-
-    /**
-     * Remove every node matched by the XPath expressions of the "strip" rule.
-     *
-     * Does nothing when the rule is missing or is not an array.
-     */
-    public function stripTags()
-    {
-        if (!isset($this->rules['strip']) || !is_array($this->rules['strip'])) {
-            return;
-        }
-
-        foreach ($this->rules['strip'] as $expression) {
-            $matches = $this->xpath->query($expression);
-
-            if ($matches === false || $matches->length < 1) {
-                continue;
-            }
-
-            foreach ($matches as $match) {
-                $match->parentNode->removeChild($match);
-            }
-        }
-    }
-
-    /**
-     * Concatenate the serialized nodes matched by the "body" rule expressions.
-     *
-     * @return string Empty when the rule is missing, not an array, or matches nothing
-     */
-    public function findContent()
-    {
-        $fragments = array();
-
-        if (!isset($this->rules['body']) || !is_array($this->rules['body'])) {
-            return '';
-        }
-
-        foreach ($this->rules['body'] as $expression) {
-            $matches = $this->xpath->query($expression);
-
-            if ($matches === false || $matches->length < 1) {
-                continue;
-            }
-
-            foreach ($matches as $match) {
-                $fragments[] = $this->dom->saveXML($match);
-            }
-        }
-
-        return implode('', $fragments);
-    }
-}
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Scraper/Scraper.php
@ -1,341 +0,0 @@
-<?php
-
-namespace PicoFeed\Scraper;
-
-use PicoFeed\Client\Client;
-use PicoFeed\Client\ClientException;
-use PicoFeed\Client\Url;
-use PicoFeed\Config\Config;
-use PicoFeed\Encoding\Encoding;
-use PicoFeed\Filter\Filter;
-use PicoFeed\Logging\Logger;
-use PicoFeed\Parser\XmlParser;
-
-/**
- * Scraper class.
- *
- * @author  Frederic Guillot
- */
-class Scraper
-{
-    /**
-     * URL.
-     *
-     * @var string
-     */
-    private $url = '';
-
-    /**
-     * Relevant content.
-     *
-     * @var string
-     */
-    private $content = '';
-
-    /**
-     * HTML content.
-     *
-     * @var string
-     */
-    private $html = '';
-
-    /**
-     * HTML content encoding.
-     *
-     * @var string
-     */
-    private $encoding = '';
-
-    /**
-     * Flag to enable candidates parsing.
-     *
-     * @var bool
-     */
-    private $enableCandidateParser = true;
-
-    /**
-     * Config object.
-     *
-     * @var \PicoFeed\Config\Config
-     */
-    private $config;
-
-    /**
-     * Constructor.
-     *
-     * Also forwards the configured timezone to the logger.
-     *
-     * @param \PicoFeed\Config\Config $config Config class instance
-     */
-    public function __construct(Config $config)
-    {
-        $this->config = $config;
-        Logger::setTimezone($this->config->getTimezone());
-    }
-
-    /**
-     * Disable candidates parsing.
-     *
-     * @return Scraper
-     */
-    public function disableCandidateParser()
-    {
-        $this->enableCandidateParser = false;
-
-        return $this;
-    }
-
-    /**
-     * Get encoding.
-     *
-     * @return string
-     */
-    public function getEncoding()
-    {
-        return $this->encoding;
-    }
-
-    /**
-     * Set encoding.
-     *
-     * @param string $encoding
-     *
-     * @return Scraper
-     */
-    public function setEncoding($encoding)
-    {
-        $this->encoding = $encoding;
-
-        return $this;
-    }
-
-    /**
-     * Get URL to download.
-     *
-     * @return string
-     */
-    public function getUrl()
-    {
-        return $this->url;
-    }
-
-    /**
-     * Set URL to download.
-     *
-     * @param string $url URL
-     *
-     * @return Scraper
-     */
-    public function setUrl($url)
-    {
-        $this->url = $url;
-
-        return $this;
-    }
-
-    /**
-     * Return true if the scraper found relevant content.
-     *
-     * @return bool
-     */
-    public function hasRelevantContent()
-    {
-        return !empty($this->content);
-    }
-
-    /**
-     * Get relevant content.
-     *
-     * @return string
-     */
-    public function getRelevantContent()
-    {
-        return $this->content;
-    }
-
-    /**
-     * Get raw content (unfiltered).
-     *
-     * @return string
-     */
-    public function getRawContent()
-    {
-        return $this->html;
-    }
-
-    /**
-     * Set raw content (unfiltered).
-     *
-     * @param string $html
-     *
-     * @return Scraper
-     */
-    public function setRawContent($html)
-    {
-        $this->html = $html;
-
-        return $this;
-    }
-
-    /**
-     * Get filtered relevant content.
-     *
-     * The relevant content is passed through the HTML filter, configured with
-     * this scraper's config and URL.
-     *
-     * @return string
-     */
-    public function getFilteredContent()
-    {
-        $filter = Filter::html($this->content, $this->url);
-        $filter->setConfig($this->config);
-
-        return $filter->execute();
-    }
-
-    /**
-     * Download the HTML content.
-     *
-     * On success the URL, raw HTML and encoding are replaced by the values the
-     * HTTP client reports. A ClientException is logged, not propagated.
-     *
-     * @return bool true when the download succeeded, false when the URL is
-     *              empty or the client raised a ClientException
-     */
-    public function download()
-    {
-        if (!empty($this->url)) {
-
-            // Clear everything
-            $this->html = '';
-            $this->content = '';
-            $this->encoding = '';
-
-            try {
-                $client = Client::getInstance();
-                $client->setConfig($this->config);
-                $client->setTimeout($this->config->getGrabberTimeout());
-                $client->setUserAgent($this->config->getGrabberUserAgent());
-                $client->execute($this->url);
-
-                $this->url = $client->getUrl();
-                $this->html = $client->getContent();
-                $this->encoding = $client->getEncoding();
-
-                return true;
-            } catch (ClientException $e) {
-                Logger::setMessage(get_called_class().': '.$e->getMessage());
-            }
-        }
-
-        return false;
-    }
-
-    /**
-     * Execute the scraper.
-     *
-     * Downloads the page, then, unless skipProcessing() says otherwise,
-     * normalizes the HTML and stores the content found by the selected parser.
-     */
-    public function execute()
-    {
-        $this->download();
-
-        if (!$this->skipProcessing()) {
-            $this->prepareHtml();
-
-            $parser = $this->getParser();
-
-            if ($parser !== null) {
-                $this->content = $parser->execute();
-                Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
-            }
-        }
-    }
-
-    /**
-     * Returns true if the parsing must be skipped.
-     *
-     * Parsing is skipped when one of the detection handlers matches the URL or
-     * when there is no raw HTML to work on.
-     *
-     * @return bool
-     */
-    public function skipProcessing()
-    {
-        $handlers = array(
-            'detectStreamingVideos',
-            'detectPdfFiles',
-        );
-
-        foreach ($handlers as $handler) {
-            if ($this->$handler()) {
-                return true;
-            }
-        }
-
-        if (empty($this->html)) {
-            Logger::setMessage(get_called_class().': Raw HTML is empty');
-
-            return true;
-        }
-
-        return false;
-    }
-
-    /**
-     * Get the parser.
-     *
-     * When grabber rules exist for the URL, the first rule whose pattern
-     * matches the URL's full path yields a RuleParser; if none matches, no
-     * parser is returned. Without grabber rules a CandidateParser is returned,
-     * unless candidate parsing has been disabled.
-     *
-     * @return ParserInterface|null
-     */
-    public function getParser()
-    {
-        $ruleLoader = new RuleLoader($this->config);
-        $rules = $ruleLoader->getRules($this->url);
-
-        if (!empty($rules['grabber'])) {
-            Logger::setMessage(get_called_class().': Parse content with rules');
-
-            // The path does not depend on the rule being tested: compute it once.
-            $url = new Url($this->url);
-            $sub_url = $url->getFullPath();
-
-            foreach ($rules['grabber'] as $pattern => $rule) {
-                if (preg_match($pattern, $sub_url)) {
-                    Logger::setMessage(get_called_class().': Matched url '.$sub_url);
-
-                    return new RuleParser($this->html, $rule);
-                }
-            }
-        } elseif ($this->enableCandidateParser) {
-            Logger::setMessage(get_called_class().': Parse content with candidates');
-
-            return new CandidateParser($this->html);
-        }
-
-        return null;
-    }
-
-    /**
-     * Normalize encoding and strip head tag.
-     *
-     * The encoding declared in the HTML meta tag takes precedence over the one
-     * stored from the download.
-     */
-    public function prepareHtml()
-    {
-        $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);
-
-        $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
-        $this->html = Filter::stripHeadTags($this->html);
-
-        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
-    }
-
-    /**
-     * Return the Youtube embed player and skip processing.
-     *
-     * When the URL carries an 11-character video id after "v=", "v/", "vi=",
-     * "vi/" or "youtu.be/", the relevant content is set to an embed iframe.
-     *
-     * @return bool true when a video id was found
-     */
-    public function detectStreamingVideos()
-    {
-        // The dot of "youtu.be" is escaped so it only matches a literal dot.
-        if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
-            $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
-
-            return true;
-        }
-
-        return false;
-    }
-
-    /**
-     * Skip processing for PDF documents.
-     *
-     * The URL is considered a PDF when it, or its path component, ends with
-     * "pdf" in any letter case. Checking the path separately means a query
-     * string or fragment after the file name does not hide the extension.
-     *
-     * @return bool
-     */
-    public function detectPdfFiles()
-    {
-        if (strtolower(substr($this->url, -3)) === 'pdf') {
-            return true;
-        }
-
-        // parse_url() answers null when there is no path and false on a malformed URL.
-        $path = parse_url($this->url, PHP_URL_PATH);
-
-        return is_string($path) && strtolower(substr($path, -3)) === 'pdf';
-    }
-}