composer update

This commit is contained in:
Marcel Kapfer (mmk2410) 2016-12-30 00:04:12 +01:00
parent 9ac51e0523
commit 623395064f
279 changed files with 4458 additions and 16328 deletions

View file

@ -243,6 +243,16 @@ class CandidateParser implements ParserInterface
}
}
/**
 * Find link for next page of the article.
 *
 * This parser performs no next-page lookup, so it always reports that no
 * further page exists.
 *
 * @return string|null Always null for this parser
 */
public function findNextLink()
{
return null;
}
/**
* Return false if the node should not be removed.
*

View file

@ -10,4 +10,11 @@ interface ParserInterface
* @return string
*/
public function execute();
/**
 * Find link for next page of the article.
 *
 * @return string|null Link to the next page, or null when there is none
 */
public function findNextLink();
}

View file

@ -65,7 +65,6 @@ class RuleParser implements ParserInterface
public function findContent()
{
$content = '';
if (isset($this->rules['body']) && is_array($this->rules['body'])) {
foreach ($this->rules['body'] as $pattern) {
$nodes = $this->xpath->query($pattern);
@ -80,4 +79,24 @@ class RuleParser implements ParserInterface
return $content;
}
/**
 * Fetch next link based on Xpath rules.
 *
 * Evaluates each XPath expression of the `next_page` rule list in order and
 * returns the link carried by the first matching node that has a usable value.
 * Element nodes contribute their `href` attribute; attribute nodes (selected by
 * rules such as `//a/@href`) contribute their own value. Other node types and
 * empty values are skipped instead of being returned.
 *
 * @return string|null Link exactly as written in the document (trimmed), or
 *                     null when no rule yields a link
 */
public function findNextLink()
{
    if (! isset($this->rules['next_page']) || ! is_array($this->rules['next_page'])) {
        return null;
    }

    foreach ($this->rules['next_page'] as $pattern) {
        $nodes = $this->xpath->query($pattern);

        // query() returns false for an invalid expression; move on to the next rule.
        if ($nodes === false) {
            continue;
        }

        foreach ($nodes as $node) {
            $link = '';

            if ($node instanceof \DOMElement) {
                // getAttribute() yields '' when the element has no href.
                $link = $node->getAttribute('href');
            } elseif ($node instanceof \DOMAttr) {
                // The rule selected the attribute itself rather than its element.
                $link = $node->value;
            }

            $link = trim($link);

            // An empty link would be indistinguishable from "stay on this page".
            if ($link !== '') {
                return $link;
            }
        }
    }

    return null;
}
}

View file

@ -206,19 +206,31 @@ class Scraper extends Base
/**
 * Execute the scraper.
 *
 * Downloads the current URL, extracts its content with the selected parser
 * and, while the parser reports a next-page link, follows that link and
 * appends the following page's content. The combined result is stored in
 * $this->content.
 *
 * At most getMaxRecursions() next-page links are followed (25 when the
 * configuration provides no value).
 *
 * @param string $pageContent    Content gathered from previous pages; used by the recursion
 * @param int    $recursionDepth Number of next-page links already followed
 */
public function execute($pageContent = '', $recursionDepth = 0)
{
    $this->html = '';
    $this->encoding = '';
    // Keep what earlier pages produced so it is not lost if this page yields no parser.
    $this->content = $pageContent;

    $this->download();
    $this->prepareHtml();

    $parser = $this->getParser();

    if ($parser === null) {
        return;
    }

    $maxRecursions = $this->config->getMaxRecursions();
    if ($maxRecursions === null) {
        $maxRecursions = 25;
    }

    $pageContent .= $parser->execute();
    $this->content = $pageContent;

    if ($recursionDepth < $maxRecursions) {
        $nextLink = $parser->findNextLink();

        // Parsers may hand back an empty string for a link without target.
        if ($nextLink !== null && $nextLink !== '') {
            $nextUrl = Url::resolve($nextLink, $this->url);

            // A link resolving to the page just fetched would only duplicate its content.
            // NOTE(review): assumes $this->url holds the same string form Url::resolve() returns - confirm.
            if ($nextUrl !== $this->url) {
                $this->setUrl($nextUrl);
                $this->execute($pageContent, $recursionDepth + 1);

                // The innermost call logs the final length; do not repeat it while unwinding.
                return;
            }
        }
    }

    Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
}