composer update
This commit is contained in:
parent
9ac51e0523
commit
623395064f
279 changed files with 4458 additions and 16328 deletions
|
@ -243,6 +243,16 @@ class CandidateParser implements ParserInterface
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Find link for next page of the article.
 *
 * This parser does not look for pagination links: it always returns null.
 *
 * @return string|null Always null in this implementation
 */
public function findNextLink()
{
    return null;
}
|
||||
|
||||
/**
|
||||
* Return false if the node should not be removed.
|
||||
*
|
||||
|
|
|
@ -10,4 +10,11 @@ interface ParserInterface
|
|||
* @return string
|
||||
*/
|
||||
public function execute();
|
||||
|
||||
/**
|
||||
* Find link for next page of the article.
|
||||
*
|
||||
* @return string|null
|
||||
*/
|
||||
public function findNextLink();
|
||||
}
|
||||
|
|
|
@ -65,7 +65,6 @@ class RuleParser implements ParserInterface
|
|||
public function findContent()
|
||||
{
|
||||
$content = '';
|
||||
|
||||
if (isset($this->rules['body']) && is_array($this->rules['body'])) {
|
||||
foreach ($this->rules['body'] as $pattern) {
|
||||
$nodes = $this->xpath->query($pattern);
|
||||
|
@ -80,4 +79,24 @@ class RuleParser implements ParserInterface
|
|||
|
||||
return $content;
|
||||
}
|
||||
|
||||
/**
 * Fetch next link based on Xpath rules.
 *
 * The patterns listed in $this->rules['next_page'] are evaluated in order
 * and the first non-empty link found is returned as written in the
 * document (it is not resolved against any base URL here).
 *
 * A pattern may select either an element carrying a "href" attribute
 * (e.g. //a[@class="next"]) or an attribute node directly
 * (e.g. //a[@class="next"]/@href). Any other kind of node, and any node
 * whose link is empty, is skipped.
 *
 * @return string|null Link to the next page, or null when no rule yields one
 */
public function findNextLink()
{
    if (! isset($this->rules['next_page']) || ! is_array($this->rules['next_page'])) {
        return null;
    }

    foreach ($this->rules['next_page'] as $pattern) {
        // NOTE(review): $this->xpath is presumably a DOMXPath instance - confirm in the constructor
        $nodes = $this->xpath->query($pattern);

        // query() yields false when the expression cannot be evaluated
        if ($nodes === false) {
            continue;
        }

        foreach ($nodes as $node) {
            $link = '';

            if ($node instanceof \DOMElement) {
                // getAttribute() gives an empty string when the attribute is missing
                $link = $node->getAttribute('href');
            } elseif ($node instanceof \DOMAttr) {
                $link = $node->value;
            }

            $link = trim($link);

            // An empty link must not be reported: callers only compare against null
            if ($link !== '') {
                return $link;
            }
        }
    }

    return null;
}
|
||||
}
|
||||
|
|
|
@ -206,19 +206,31 @@ class Scraper extends Base
|
|||
/**
 * Execute the scraper.
 *
 * Downloads the current URL, runs the parser on it and, as long as the
 * parser reports a link to a next page, moves on to that page and appends
 * its content. The concatenated result is stored in $this->content.
 *
 * Pagination stops when any of the following happens:
 *  - the parser reports no (or an empty) next link,
 *  - the page limit is reached ($this->config->getMaxRecursions(), 25 when
 *    that value is null),
 *  - the next link points to a URL already fetched during this call,
 *  - no parser is available for a page (content gathered so far is kept).
 *
 * Side effect: $this->setUrl() is called for every page followed, so the
 * scraper is left pointing at the last page that was fetched.
 *
 * @param string $pageContent    Content already gathered, new content is appended to it
 * @param int    $recursionDepth Number of pages already followed before this call
 */
public function execute($pageContent = '', $recursionDepth = 0)
{
    $maxRecursions = $this->config->getMaxRecursions();

    if ($maxRecursions === null) {
        $maxRecursions = 25;
    }

    $visited = array();
    $parsed = false;

    while (true) {
        $this->html = '';
        $this->encoding = '';
        $this->content = '';

        $visited[(string) $this->url] = true;

        $this->download();
        $this->prepareHtml();

        // NOTE(review): download() may change $this->url (e.g. on redirect) - confirm;
        // recording it again is harmless if it does not
        $visited[(string) $this->url] = true;

        $parser = $this->getParser();

        if ($parser === null) {
            break;
        }

        $parsed = true;
        $pageContent .= $parser->execute();

        $nextLink = $parser->findNextLink();

        if ($nextLink === null || $nextLink === '' || $recursionDepth >= $maxRecursions) {
            break;
        }

        $nextLink = Url::resolve($nextLink, $this->url);

        // Guard against pagination loops (a page linking to itself or to an earlier page)
        if (! is_string($nextLink) || $nextLink === '' || isset($visited[$nextLink])) {
            break;
        }

        $this->setUrl($nextLink);
        $recursionDepth++;
    }

    // When not a single page could be parsed, content stays empty and nothing is logged
    if ($parsed) {
        $this->content = $pageContent;
        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
    }
}
|
||||
|
|
Reference in a new issue