Commit b58a4f0b authored by Marcus Johansson

Issue #3447299 by Marcus_Johansson: Add a depth scraper to AI Interpolator

parent 151e2890
Pipeline #174187 passed with warnings
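In short, the change: moves the cool-down so it runs after the duplicate-link check instead of before it; replaces the depth-based save condition with a check that no startCrawl operations remain in the batch; strips hash links to their base URL instead of skipping them; resolves relative links against the parent page's directory; and raises the scraper's Guzzle timeouts from 30 to 60 seconds while passing HTTP error responses through.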
@@ -27,13 +27,12 @@ class DepthCrawler {
   * The context.
   */
  public static function startCrawl($entity, $link, array $config, $fieldDefinition, $mode, &$context) {
-    if (!empty($config['cool_down'])) {
-      // Milliseconds.
-      usleep($config['cool_down'] * 1000);
-    }
    if (!isset($context['results']['links_left'])) {
      $context['results']['links_left'] = $config['links_left'] ?? 1;
    }
+    if (!isset($context['results']['found_links'])) {
+      $context['results']['found_links'] = $config['found_links'] ?? [];
+    }
    $context['message'] = 'Crawling ' . $link;
    $context['results']['links_left']--;
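For orientation: Drupal's Batch API keeps $context['results'] alive across all operations in a batch, which is why the counters above are seeded lazily on first run. A minimal sketch of that pattern, with a hypothetical operation name that is not part of this commit:

function mymodule_crawl_step($item, array &$context) {
  // Hypothetical batch operation; $context['results'] persists between operations.
  if (!isset($context['results']['processed'])) {
    $context['results']['processed'] = 0;
  }
  $context['results']['processed']++;
  $context['message'] = 'Processing ' . $item;
}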
@@ -41,6 +40,11 @@ class DepthCrawler {
    if (in_array($link, $context['results']['found_links'] ?? [])) {
      return;
    }
+    if (!empty($config['cool_down'])) {
+      // Milliseconds.
+      usleep($config['cool_down'] * 1000);
+    }
    // Scrape the link.
    $options['useChrome'] = $config['use_chrome'];
    $options['waitForNetworkRequests'] = $config['wait_for_network'];
@@ -70,25 +74,8 @@ class DepthCrawler {
    if ($value) {
      $context['results']['found_texts'][] = $value;
    }
-    $context['results']['found_links'][] = $link;
-    // If we are at the end, save and return.
-    if ($config['depth'] == 0) {
-      if ($context['results']['links_left'] == 0) {
-        $saveTexts = [];
-        foreach ($context['results']['found_texts'] as $foundText) {
-          if ($mode == 'string') {
-            $saveTexts[] = $foundText;
-          }
-          else {
-            $saveTexts[] = ['value' => $foundText, 'format' => \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->getTextFormat($fieldDefinition)];
-          }
-        }
-        $entity->set($fieldDefinition->getName(), $saveTexts);
-        $entity->save();
-      }
-      return;
-    }
    // If only the body content is wanted, extract the body using a regex.
    if ($config['body_only']) {
      preg_match('/<body[^>]*>(.*?)<\/body>/is', $rawHtml, $body);
@@ -97,32 +84,62 @@ class DepthCrawler {
      }
    }
    // Parse the HTML, collecting links starting with http* or / using a regex.
+    $newOperations = [];
+    $batch = \batch_get();
    preg_match_all('/href=["\']?([^"\'>]+)["\']?/', $rawHtml, $matches);
    if (!empty($matches[1])) {
      $links = $matches[1];
      $links = \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->cleanLinks($links, $link, $config);
      $config['depth']--;
-      $batch = \batch_get();
      // If we have links, scrape them.
      $config['links_left'] = $context['results']['links_left'] + count($links);
      $config['found_links'] = $context['results']['found_links'];
      foreach ($links as $link) {
        // Get the extension if it has one.
        $extension = pathinfo($link, PATHINFO_EXTENSION);
        // If it is already among the found links, don't scrape it again.
        if (in_array($link, $config['found_links'])) {
          continue;
        }
        // If it has no extension, or if it is a web page, we scrape it.
        if (in_array($extension, ['html', 'htm', 'asp', 'php']) || empty($extension)) {
          // Add to the batch job.
          $context['results']['links_left']++;
-          $batch['operations'][] = [
+          $newOperations[] = [
            'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl',
            [$entity, $link, $config, $fieldDefinition, $mode],
          ];
        }
      }
-      if (!empty($batch['operations'])) {
+      if (!empty($newOperations)) {
+        $batch['operations'] = !empty($batch['operations']) ? array_merge_recursive($batch['operations'], $newOperations) : $newOperations;
        \batch_set($batch);
      }
    }
+    $jobsLeft = FALSE;
+    if (!empty($batch['operations'])) {
+      foreach ($batch['operations'] as $operation) {
+        if ($operation[0] == 'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl') {
+          $jobsLeft = TRUE;
+          break;
+        }
+      }
+    }
+    // If there are no crawl jobs left, save the collected texts.
+    if (!$jobsLeft) {
+      $saveTexts = [];
+      foreach ($context['results']['found_texts'] as $foundText) {
+        if ($mode == 'string') {
+          $saveTexts[] = $foundText;
+        }
+        else {
+          $saveTexts[] = ['value' => $foundText, 'format' => \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->getTextFormat($fieldDefinition)];
+        }
+      }
+      $entity->set($fieldDefinition->getName(), $saveTexts);
+      $entity->save();
+    }
  }
}
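Taken together: startCrawl queues a new startCrawl operation for every fresh link it finds, batch_set() appends those operations to the running batch, and the entity is only saved once no startCrawl operations remain. A rough sketch of how such a crawl might be kicked off, with config keys inferred from the code above (the kickoff code itself is hypothetical, not part of the module):

// Hypothetical kickoff; config keys mirror those read by startCrawl().
$config = [
  'depth' => 2,               // Levels of links to follow.
  'cool_down' => 500,         // Delay per request, in milliseconds.
  'use_chrome' => TRUE,       // Render with headless Chrome.
  'wait_for_network' => TRUE, // Wait for network requests to settle.
  'body_only' => TRUE,        // Keep only the <body> content.
];
batch_set([
  'title' => t('Depth crawling'),
  'operations' => [
    [
      'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl',
      [$entity, 'https://example.com/', $config, $fieldDefinition, 'string'],
    ],
  ],
]);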
@@ -104,7 +104,11 @@ class CrawlerHelper {
    foreach ($links as $link) {
      // If it has a hash fragment, strip the fragment and keep the base link.
      if (str_contains($link, '#')) {
-        continue;
+        // We keep the base link only.
+        $link = explode('#', trim($link))[0];
+        if (!$link) {
+          continue;
+        }
      }
      // If it is a mailto link, remove it.
      if (strpos($link, 'mailto:') === 0) {
@@ -160,6 +164,12 @@ class CrawlerHelper {
      }
      // If it's relative, but without a leading /, make it absolute.
      if (strpos($link, '/') !== 0 && strpos($link, 'http') !== 0) {
+        // If the parent link ends in a file, remove that segment.
+        $parts = explode('/', $parentLink);
+        array_pop($parts);
+        $parentLink = implode('/', $parts);
+        $parentLink = substr($parentLink, -1) == '/' ? $parentLink : $parentLink . '/';
+        // Then prepend the parent path.
        $link = $parentLink . $link;
      }
      // If only the host should be kept, remove the rest.
......
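The relative-path branch above resolves a link against its parent page's directory: drop the parent URL's file segment, ensure a trailing slash, then prepend. The same idea as a standalone sketch (a simplified stand-in, not the module's CrawlerHelper):

// Simplified illustration of the relative-link resolution above.
function resolve_relative(string $link, string $parentLink): string {
  // Absolute or root-relative links pass through untouched.
  if (strpos($link, 'http') === 0 || strpos($link, '/') === 0) {
    return $link;
  }
  // Keep only the directory part of the parent URL.
  $parts = explode('/', $parentLink);
  array_pop($parts);
  return rtrim(implode('/', $parts), '/') . '/' . $link;
}

// resolve_relative('page.html', 'https://example.com/docs/index.html')
// returns 'https://example.com/docs/page.html'.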
@@ -194,10 +194,10 @@ class ScrapingBot {
    $apiEndPoint .= count($query_string) ? '?' . http_build_query($query_string) : '';
-    // We can wait some.
-    $options['connect_timeout'] = 30;
-    $options['read_timeout'] = 30;
    // Don't let Guzzle die, just forward body and status.
    $options['http_errors'] = FALSE;
+    $options['connect_timeout'] = 60;
+    $options['read_timeout'] = 60;
+    $options['timeout'] = 60;
    // Headers.
    $options['auth'] = [
      $this->userName,
......
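With 'http_errors' => FALSE, Guzzle returns 4xx/5xx responses instead of throwing, so the caller can inspect and forward status and body itself; the 60-second limits give slow, JavaScript-heavy pages room to respond. A minimal sketch of a request built with these options ($userName and $password stand in for the class credentials):

// Sketch of a Guzzle request using the options set above.
$client = new \GuzzleHttp\Client();
$response = $client->request('GET', $apiEndPoint, [
  'connect_timeout' => 60,  // Seconds allowed to establish the connection.
  'read_timeout' => 60,     // Seconds allowed between stream reads.
  'timeout' => 60,          // Overall cap on the request, in seconds.
  'http_errors' => FALSE,   // Return error responses instead of throwing.
  'auth' => [$userName, $password],
]);
$status = $response->getStatusCode();
$body = (string) $response->getBody();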