Commit b58a4f0b authored by Marcus Johansson

Issue #3447299 by Marcus_Johansson: Add a depth scraper to AI Interpolator

parent 151e2890
Pipeline #174187 passed with warnings
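In short, the change: moves the cool-down so it runs after the duplicate-link check instead of before it; replaces the depth-based save condition with a check that no startCrawl operations remain in the batch; strips hash links to their base URL instead of skipping them; resolves relative links against the parent page's directory; and raises the scraper's Guzzle timeouts from 30 to 60 seconds while passing HTTP error responses through.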
@@ -27,13 +27,12 @@ class DepthCrawler {
   * The context.
   */
  public static function startCrawl($entity, $link, array $config, $fieldDefinition, $mode, &$context) {
-    if (!empty($config['cool_down'])) {
-      // Milliseconds.
-      usleep($config['cool_down'] * 1000);
-    }
    if (!isset($context['results']['links_left'])) {
      $context['results']['links_left'] = $config['links_left'] ?? 1;
    }
+    if (!isset($context['results']['found_links'])) {
+      $context['results']['found_links'] = $config['found_links'] ?? [];
+    }
    $context['message'] = 'Crawling ' . $link;
    $context['results']['links_left']--;
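For orientation: Drupal's Batch API keeps $context['results'] alive across all operations in a batch, which is why the counters above are seeded lazily on first run. A minimal sketch of that pattern, with a hypothetical operation name that is not part of this commit:

function mymodule_crawl_step($item, array &$context) {
  // Hypothetical batch operation; $context['results'] persists between operations.
  if (!isset($context['results']['processed'])) {
    $context['results']['processed'] = 0;
  }
  $context['results']['processed']++;
  $context['message'] = 'Processing ' . $item;
}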
@@ -41,6 +40,11 @@ class DepthCrawler {
    if (in_array($link, $context['results']['found_links'] ?? [])) {
      return;
    }
+    if (!empty($config['cool_down'])) {
+      // Milliseconds.
+      usleep($config['cool_down'] * 1000);
+    }
    // Scrape the link.
    $options['useChrome'] = $config['use_chrome'];
    $options['waitForNetworkRequests'] = $config['wait_for_network'];
@@ -70,25 +74,8 @@ class DepthCrawler {
    if ($value) {
      $context['results']['found_texts'][] = $value;
    }
-    $context['results']['found_links'][] = $link;
-    // If we are at the end, save and return.
-    if ($config['depth'] == 0) {
-      if ($context['results']['links_left'] == 0) {
-        $saveTexts = [];
-        foreach ($context['results']['found_texts'] as $foundText) {
-          if ($mode == 'string') {
-            $saveTexts[] = $foundText;
-          }
-          else {
-            $saveTexts[] = ['value' => $foundText, 'format' => \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->getTextFormat($fieldDefinition)];
-          }
-        }
-        $entity->set($fieldDefinition->getName(), $saveTexts);
-        $entity->save();
-      }
-      return;
-    }
    // If only the body content is wanted, extract the body using a regex.
    if ($config['body_only']) {
      preg_match('/<body[^>]*>(.*?)<\/body>/is', $rawHtml, $body);
@@ -97,32 +84,62 @@ class DepthCrawler {
      }
    }
    // Parse the HTML, collecting links starting with http* or / using a regex.
+    $newOperations = [];
+    $batch = \batch_get();
    preg_match_all('/href=["\']?([^"\'>]+)["\']?/', $rawHtml, $matches);
    if (!empty($matches[1])) {
      $links = $matches[1];
      $links = \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->cleanLinks($links, $link, $config);
      $config['depth']--;
-      $batch = \batch_get();
      // If we have links, scrape them.
      $config['links_left'] = $context['results']['links_left'] + count($links);
      $config['found_links'] = $context['results']['found_links'];
      foreach ($links as $link) {
        // Get the extension if it has one.
        $extension = pathinfo($link, PATHINFO_EXTENSION);
        // If it is already among the found links, don't scrape it again.
        if (in_array($link, $config['found_links'])) {
          continue;
        }
        // If it has no extension, or if it is a web page, we scrape it.
        if (in_array($extension, ['html', 'htm', 'asp', 'php']) || empty($extension)) {
          // Add to the batch job.
          $context['results']['links_left']++;
-          $batch['operations'][] = [
+          $newOperations[] = [
            'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl',
            [$entity, $link, $config, $fieldDefinition, $mode],
          ];
        }
      }
-      if (!empty($batch['operations'])) {
+      if (!empty($newOperations)) {
+        $batch['operations'] = !empty($batch['operations']) ? array_merge_recursive($batch['operations'], $newOperations) : $newOperations;
        \batch_set($batch);
      }
    }
+    $jobsLeft = FALSE;
+    if (!empty($batch['operations'])) {
+      foreach ($batch['operations'] as $operation) {
+        if ($operation[0] == 'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl') {
+          $jobsLeft = TRUE;
+          break;
+        }
+      }
+    }
+    // If there are no crawl jobs left, save the collected texts.
+    if (!$jobsLeft) {
+      $saveTexts = [];
+      foreach ($context['results']['found_texts'] as $foundText) {
+        if ($mode == 'string') {
+          $saveTexts[] = $foundText;
+        }
+        else {
+          $saveTexts[] = ['value' => $foundText, 'format' => \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->getTextFormat($fieldDefinition)];
+        }
+      }
+      $entity->set($fieldDefinition->getName(), $saveTexts);
+      $entity->save();
+    }
  }
}
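Taken together: startCrawl queues a new startCrawl operation for every fresh link it finds, batch_set() appends those operations to the running batch, and the entity is only saved once no startCrawl operations remain. A rough sketch of how such a crawl might be kicked off, with config keys inferred from the code above (the kickoff code itself is hypothetical, not part of the module):

// Hypothetical kickoff; config keys mirror those read by startCrawl().
$config = [
  'depth' => 2,               // Levels of links to follow.
  'cool_down' => 500,         // Delay per request, in milliseconds.
  'use_chrome' => TRUE,       // Render with headless Chrome.
  'wait_for_network' => TRUE, // Wait for network requests to settle.
  'body_only' => TRUE,        // Keep only the <body> content.
];
batch_set([
  'title' => t('Depth crawling'),
  'operations' => [
    [
      'Drupal\ai_interpolator_scraping_bot\Batch\DepthCrawler::startCrawl',
      [$entity, 'https://example.com/', $config, $fieldDefinition, 'string'],
    ],
  ],
]);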
@@ -104,7 +104,11 @@ class CrawlerHelper {
    foreach ($links as $link) {
      // If it has a hash fragment, strip the fragment and keep the base link.
      if (str_contains($link, '#')) {
-        continue;
+        // We keep the base link only.
+        $link = explode('#', trim($link))[0];
+        if (!$link) {
+          continue;
+        }
      }
      // If it is a mailto link, remove it.
      if (strpos($link, 'mailto:') === 0) {
@@ -160,6 +164,12 @@ class CrawlerHelper {
      }
      // If it's relative, but without a leading /, make it absolute.
      if (strpos($link, '/') !== 0 && strpos($link, 'http') !== 0) {
+        // If the parent link ends in a file, remove that segment.
+        $parts = explode('/', $parentLink);
+        array_pop($parts);
+        $parentLink = implode('/', $parts);
+        $parentLink = substr($parentLink, -1) == '/' ? $parentLink : $parentLink . '/';
+        // Then prepend the parent path.
        $link = $parentLink . $link;
      }
      // If only the host should be kept, remove the rest.
......
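The relative-path branch above resolves a link against its parent page's directory: drop the parent URL's file segment, ensure a trailing slash, then prepend. The same idea as a standalone sketch (a simplified stand-in, not the module's CrawlerHelper):

// Simplified illustration of the relative-link resolution above.
function resolve_relative(string $link, string $parentLink): string {
  // Absolute or root-relative links pass through untouched.
  if (strpos($link, 'http') === 0 || strpos($link, '/') === 0) {
    return $link;
  }
  // Keep only the directory part of the parent URL.
  $parts = explode('/', $parentLink);
  array_pop($parts);
  return rtrim(implode('/', $parts), '/') . '/' . $link;
}

// resolve_relative('page.html', 'https://example.com/docs/index.html')
// returns 'https://example.com/docs/page.html'.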
@@ -194,10 +194,10 @@ class ScrapingBot {
    $apiEndPoint .= count($query_string) ? '?' . http_build_query($query_string) : '';
-    // We can wait some.
-    $options['connect_timeout'] = 30;
-    $options['read_timeout'] = 30;
    // Don't let Guzzle die, just forward body and status.
    $options['http_errors'] = FALSE;
+    $options['connect_timeout'] = 60;
+    $options['read_timeout'] = 60;
+    $options['timeout'] = 60;
    // Headers.
    $options['auth'] = [
      $this->userName,
......
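With 'http_errors' => FALSE, Guzzle returns 4xx/5xx responses instead of throwing, so the caller can inspect and forward status and body itself; the 60-second limits give slow, JavaScript-heavy pages room to respond. A minimal sketch of a request built with these options ($userName and $password stand in for the class credentials):

// Sketch of a Guzzle request using the options set above.
$client = new \GuzzleHttp\Client();
$response = $client->request('GET', $apiEndPoint, [
  'connect_timeout' => 60,  // Seconds allowed to establish the connection.
  'read_timeout' => 60,     // Seconds allowed between stream reads.
  'timeout' => 60,          // Overall cap on the request, in seconds.
  'http_errors' => FALSE,   // Return error responses instead of throwing.
  'auth' => [$userName, $password],
]);
$status = $response->getStatusCode();
$body = (string) $response->getBody();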