Commit 3659d4ee authored by Marcus Johansson
Issue #3447132 by Marcus_Johansson: Add Link to Link scraper

parent dc93e691
Pipeline #173200 passed with warnings
<?php
namespace Drupal\ai_interpolator_scraping_bot\Batch;
/**
* The Link Crawler for batch processing.
*/
class LinkCrawler {
/**
* Start the crawl.
*
* @param object $entity
* The entity.
* @param string $link
* The link to crawl.
* @param array $config
* The config.
* @param object $fieldDefinition
* The field definition.
* @param array $context
* The context.
*/
public static function startCrawl($entity, $link, array $config, $fieldDefinition, &$context) {
if (!empty($config['cool_down'])) {
// Milliseconds.
usleep($config['cool_down'] * 1000);
}
if (!isset($context['results']['links_left'])) {
$context['results']['links_left'] = $config['links_left'] ?? 1;
}
$context['message'] = 'Crawling ' . $link;
$context['results']['links_left']--;
// If we have already scraped this link, return.
if (in_array($link, $context['results']['found_links'] ?? [])) {
return;
}
// Scrape the link.
$options['useChrome'] = $config['use_chrome'];
$options['waitForNetworkRequests'] = $config['wait_for_network'];
$options['proxyCountry'] = $config['proxy_country'];
$options['premiumProxy'] = $config['use_premium_proxy'];
$rawHtml = \Drupal::service('ai_interpolator_scraping_bot.api')->scrapeRaw($link, $options);
// If we are at the end, save and return.
if ($config['depth'] == 0) {
if ($context['results']['links_left'] == 0) {
$saveLinks = [];
foreach ($context['results']['found_links'] ?? [] as $foundLink) {
$saveLinks[] = ['uri' => $foundLink];
}
$entity->set($fieldDefinition->getName(), $saveLinks);
$entity->save();
}
return;
}
// If only the body should be used, extract it with a regex.
if ($config['body_only']) {
preg_match('/<body[^>]*>(.*?)<\/body>/is', $rawHtml, $body);
if (!empty($body[1])) {
$rawHtml = $body[1];
}
}
// Parse the HTML, collecting all href values with a regex.
preg_match_all('/href=["\']?([^"\'>]+)["\']?/', $rawHtml, $matches);
if (!empty($matches[1])) {
$links = $matches[1];
$links = \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->cleanLinks($links, $link, $config);
$config['depth']--;
$batch = \batch_get();
// Update the number of links left to process.
$config['links_left'] = $context['results']['links_left'] + count($links);
// Get the file extensions that should be saved to the field.
$formats = \Drupal::service('ai_interpolator_scraping_bot.crawler_helper')->getFormats($config);
foreach ($links as $link) {
// Get the extension if it has one.
$extension = pathinfo($link, PATHINFO_EXTENSION);
// Check if we should save the link.
if (in_array($extension, $formats) && !in_array($link, $context['results']['found_links'] ?? [])) {
$context['results']['found_links'][] = $link;
}
// If it has no extension or if it is a web page, we scrape it.
if (in_array($extension, ['html', 'htm', 'asp', 'php']) || empty($extension)) {
// Add to the batch job.
$context['results']['links_left']++;
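// The Batch API appends the live batch $context as the last argument when it runs startCrawl(), so it is not passed here.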
$batch['operations'][] = [
'Drupal\ai_interpolator_scraping_bot\Batch\LinkCrawler::startCrawl',
[$entity, $link, $config, $fieldDefinition],
];
}
}
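// Calling batch_set() while a batch is running adds these operations to the currently executing batch.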
if (!empty($batch['operations'])) {
\batch_set($batch);
}
}
}
}
<?php
namespace Drupal\ai_interpolator_scraping_bot;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Field\FieldDefinitionInterface;
/**
* The crawler helper.
*/
class CrawlerHelper {
/**
* The entity type manager.
*
* @var \Drupal\Core\Entity\EntityTypeManagerInterface
*/
protected $entityTypeManager;
/**
* Construct the crawler helper.
*
* @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
* The entity type manager.
*/
public function __construct(EntityTypeManagerInterface $entityTypeManager) {
$this->entityTypeManager = $entityTypeManager;
}
/**
* Crude simple DOM traverser.
*
* @todo Use an actual traversal library.
*
* @param string $html
* The html.
* @param string $tag
* The tag to get.
* @param string $remove
* The tags to remove.
*
* @return string
* The cut out html.
*/
public function getPartial($html, $tag = 'body', $remove = "") {
// Load the whole HTML, silencing libxml warnings on real-world markup.
$dom = new \DOMDocument();
libxml_use_internal_errors(TRUE);
$dom->loadHTML($html);
libxml_clear_errors();
// Create a new document for the extracted content.
$mock = new \DOMDocument();
// Split off an optional class ('tag.class') or id ('tag#id') selector.
$parts = explode('.', $tag);
$tag = isset($parts[1]) ? $parts[0] : $tag;
$class = $parts[1] ?? '';
$parts = explode('#', $tag);
$tag = isset($parts[1]) ? $parts[0] : $tag;
$id = $parts[1] ?? '';
// Remove unwanted tags.
foreach (explode("\n", $remove) as $tagRemove) {
$tagRemove = trim($tagRemove);
if (!$tagRemove) {
continue;
}
$removals = $dom->getElementsByTagName($tagRemove);
// Iterate backwards, since the node list is live and shrinks on removal.
for ($t = $removals->count() - 1; $t >= 0; $t--) {
$removals->item($t)->parentNode->removeChild($removals->item($t));
}
}
// Get the rest.
$tags = $dom->getElementsByTagName($tag);
for ($t = 0; $t < $tags->count(); $t++) {
/** @var \DOMElement $element */
$element = $tags->item($t);
if ($class && $element->getAttribute('class') != $class) {
continue;
}
if ($id && $element->getAttribute('id') != $id) {
continue;
}
foreach ($element->childNodes as $child) {
$mock->appendChild($mock->importNode($child, TRUE));
}
}
return $mock->saveHTML();
}
/**
* Cleanup links.
*
* @param array $links
* The links to clean.
* @param string $parentLink
* The parent link.
* @param array $config
* The config.
*
* @return array
* The cleaned links.
*/
public function cleanLinks($links, $parentLink, $config) {
$cleaned = [];
// Get the parent host and protocol as one string.
$parentHost = parse_url($parentLink, PHP_URL_SCHEME) . '://' . parse_url($parentLink, PHP_URL_HOST);
foreach ($links as $link) {
// Skip in-page anchors.
if (strpos($link, '#') === 0) {
continue;
}
// Skip non-web schemes such as mailto:, tel:, javascript: or data:.
$skipSchemes = [
'mailto:',
'tel:',
'javascript:',
'data:',
'ftp:',
'skype:',
'callto:',
'sms:',
'whatsapp:',
'viber:',
'facetime:',
'facetime-audio:',
];
foreach ($skipSchemes as $scheme) {
if (strpos($link, $scheme) === 0) {
continue 2;
}
}
// If it's a root-relative link starting with /, prefix the parent host.
if (strpos($link, '/') === 0) {
$link = $parentHost . $link;
}
// If it's relative without a leading slash, prefix the full parent link.
if (strpos($link, '/') !== 0 && strpos($link, 'http') !== 0) {
$link = $parentLink . $link;
}
// If only links on the parent host are wanted, skip external ones.
if ($config['host_only'] && strpos($link, $parentHost) !== 0) {
continue;
}
// If the link is within the list of excluded pages, skip it.
foreach (explode(',', $config['exclude_pages']) as $exclude) {
$abs = $parentHost . '/' . trim($exclude);
if ($abs == $link || $abs . '/' == $link) {
continue 2;
}
}
$cleaned[$link] = $link;
}
return array_values($cleaned);
}
/**
* Get all the possible formats to scrape.
*
* @param array $config
* The config.
*
* @return array
* The formats.
*/
public function getFormats($config) {
$formats = [];
$scrapeFormats = $config['types_to_scrape'];
foreach ($scrapeFormats as $scrapeFormat => $set) {
if (!$set) {
continue;
}
switch ($scrapeFormat) {
case 'webpages':
$formats = array_merge($formats, ['html', 'htm', 'asp', 'php', '']);
break;
case 'images':
$formats = array_merge($formats, ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp']);
break;
case 'pdfs':
$formats = array_merge($formats, ['pdf']);
break;
case 'docs':
$formats = array_merge($formats, ['doc', 'docx']);
break;
case 'videos':
$formats = array_merge($formats, ['mp4', 'avi', 'mov', 'webm', 'mkv', 'flv']);
break;
case 'audios':
$formats = array_merge($formats, ['mp3', 'wav', 'flac', 'ogg', 'm4a', 'wma', 'aac']);
break;
case 'archives':
$formats = array_merge($formats, ['zip', 'rar', '7z']);
break;
case 'scripts':
$formats = array_merge($formats, ['js', 'css']);
break;
case 'others':
// Explode the other formats.
$otherFormats = explode(',', $config['other_formats']);
foreach ($otherFormats as $format) {
$formats[] = trim($format);
}
break;
}
}
return $formats;
}
/**
* Get text format.
*
* @param \Drupal\Core\Field\FieldDefinitionInterface $fieldDefinition
* The field definition.
*
* @return string|null
* The format.
*/
public function getTextFormat(FieldDefinitionInterface $fieldDefinition) {
$allFormats = $this->entityTypeManager->getStorage('filter_format')->loadMultiple();
// Maybe no formats are set.
if (empty($allFormats)) {
return NULL;
}
$format = $fieldDefinition->getSetting('allowed_formats');
return $format[0] ?? key($allFormats);
}
}
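For reference, a minimal usage sketch of the helper above (illustrative only; $html, $hrefs and the config values shown are assumptions, the service name matches the calls elsewhere in this commit):
// Minimal usage sketch. Assumes $html and $hrefs were collected elsewhere.
$helper = \Drupal::service('ai_interpolator_scraping_bot.crawler_helper');
// getPartial() takes 'tag', 'tag.class' or 'tag#id' as the selector, plus a
// newline-separated list of tag names to strip before extraction.
$mainHtml = $helper->getPartial($html, 'div.content', "script\nstyle");
// cleanLinks() resolves relative hrefs against the parent link, de-duplicates
// them and drops anchors and non-web schemes such as mailto: or tel:.
$links = $helper->cleanLinks($hrefs, 'https://www.example.com/page', [
'host_only' => TRUE,
'exclude_pages' => 'login, register',
]);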
<?php
namespace Drupal\ai_interpolator_scraping_bot\Plugin\AiInterPolatorFieldRules;
use Drupal\ai_interpolator\Annotation\AiInterpolatorFieldRule;
use Drupal\ai_interpolator\PluginInterfaces\AiInterpolatorFieldRuleInterface;
use Drupal\ai_interpolator_scraping_bot\CrawlerHelper;
use Drupal\ai_interpolator_scraping_bot\ScrapingBot;
use Drupal\Core\Entity\ContentEntityInterface;
use Drupal\Core\Field\FieldDefinitionInterface;
use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
use Symfony\Component\DependencyInjection\ContainerInterface;
/**
* The rule to get links from a link.
*
* @AiInterpolatorFieldRule(
* id = "ai_interpolator_scraping_bot_link",
* title = @Translation("Author Crawler"),
* field_rule = "link"
* )
*/
class LinkCrawler extends AiInterpolatorFieldRule implements AiInterpolatorFieldRuleInterface, ContainerFactoryPluginInterface {
/**
* ScrapingBot API Caller.
*/
private ScrapingBot $scrapingBot;
/**
* The Crawler Helper.
*/
private CrawlerHelper $crawlerHelper;
/**
* The links found so far, so it doesn't rerun links.
*/
private array $foundLinks = [];
/**
* We need the Interpolator config globally.
*/
private array $interpolatorConfig;
/**
* We need crawler options globally.
*/
private array $options;
/**
* Construct a link crawler rule.
*
* @param array $configuration
* Inherited configuration.
* @param string $plugin_id
* Inherited plugin id.
* @param mixed $plugin_definition
* Inherited plugin definition.
* @param \Drupal\ai_interpolator_scraping_bot\ScrapingBot $scrapingBot
* The ScrapingBot requester.
* @param \Drupal\ai_interpolator_scraping_bot\CrawlerHelper $crawlerHelper
* The Crawler Helper.
*/
public function __construct(array $configuration, $plugin_id, $plugin_definition, ScrapingBot $scrapingBot, CrawlerHelper $crawlerHelper) {
parent::__construct($configuration, $plugin_id, $plugin_definition);
$this->scrapingBot = $scrapingBot;
$this->crawlerHelper = $crawlerHelper;
}
/**
* {@inheritDoc}
*/
public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) {
return new static(
$configuration,
$plugin_id,
$plugin_definition,
$container->get('ai_interpolator_scraping_bot.api'),
$container->get('ai_interpolator_scraping_bot.crawler_helper')
);
}
/**
* {@inheritDoc}
*/
public $title = 'Scrapingbot Link Crawler';
/**
* {@inheritDoc}
*/
public function needsPrompt() {
return FALSE;
}
/**
* {@inheritDoc}
*/
public function advancedMode() {
return FALSE;
}
/**
* {@inheritDoc}
*/
public function placeholderText() {
return "";
}
/**
* {@inheritDoc}
*/
public function allowedInputs() {
return ['link'];
}
/**
* {@inheritDoc}
*/
public function extraAdvancedFormFields(ContentEntityInterface $entity, FieldDefinitionInterface $fieldDefinition) {
$form['interpolator_depth'] = [
'#type' => 'number',
'#title' => $this->t('Depth'),
'#description' => $this->t('How many levels deep should the crawler go.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_depth', 1),
'#weight' => -20,
];
$form['interpolator_include_source_url'] = [
'#type' => 'checkbox',
'#title' => $this->t('Include Source URL'),
'#description' => $this->t('Include the source URL as one of the links.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_include_source_url', TRUE),
'#weight' => -19,
];
$form['interpolator_host_only'] = [
'#type' => 'checkbox',
'#title' => $this->t('Host Only'),
'#description' => $this->t('Only crawl the host of the base link. DO NOT UNCHECK THIS UNLESS YOU KNOW WHAT YOU ARE DOING.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_host_only', TRUE),
'#weight' => -19,
];
$form['interpolator_body_only'] = [
'#type' => 'checkbox',
'#title' => $this->t('Body Only'),
'#description' => $this->t('Only crawl the body of the base link.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_body_only', TRUE),
'#weight' => -19,
];
$defaultPages = [
'privacy',
'privacy-policy',
'privacy_policy',
'terms',
'terms-of-service',
'terms_of_service',
'terms-and-conditions',
'terms_and_conditions',
'disclaimers',
'disclaimer',
'cookies',
'cookie-policy',
'cookie_policy',
'login',
'register',
];
$form['interpolator_exclude_pages'] = [
'#type' => 'textarea',
'#title' => $this->t('Exclude Pages'),
'#description' => $this->t('Comma separated list of pages to exclude.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_exclude_pages', implode(', ', $defaultPages)),
'#weight' => -19,
];
$form['interpolator_types_to_scrape'] = [
'#type' => 'checkboxes',
'#title' => $this->t('Types to Scrape'),
'#description' => $this->t('What types of links should be scraped.'),
'#options' => [
'webpages' => 'Webpages (/, html, htm, asp, php)',
'images' => 'LINKED Images (jpg, jpeg, png, gif)',
'pdfs' => 'PDFs',
'docs' => 'Docs (doc, docx)',
'videos' => 'Videos (mp4, avi, mov)',
'audios' => 'Audios (mp3, wav)',
'archives' => 'Archives (zip, rar, 7z)',
'scripts' => 'Scripts (js, css)',
'others' => 'Others',
],
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_types_to_scrape', ['webpages']),
'#weight' => -18,
];
$form['interpolator_use_chrome'] = [
'#type' => 'checkbox',
'#title' => $this->t('Use Chrome'),
'#description' => $this->t("Use Chrome when scraping"),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_use_chrome', TRUE),
'#weight' => -15,
];
$form['interpolator_wait_for_network'] = [
'#type' => 'checkbox',
'#title' => $this->t('Wait for network'),
'#description' => $this->t("Check if you want to wait for most ajax requests to finish until returning the Html content. This can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example."),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_wait_for_network', FALSE),
'#weight' => -14,
];
$form['interpolator_proxy_country'] = [
'#type' => 'select',
'#options' => ScrapingBot::$proxyCountries,
'#title' => $this->t('Proxy Country'),
'#description' => $this->t("Where should the scraping take place."),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_proxy_country', 'US'),
'#weight' => -13,
];
$form['interpolator_use_premium_proxy'] = [
'#type' => 'checkbox',
'#title' => $this->t('Use Premium Proxy'),
'#description' => $this->t("Use Premium Proxy when scraping. This is VERY expensive."),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_use_premium_proxy', FALSE),
'#weight' => -12,
];
$form['interpolator_cool_down'] = [
'#type' => 'number',
'#title' => $this->t('Cool Down'),
'#description' => $this->t('How many milliseconds to wait between each request. Don\'t take down websites by spamming them.'),
'#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_cool_down', 500),
'#weight' => -11,
];
return $form;
}
/**
* {@inheritDoc}
*/
public function generate(ContentEntityInterface $entity, FieldDefinitionInterface $fieldDefinition, array $interpolatorConfig) {
// Set the config.
$this->interpolatorConfig = $interpolatorConfig;
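// The keys used below ('use_chrome', 'depth', 'cool_down', ...) correspond to the 'interpolator_*' form fields above with the prefix removed.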
// Set options.
$options['useChrome'] = $interpolatorConfig['use_chrome'];
$options['waitForNetworkRequests'] = $interpolatorConfig['wait_for_network'];
$options['proxyCountry'] = $interpolatorConfig['proxy_country'];
$options['premiumProxy'] = $interpolatorConfig['use_premium_proxy'];
// Set the options globally.
$this->options = $options;
// In batch mode, get the batch once, so operations for every link are kept.
if ($interpolatorConfig['worker_type'] == 'batch') {
$batch = \batch_get();
}
// Take all input links.
foreach ($entity->{$interpolatorConfig['base_field']} as $link) {
// A link is found.
if (!empty($link->uri)) {
// If it's batch mode, queue a batch operation for the link.
if ($interpolatorConfig['worker_type'] == 'batch') {
$batch['operations'][] = [
'Drupal\ai_interpolator_scraping_bot\Batch\LinkCrawler::startCrawl',
[$entity, $link->uri, $interpolatorConfig, $fieldDefinition],
];
} else {
$this->scrapeLink($link->uri, $interpolatorConfig['depth']);
}
}
}
if ($interpolatorConfig['worker_type'] == 'batch' && !empty($batch)) {
\batch_set($batch);
return [];
} else {
return $this->foundLinks;
}
}
/**
* {@inheritDoc}
*/
public function verifyValue(ContentEntityInterface $entity, $value, FieldDefinitionInterface $fieldDefinition) {
// Has to have a link and be valid.
if (empty($value) || !filter_var($value, FILTER_VALIDATE_URL)) {
return FALSE;
}
// Otherwise it is ok.
return TRUE;
}
/**
* {@inheritDoc}
*/
public function storeValues(ContentEntityInterface $entity, array $values, FieldDefinitionInterface $fieldDefinition) {
$config = $fieldDefinition->getConfig($entity->bundle())->getSettings();
foreach ($values as $key => $value) {
$new['uri'] = $value;
if ($config['title'] == 0) {
$new['title'] = '';
}
$values[$key] = $new;
}
$entity->set($fieldDefinition->getName(), $values);
}
/**
* Recursive function to scrape links.
*
* @param string $link
* The link to scrape.
* @param int $depth
* The current depth.
*/
private function scrapeLink($link, $depth) {
if (!empty($this->interpolatorConfig['cool_down'])) {
// Milliseconds.
usleep($this->interpolatorConfig['cool_down'] * 1000);
}
// If we have already scraped this link, return.
if (in_array($link, $this->foundLinks)) {
return;
}
// Scrape the link.
$rawHtml = $this->scrapingBot->scrapeRaw($link, $this->options);
// If we are at the end, return.
if ($depth == 0) {
return;
}
// If only the body should be used, extract it with a regex.
if ($this->interpolatorConfig['body_only']) {
preg_match('/<body[^>]*>(.*?)<\/body>/is', $rawHtml, $body);
if (!empty($body[1])) {
$rawHtml = $body[1];
}
}
// Parse the HTML, collecting all href values with a regex.
preg_match_all('/href=["\']?([^"\'>]+)["\']?/', $rawHtml, $matches);
if (!empty($matches[1])) {
$links = $matches[1];
$links = $this->crawlerHelper->cleanLinks($links, $link, $this->interpolatorConfig);
// Get the file extensions that should be saved to the field.
$formats = $this->crawlerHelper->getFormats($this->interpolatorConfig);
foreach ($links as $link) {
// Get the extension if it has one.
$extension = pathinfo($link, PATHINFO_EXTENSION);
// Check if we should save the link.
if (in_array($extension, $formats) && !in_array($link, $this->foundLinks)) {
$this->foundLinks[] = $link;
}
// If it has no extension or if it is a web page, we scrape it.
if (in_array($extension, ['html', 'htm', 'asp', 'php']) || empty($extension)) {
$this->scrapeLink($link, $depth - 1);
}
}
}
}
}