Skip to content
Snippets Groups Projects
Commit 52d44fe8 authored by Alexander Hass's avatar Alexander Hass
Browse files

#380052: Add support for non-blocking parallel link checking.

parent fbb42a55
No related branches found
No related tags found
No related merge requests found
linkchecker 6.x-dev, nightly
----------------------------
* #380052: Add support for non-blocking parallel link checking.
* linkchecker_update_6211 was missing an update message.
* #1869924: Strict warning: Creating default object from empty value.
* #1867460: Prevent save on automatic updates, if content has not changed.
......
......@@ -123,11 +123,31 @@ function linkchecker_admin_settings_form(&$form_state) {
'#description' => t('If a filter has been enabled for an input format it runs first and afterwards the link extraction. This helps the link checker module to find all links normally created by custom filters (e.g. Markdown filter, Bbcode). All filters used as an inline references (e.g. Weblink filter <code>[link: id]</code>) to other content and filters only wasting processing time (e.g. Line break converter) should be disabled. This setting does not have any effect on how content is shown on a page. This feature optimizes the internal link extraction process for link checker and prevents false alarms about broken links in content not having the real data of a link.'),
);
$count_lids_enabled = db_result(db_query("SELECT count(lid) FROM {linkchecker_links} WHERE status = %d", 1));
$count_lids_disabled = db_result(db_query("SELECT count(lid) FROM {linkchecker_links} WHERE status = %d", 0));
$form['check'] = array(
'#type' => 'fieldset',
'#title' => t('Check settings'),
'#description' => t('For simultaneous link checks it is recommended to install the <a href="@httprl">HTTP Parallel Request & Threading Library</a>. This may be <strong>necessary</strong> on larger sites with very many links (30.000+), but will also improve overall link check duration on smaller sites. Currently the site has @count links (@count_enabled enabled / @count_disabled disabled).', array('@httprl' => 'http://drupal.org/project/httprl', '@count' => $count_lids_enabled+$count_lids_disabled, '@count_enabled' => $count_lids_enabled, '@count_disabled' => $count_lids_disabled)),
'#collapsible' => FALSE,
);
$form['check']['linkchecker_check_library'] = array(
'#type' => 'select',
'#title' => t('Check library'),
'#description' => t('Defines the library that is used for checking links.'),
'#default_value' => variable_get('linkchecker_check_library', 'core'),
'#options' => array(
'core' => t('Drupal core'),
'httprl' => t('HTTP Parallel Request Library'),
),
);
$form['check']['linkchecker_check_connections_max'] = array(
'#type' => 'select',
'#title' => t('Number of simultaneous connections'),
'#description' => t('Defines the maximum number of simultaneous connections that can be opened by the server. <em>HTTP Parallel Request & Threading Library</em> make sure that a single domain is not overloaded beyond RFC limits. For small hosting plans with very limited CPU and RAM it may be required to reduce the default limit.'),
'#default_value' => variable_get('linkchecker_check_connections_max', 8),
'#options' => drupal_map_assoc(array(2, 4, 8, 16, 24, 32, 48, 64, 96, 128)),
);
$form['check']['linkchecker_check_useragent'] = array(
'#type' => 'select',
'#title' => t('User-Agent'),
......
......@@ -32,6 +32,8 @@ function linkchecker_uninstall() {
variable_del('linkchecker_action_status_code_301');
variable_del('linkchecker_action_status_code_404');
// Delete the connection limit like every other setting; reading it with
// variable_get() would leave the variable behind after uninstall.
variable_del('linkchecker_check_connections_max');
variable_del('linkchecker_check_library');
variable_del('linkchecker_check_links_interval');
variable_del('linkchecker_check_useragent');
variable_del('linkchecker_cleanup_links_last');
......@@ -647,14 +649,14 @@ function linkchecker_update_6217() {
return $ret;
}
/**
 * Removed obsolete linkchecker_check_links_max variable.
 *
 * @return array
 *   A list with one result entry ('success' and 'query' keys) describing the
 *   removal.
 */
function linkchecker_update_6218() {
  $ret = array();

  // This function must be declared exactly once; a second declaration of the
  // same name is a fatal "Cannot redeclare" error in PHP.
  variable_del('linkchecker_check_links_max');
  $ret[] = array('success' => TRUE, 'query' => 'Removed obsolete linkchecker_check_links_max variable.');

  return $ret;
}
......@@ -171,12 +171,12 @@ function _linkchecker_link_access($link) {
function _linkchecker_link_node_ids($link, $node_author_account = NULL) {
static $fields_with_node_links = array();
// Exit if all node types are disabled or if the user cannot access content,
// there is no need to check further.
$linkchecker_scan_nodetypes = array_filter(variable_get('linkchecker_scan_nodetypes', array()));
if (empty($linkchecker_scan_nodetypes) || !user_access('access content')) {
return array();
}
// Exit if all node types are disabled or if the user cannot access content,
// there is no need to check further.
$linkchecker_scan_nodetypes = array_filter(variable_get('linkchecker_scan_nodetypes', array()));
if (empty($linkchecker_scan_nodetypes) || !user_access('access content')) {
return array();
}
// Disable language negotiation temporarily, re-enable it later.
if (module_exists('i18n')) {
......@@ -201,8 +201,8 @@ function _linkchecker_link_node_ids($link, $node_author_account = NULL) {
}
// Re-enable language negotiation.
if (module_exists('i18n')) {
i18n_selection_mode('reset');
if (module_exists('i18n')) {
i18n_selection_mode('reset');
}
// Check if the current user has access to view the link in each node.
......@@ -343,29 +343,68 @@ function _linkchecker_link_block_ids($link) {
* Implementation of hook_cron().
*/
function linkchecker_cron() {
  // Remove outdated links no longer in use once per day (86400 seconds).
  if (time() - variable_get('linkchecker_cleanup_links_last', 0) >= 86400) {
    _linkchecker_cleanup_links();
    variable_set('linkchecker_cleanup_links_last', time());
  }

  // Run link checker in a new process, independent of cron, when the HTTPRL
  // module is installed and selected as the check library.
  if (module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl') {
    // Setup callback options array; call _linkchecker_check_links() in the
    // background.
    $callback_options = array(array('function' => '_linkchecker_check_links'));
    // Queue up the request.
    httprl_queue_background_callback($callback_options);
    // Execute request.
    httprl_send_request();

    // Exit here so we don't call _linkchecker_check_links() in this process.
    return;
  }

  // Run the link checks the normal way. The execution time budget is read
  // inside _linkchecker_check_links(), so nothing needs to be computed here.
  _linkchecker_check_links();
}
/**
 * Run link checks.
 *
 * Loads the enabled links whose last check is older than the configured
 * interval and checks them, either one by one with drupal_http_request() or,
 * when the HTTPRL module is installed and selected, by queueing all requests
 * and sending them in one batch.
 *
 * @return bool
 *   FALSE if the lock could not be acquired because a check is already
 *   running, TRUE once this run has finished.
 */
function _linkchecker_check_links() {
  // Get max_execution_time from configuration, override 0 with 240 seconds.
  $max_execution_time = ini_get('max_execution_time') == 0 ? 240 : ini_get('max_execution_time');
  // Make sure we have enough time to validate all of the links.
  linkchecker_set_time_limit($max_execution_time);

  // Make sure this is the only process trying to run this function.
  if (!lock_acquire(__FUNCTION__, $max_execution_time)) {
    watchdog('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
    return FALSE;
  }

  $has_httprl = (module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl');

  // Do not confuse admins with a setting of maximum checkable links per cron
  // run and guess that 2 links can be checked per second with 1 thread, what is
  // nevertheless uncommon. The max_execution_time can be used to calculate
  // a useful value that is higher, but not totally out of scope and limits the
  // query resultset to a reasonable size.
  $linkchecker_check_connections_max = variable_get('linkchecker_check_connections_max', 8);
  $check_links_max_per_cron_run = ($has_httprl) ? ($linkchecker_check_connections_max * $max_execution_time) : $max_execution_time;
  $linkchecker_check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
  $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');

  // Connection limit can be overridden via settings.php. Two connections is the
  // limit defined in RFC http://www.ietf.org/rfc/rfc2616.txt. Modern browsers
  // are typically using 6-8 connections and no more. Never use more and keep
  // in mind that you can overload other people servers.
  $linkchecker_check_domain_connections = variable_get('linkchecker_check_domain_connections', 2);

  // Get URLs for checking.
  $links = db_query_range("SELECT * FROM {linkchecker_links} WHERE last_checked < %d AND status = %d ORDER BY last_checked, lid ASC", time() - $linkchecker_check_links_interval, 1, 0, $check_links_max_per_cron_run);

  // Number of requests handed to HTTPRL. Counting what was queued avoids
  // reading a row count from the result handle, which is not available on
  // every database backend.
  $links_queued = 0;

  while ($link = db_fetch_object($links)) {
    $headers = array();
    $headers['User-Agent'] = $linkchecker_check_useragent;
    // Range: only request the first 1024 bytes for GET requests. This is
    // required to prevent timeouts on URLs that are large downloads.
    if ($link->method == 'GET') {
      $headers['Range'] = 'bytes=0-1024';
    }

    // Add in the headers.
    $options = array(
      'headers' => $headers,
      'method' => $link->method,
      'max_redirects' => 0,
    );

    if ($has_httprl) {
      // Define the callback and add the $link object to it.
      // Notes:
      // - 'global_timeout' does not require a timer_read('page'), as this job
      //   runs in a new process, independent of cron.
      $options += array(
        'global_connections' => $linkchecker_check_connections_max,
        'global_timeout' => $max_execution_time - 30,
        'domain_connections' => $linkchecker_check_domain_connections,
        'callback' => array(
          array(
            'function' => '_linkchecker_status_handling',
          ),
          // The link needs to be passed here, otherwise it is not sent back
          // to _linkchecker_status_handling().
          $link,
        ),
      );

      // Queue up the request; it is sent after the loop.
      httprl_request($link->url, $options);
      $links_queued++;
    }
    else {
      $response = drupal_http_request($link->url, $options['headers'], $options['method'], NULL, $options['max_redirects']);
      _linkchecker_status_handling($response, $link);

      // Stop once we have used over half of the maximum execution time.
      if ((timer_read('page') / 1000) > ($max_execution_time / 2)) {
        break;
      }
    }
  }

  // After all links are queued, run the url checks in a single batch.
  if ($links_queued > 0) {
    httprl_send_request();
  }

  // Release the lock.
  lock_release(__FUNCTION__);
  watchdog('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);

  // Peak memory usage is only available in PHP >= 5.2.
  if (version_compare(phpversion(), '5.2.0', '>=')) {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array('@memory_get_peak_usage' => format_size(memory_get_peak_usage()), '@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
  }
  else {
    watchdog('linkchecker', 'Memory usage: @memory_get_usage.', array('@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
  }

  return TRUE;
}
/**
* Status code handling.
*
* @param object $response
* An object containing the HTTP request headers, response code, headers,
* data and redirect status.
* @param object $link
* An object containing the url, lid and fail_count.
*/
function _linkchecker_status_handling($link, $response) {
function _linkchecker_status_handling(&$response, $link) {
$ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
// - Prevent E_ALL warnings in DB updates for non-existing $response->error.
......@@ -411,6 +495,11 @@ function _linkchecker_status_handling($link, $response) {
}
switch ($response->code) {
case -4: // HTTPRL: httprl_send_request timed out.
// Skip these and try them again next cron run.
break;
case -2: // HTTPRL: maximum allowed redirects exhausted.
case 301:
db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->redirect_code, $response->status_message, time(), $link->lid);
......@@ -481,9 +570,9 @@ function _linkchecker_status_handling($link, $response) {
node_save($node);
watchdog('linkchecker', 'Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $row->nid), '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
}
else {
watchdog('linkchecker', 'Link update in node failed. Permanently moved link %src not found in node %node. Manual fix required.', array('%node' => url('node/' . $row->nid), '%src' => $link->url), WATCHDOG_WARNING);
}
else {
watchdog('linkchecker', 'Link update in node failed. Permanently moved link %src not found in node %node. Manual fix required.', array('%node' => url('node/' . $row->nid), '%src' => $link->url), WATCHDOG_WARNING);
}
}
// COMMENTS: Autorepair all comments having this outdated link.
......@@ -531,7 +620,7 @@ function _linkchecker_status_handling($link, $response) {
_linkchecker_link_replace($box[$text_item], $link->url, $response->redirect_url);
}
if ($box_original != $box) {
if ($box_original != $box) {
// Save changed box and update the box link list.
block_box_save($box, $row->bid);
// There is no hook that fires on block_box_save(), therefore do link
......@@ -539,9 +628,9 @@ function _linkchecker_status_handling($link, $response) {
_linkchecker_add_box_links($box, $row->bid);
watchdog('linkchecker', 'Changed permanently moved link in box %bid from %src to %dst.', array('%bid' => $row->bid, '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
}
else {
watchdog('linkchecker', 'Link update in block failed. Permanently moved link %src not found in block %bid. Manual fix required.', array('%bid' => $row->bid, '%src' => $link->url), WATCHDOG_WARNING);
}
else {
watchdog('linkchecker', 'Link update in block failed. Permanently moved link %src not found in block %bid. Manual fix required.', array('%bid' => $row->bid, '%src' => $link->url), WATCHDOG_WARNING);
}
}
// Revert user back to anonymous.
......@@ -597,6 +686,8 @@ function _linkchecker_status_handling($link, $response) {
//watchdog('linkchecker', 'Unhandled link error %link has been found.', array('%link' => $link->url), WATCHDOG_ERROR, l(t('Broken links'), 'admin/reports/linkchecker'));
}
}
// Free Memory.
$response = new StdClass();
}
function linkchecker_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
......@@ -1788,3 +1879,12 @@ function linkchecker_impersonate_user($new_user = NULL) {
function linkchecker_revert_user() {
  // Delegate to the impersonation helper without an account argument, so its
  // $new_user parameter takes the default value, and hand back its result.
  $result = linkchecker_impersonate_user();

  return $result;
}
/**
 * Backport of drupal_set_time_limit from Drupal 7.
 *
 * Attempts to set the PHP maximum execution time; does nothing when
 * set_time_limit() is not available.
 *
 * @param int $time_limit
 *   The value passed on to set_time_limit().
 */
function linkchecker_set_time_limit($time_limit) {
  // Nothing to do when the PHP function is unavailable.
  if (!function_exists('set_time_limit')) {
    return;
  }

  // Suppress warnings raised by set_time_limit() itself.
  @set_time_limit($time_limit);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment