<?php

/**
 * @file
 * Sitemap generation and rebuilding functions for the xmlsitemap module.
 *
 * @ingroup xmlsitemap
 */

/**
 * Look up the URL alias for an internal Drupal path.
 *
 * This is similar to drupal_get_path_alias(), but all aliases of a language
 * are fetched with a single query and kept in a static cache, so that sitemap
 * generation does not run one query per link. Only the LANGUAGE_NONE aliases
 * and the aliases of the most recently requested language are held at once.
 *
 * @param string $path
 *   An internal Drupal path.
 * @param string $language
 *   A language code to use when looking up the paths.
 *
 * @return string
 *   The cached alias for the path if there is one, otherwise the path as
 *   given; in both cases after hook_url_outbound_alter() has been invoked.
 */
function xmlsitemap_get_path_alias($path, $language) {
  static $aliases;
  static $last_language;

  $alias_sql = "SELECT source, alias FROM {url_alias} WHERE language = :language ORDER BY pid";

  // Language-neutral aliases are loaded once and kept for later calls.
  if (!isset($aliases)) {
    $aliases[LANGUAGE_NONE] = db_query($alias_sql, array(':language' => LANGUAGE_NONE))->fetchAllKeyed();
  }

  // When a different language is requested, drop the previous language's
  // aliases and load the new ones.
  $language_changed = ($language != LANGUAGE_NONE && $last_language != $language);
  if ($language_changed) {
    unset($aliases[$last_language]);
    $aliases[$language] = db_query($alias_sql, array(':language' => $language))->fetchAllKeyed();
    $last_language = $language;
  }

  // We need to pass our path through hook_url_outbound_alter(). This fixes
  // clean URLs not working when they don't exist in the {url_alias} table and
  // are created with something like subpathauto.
  $outbound_path = $path;

  // hook_url_outbound_alter() expects defaults in url() options.
  $options = array(
    'fragment' => '',
    'query' => array(),
    'absolute' => FALSE,
    'alias' => FALSE,
    'prefix' => '',
    'external' => FALSE,
  );

  // A language-specific alias wins over a language-neutral one.
  $langcodes = ($language != LANGUAGE_NONE) ? array($language, LANGUAGE_NONE) : array(LANGUAGE_NONE);
  foreach ($langcodes as $langcode) {
    if (isset($aliases[$langcode][$path])) {
      $outbound_path = $aliases[$langcode][$path];
      $options['alias'] = TRUE;
      break;
    }
  }

  $original_path = $outbound_path;
  drupal_alter('url_outbound', $outbound_path, $options, $original_path);
  return $outbound_path;
}

/**
 * Prepare the environment before the sitemap files are regenerated.
 *
 * Tries to raise the PHP memory limit and, when the
 * 'xmlsitemap_developer_mode' variable is enabled, logs the current peak
 * memory usage as a debug message.
 */
function _xmlsitemap_regenerate_before() {
  // Attempt to increase the memory limit.
  _xmlsitemap_set_memory_limit();

  // The log entry below is only written in developer mode.
  if (!variable_get('xmlsitemap_developer_mode', 0)) {
    return;
  }

  $placeholders = array(
    '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
  );
  watchdog('xmlsitemap', 'Starting XML sitemap generation. Memory usage: @memory-peak.', $placeholders, WATCHDOG_DEBUG);
}

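/**
 * Get the growth in peak memory usage since a baseline was recorded.
 *
 * @param bool $start
 *   TRUE to reset the baseline to the current peak memory usage.
 *
 * @return int
 *   Bytes of peak memory growth since the baseline; 0 on the first call.
 */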
function _xmlsitemap_get_memory_usage($start = FALSE) {
  // The baseline survives between calls so later calls can report the growth
  // in peak memory relative to it.
  static $baseline = NULL;

  $peak = memory_get_peak_usage(TRUE);

  // Record the baseline on the first call, or again whenever the caller
  // explicitly asks for a restart.
  if ($start || $baseline === NULL) {
    $baseline = $peak;
  }

  return $peak - $baseline;
}

/**
 * Estimate the PHP memory limit needed for sitemap generation.
 *
 * This is only a guess: it starts from Drupal's minimum memory limit, adds an
 * allowance per link in a chunk and, when alias prefetching is enabled, an
 * allowance per row in {url_alias}. It does not take the currently loaded
 * modules into account. The result is cached with drupal_static().
 *
 * @return int|float
 *   The estimated memory limit in bytes.
 */
function _xmlsitemap_get_optimal_memory_limit() {
  $estimate = &drupal_static(__FUNCTION__);

  // Reuse the value computed earlier in this request.
  if (isset($estimate)) {
    return $estimate;
  }

  // Set the base memory amount from the provided core constant.
  $estimate = parse_size(DRUPAL_MINIMUM_PHP_MEMORY_LIMIT);

  // Add memory based on the chunk size.
  $estimate += xmlsitemap_get_chunk_size() * 500;

  // Add memory for storing the url aliases.
  if (variable_get('xmlsitemap_prefetch_aliases', 1)) {
    $alias_count = db_query("SELECT COUNT(pid) FROM {url_alias}")->fetchField();
    $estimate += $alias_count * 250;
  }

  return $estimate;
}

/**
 * Raise the PHP memory limit for sitemap generation if it is too low.
 *
 * The limit is only ever increased: nothing happens when PHP has no memory
 * limit (-1), when the current limit cannot be read, or when the current
 * limit is already at least as large as the requested one.
 *
 * @param int|null $new_limit
 *   An optional PHP memory limit in bytes. If not provided, the value of
 *   _xmlsitemap_get_optimal_memory_limit() will be used.
 *
 * @return string|false|null
 *   The result of ini_set() (the previous limit, or FALSE on failure) when a
 *   change was attempted, otherwise NULL.
 */
function _xmlsitemap_set_memory_limit($new_limit = NULL) {
  // ini_get()/ini_set() may be disabled on some hosts; this is best-effort,
  // so warnings are deliberately suppressed.
  $current_limit = @ini_get('memory_limit');
  if (!$current_limit || $current_limit == -1) {
    return NULL;
  }

  // Fall back to the estimated optimum only when the caller gave no limit.
  // (The check used to be inverted, which discarded a supplied limit and left
  // the default as NULL so the memory limit was never raised.)
  if (is_null($new_limit)) {
    $new_limit = _xmlsitemap_get_optimal_memory_limit();
  }

  if (parse_size($current_limit) < $new_limit) {
    return @ini_set('memory_limit', $new_limit);
  }

  return NULL;
}

/**
 * Generate one page (chunk) of the sitemap.
 *
 * Any exception raised while writing is logged through watchdog_exception()
 * and then re-thrown to the caller.
 *
 * @param object $sitemap
 *   An XML sitemap object.
 * @param int $page
 *   The number of the specific sitemap page to generate.
 *
 * @return mixed
 *   The element count reported by the writer for this page.
 */
function xmlsitemap_generate_page(stdClass $sitemap, $page) {
  try {
    $page_writer = new XMLSitemapWriter($sitemap, $page);
    $page_writer->startDocument();
    $page_writer->generateXML();
    $page_writer->endDocument();
  }
  catch (Exception $exception) {
    // Record the failure, then let the caller deal with it.
    watchdog_exception('xmlsitemap', $exception);
    throw $exception;
  }

  $element_count = $page_writer->getSitemapElementCount();
  return $element_count;
}

/**
 * Write one chunk of sitemap links to an XML sitemap writer.
 *
 * Selects the visible links (access and status both 1) of the requested chunk
 * from the {xmlsitemap} table, builds the URL of each link and hands the
 * resulting element to the writer. A link whose URL equals the URL of the
 * link processed just before it is skipped as a duplicate.
 *
 * @param object $sitemap
 *   The XML sitemap object; its uri['options'] seed the url() options.
 * @param XMLSitemapWriter $writer
 *   The writer that receives every 'url' element.
 * @param int $chunk
 *   The number of the chunk to generate. Values below 1 are treated as the
 *   first chunk.
 *
 * @return int
 *   The number of non-duplicate links processed for this chunk.
 */
function xmlsitemap_generate_chunk(stdClass $sitemap, XMLSitemapWriter $writer, $chunk) {
  global $base_url;
  $output_elements = drupal_map_assoc(variable_get('xmlsitemap_output_elements', array(
    'lastmod',
    'changefreq',
    'priority',
  )));
  $lastmod_format = variable_get('xmlsitemap_lastmod_format', XMLSITEMAP_LASTMOD_MEDIUM);

  // The sitemap's own options take precedence; the array union only fills in
  // keys that are still missing.
  $url_options = $sitemap->uri['options'];
  $url_options += array(
    'absolute' => TRUE,
    'base_url' => variable_get('xmlsitemap_base_url', $base_url),
    'language' => language_default(),
    'alias' => variable_get('xmlsitemap_prefetch_aliases', TRUE),
  );

  // Decide once whether redirected links should be skipped. The lookup
  // function is only called when it is actually available, so a leftover
  // 'xmlsitemap_redirect' setting cannot cause a fatal error.
  $skip_redirects = variable_get('xmlsitemap_redirect') && function_exists('redirect_fetch_rids_by_path');

  $last_url = '';
  $link_count = 0;

  $query = db_select('xmlsitemap', 'x');
  $query->fields('x', array(
    'id',
    'type',
    'subtype',
    'loc',
    'lastmod',
    'changefreq',
    'changecount',
    'priority',
    'language',
    'access',
    'status',
  ));
  $query->condition('x.access', 1);
  $query->condition('x.status', 1);
  // Ordering by language then location places identical URLs next to each
  // other, which the duplicate check below relies on.
  $query->orderBy('x.language', 'DESC');
  $query->orderBy('x.loc');
  $query->addTag('xmlsitemap_generate');
  $query->addMetaData('sitemap', $sitemap);

  $limit = xmlsitemap_get_chunk_size();
  $offset = max($chunk - 1, 0) * $limit;
  $query->range($offset, $limit);
  $links = $query->execute();

  while ($link = $links->fetchAssoc()) {
    // Replace the language code with a language object; language-neutral
    // links use the language from the URL options.
    $link['language'] = $link['language'] != LANGUAGE_NONE ? xmlsitemap_language_load($link['language']) : $url_options['language'];
    $parsed_url = drupal_parse_url($link['loc']);

    // Skip links which are 301 redirected, whether the redirect is stored for
    // the internal path or for its generated URL.
    if ($skip_redirects) {
      $relative_redirect = redirect_fetch_rids_by_path($link['loc'], $link['language']->language, TRUE);
      $alias_redirect = redirect_fetch_rids_by_path(ltrim(url($link['loc']), '/'), $link['language']->language, TRUE);
      if (!empty($relative_redirect) || !empty($alias_redirect)) {
        continue;
      }
    }

    // Remove query or fragment; both are passed to url() separately below.
    $link['loc'] = $parsed_url['path'];
    if ($url_options['alias']) {
      $link['loc'] = xmlsitemap_get_path_alias($link['loc'], $link['language']->language);
    }
    $link_options = array(
      'language' => $link['language'],
      'xmlsitemap_link' => $link,
      'xmlsitemap_sitemap' => $sitemap,
      'query' => $parsed_url['query'],
      'fragment' => $parsed_url['fragment'],
    );
    // @todo Add a separate hook_xmlsitemap_link_url_alter() here?
    $link_url = url($link['loc'], $link_options + $url_options);

    // Skip this link if it was a duplicate of the last one.
    // @todo Figure out a way to do this before generation so we can report
    // back to the user about this.
    if ($link_url == $last_url) {
      continue;
    }
    $last_url = $link_url;
    // Keep track of the total number of links processed.
    $link_count++;

    $element = array();
    $element['loc'] = urldecode($link_url);
    if ($link['lastmod']) {
      if (!empty($output_elements['lastmod'])) {
        $element['lastmod'] = gmdate($lastmod_format, $link['lastmod']);
      }
      // If the link has a lastmod value, update the changefreq so that links
      // with a short changefreq but updated two years ago show decay.
      // We use abs() here just in case items were created on this same cron
      // run because lastmod would be greater than REQUEST_TIME.
      $link['changefreq'] = (abs(REQUEST_TIME - $link['lastmod']) + $link['changefreq']) / 2;
    }
    if (!empty($output_elements['changefreq']) && $link['changefreq']) {
      $element['changefreq'] = xmlsitemap_get_changefreq($link['changefreq']);
    }
    if (!empty($output_elements['priority']) && isset($link['priority']) && $link['priority'] != 0.5) {
      // Don't output the priority value for links that have 0.5 priority. This
      // is the default 'assumed' value if priority is not included as per the
      // sitemaps.org specification.
      $element['priority'] = number_format($link['priority'], 1);
    }

    // @todo Should this be moved to XMLSitemapWriter::writeSitemapElement()?
    drupal_alter('xmlsitemap_element', $element, $link, $sitemap);

    // An alter hook may empty the element to keep the link out of the file.
    if (!empty($element)) {
      $writer->writeSitemapElement('url', $element);
    }
  }

  return $link_count;
}

/**
 * Generate the index sitemap.
 *
 * Any exception raised while writing is logged through watchdog_exception()
 * and then re-thrown to the caller.
 *
 * @param object $sitemap
 *   An XML sitemap object.
 *
 * @return mixed
 *   The element count reported by the index writer.
 */
function xmlsitemap_generate_index(stdClass $sitemap) {
  try {
    $index_writer = new XMLSitemapIndexWriter($sitemap);
    $index_writer->startDocument();
    $index_writer->generateXML();
    $index_writer->endDocument();
  }
  catch (Exception $exception) {
    // Record the failure, then let the caller deal with it.
    watchdog_exception('xmlsitemap', $exception);
    throw $exception;
  }

  $element_count = $index_writer->getSitemapElementCount();
  return $element_count;
}

// BATCH OPERATIONS -----------------------------------------------------------.

/**
 * Batch information callback for regenerating the sitemap files.
 *
 * @param array $smids
 *   An optional array of XML sitemap IDs. If not provided, it will load all
 *   existing XML sitemaps.
 *
 * @return array
 *   A batch definition with 'operations', 'finished', 'title' and 'file'.
 */
function xmlsitemap_regenerate_batch(array $smids = array()) {
  if (!$smids) {
    $smids = db_query("SELECT smid FROM {xmlsitemap_sitemap}")->fetchCol();
  }

  $batch = array(
    'operations' => array(),
    'finished' => 'xmlsitemap_regenerate_batch_finished',
    'title' => t('Regenerating Sitemap'),
    'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
  );

  $operations = array();

  // Set the regenerate flag in case something fails during file generation.
  $operations[] = array(
    'xmlsitemap_batch_variable_set',
    array(array('xmlsitemap_regenerate_needed' => TRUE)),
  );

  // @todo Get rid of this batch operation.
  $operations[] = array('_xmlsitemap_regenerate_before', array());

  // Generate all the sitemap pages for each context, each followed by its
  // index.
  foreach ($smids as $sitemap_id) {
    $operations[] = array('xmlsitemap_regenerate_batch_generate', array($sitemap_id));
    $operations[] = array('xmlsitemap_regenerate_batch_generate_index', array($sitemap_id));
  }

  // Clear the regeneration flag.
  $operations[] = array(
    'xmlsitemap_batch_variable_set',
    array(array('xmlsitemap_regenerate_needed' => FALSE)),
  );

  $batch['operations'] = $operations;
  return $batch;
}

/**
 * Batch callback; generate all pages of a sitemap.
 *
 * Each invocation generates one page. While pages keep producing links the
 * operation reports partial progress so that it is called again; the first
 * page without links ends the run and the sitemap is saved.
 *
 * @param int $smid
 *   The ID of the XML sitemap to generate.
 * @param array $context
 *   The batch context; the sitemap and the page limit live in its sandbox.
 */
function xmlsitemap_regenerate_batch_generate($smid, array &$context) {
  // First invocation: load the sitemap and start from an empty directory.
  if (!isset($context['sandbox']['sitemap'])) {
    $context['sandbox']['sitemap'] = xmlsitemap_sitemap_load($smid);
    $context['sandbox']['sitemap']->chunks = 1;
    $context['sandbox']['sitemap']->links = 0;
    $context['sandbox']['max'] = XMLSITEMAP_MAX_SITEMAP_LINKS;

    // Clear the cache directory for this sitemap before generating any files.
    xmlsitemap_check_directory($context['sandbox']['sitemap']);
    xmlsitemap_clear_directory($context['sandbox']['sitemap']);
  }

  $sitemap = &$context['sandbox']['sitemap'];
  $written = xmlsitemap_generate_page($sitemap, $sitemap->chunks);
  $page_url = url('sitemap.xml', $sitemap->uri['options'] + array('query' => array('page' => $sitemap->chunks)));
  $context['message'] = t('Now generating %sitemap-url.', array('%sitemap-url' => $page_url));

  if (!$written) {
    // Cleanup the 'extra' empty file.
    $empty_file = xmlsitemap_sitemap_get_file($sitemap, $sitemap->chunks);
    if (file_exists($empty_file) && $sitemap->chunks > 1) {
      file_unmanaged_delete($empty_file);
    }
    $sitemap->chunks--;

    // Save the updated chunks and links values.
    $context['sandbox']['max'] = $sitemap->chunks;
    $sitemap->updated = REQUEST_TIME;
    xmlsitemap_sitemap_get_max_filesize($sitemap);
    xmlsitemap_sitemap_save($sitemap);
  }
  else {
    // The page had links, so account for them and move on to the next page.
    $sitemap->links += $written;
    $sitemap->chunks++;
  }

  // Report partial progress until the page counter reaches the maximum.
  $max_chunks = $context['sandbox']['max'];
  if ($sitemap->chunks != $max_chunks) {
    $context['finished'] = $sitemap->chunks / $max_chunks;
  }
}

/**
 * Batch callback; generate the index page of a sitemap.
 *
 * The index is only generated for sitemaps that have more than one chunk.
 *
 * @param int $smid
 *   The ID of the XML sitemap.
 * @param array $context
 *   The batch context; receives the progress message.
 */
function xmlsitemap_regenerate_batch_generate_index($smid, array &$context) {
  $sitemap = xmlsitemap_sitemap_load($smid);

  // Sitemaps with at most one chunk get no index.
  if (!($sitemap->chunks > 1)) {
    return;
  }

  xmlsitemap_generate_index($sitemap);
  $index_url = url('sitemap.xml', $sitemap->uri['options']);
  $context['message'] = t('Now generating sitemap index %sitemap-url.', array('%sitemap-url' => $index_url));
}

/**
 * Batch callback; sitemap regeneration finished.
 *
 * On success the time of generation is stored, a notice is logged and
 * hook_xmlsitemap_regenerate_finished() is invoked. If the batch failed, or
 * the 'xmlsitemap_regenerate_needed' flag is still set, an error message is
 * shown instead.
 *
 * @param bool $success
 *   Whether the batch completed without errors.
 * @param array $results
 *   Results collected by the batch operations (unused).
 * @param array $operations
 *   Operations that remained unprocessed (unused).
 * @param string $elapsed
 *   The elapsed processing time, used in the log message.
 */
function xmlsitemap_regenerate_batch_finished($success, $results, $operations, $elapsed) {
  // A flag that is still set means an operation did not run to completion.
  if (!$success || variable_get('xmlsitemap_regenerate_needed', FALSE)) {
    drupal_set_message(t('The sitemaps were not successfully regenerated.'), 'error');
    return;
  }

  variable_set('xmlsitemap_generated_last', REQUEST_TIME);

  // Show a watchdog message that the sitemap was regenerated.
  $placeholders = array(
    '@elapsed' => $elapsed,
    '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
  );
  watchdog('xmlsitemap', 'Finished XML sitemap generation in @elapsed. Memory usage: @memory-peak.', $placeholders, WATCHDOG_NOTICE);

  module_invoke_all('xmlsitemap_regenerate_finished');
}

/**
 * Batch information callback for rebuilding the sitemap data.
 *
 * The batch purges the existing links of the given types, runs each type's
 * rebuild callback and finally appends the operations of the regeneration
 * batch.
 *
 * @param array $entities
 *   The link (entity) types whose links should be rebuilt.
 * @param bool $save_custom
 *   Passed on, cast to a boolean, to the purge operation.
 *
 * @return array
 *   A batch definition with 'operations', 'finished', 'title' and 'file'.
 */
function xmlsitemap_rebuild_batch(array $entities, $save_custom = FALSE) {
  $batch = array(
    'operations' => array(),
    'finished' => 'xmlsitemap_rebuild_batch_finished',
    'title' => t('Rebuilding Sitemap'),
    'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
  );

  $operations = array();

  // Set the rebuild flag in case something fails during the rebuild.
  $operations[] = array(
    'xmlsitemap_batch_variable_set',
    array(array('xmlsitemap_rebuild_needed' => TRUE)),
  );

  // Purge any links first.
  $operations[] = array(
    'xmlsitemap_rebuild_batch_clear',
    array($entities, (bool) $save_custom),
  );

  // Fetch all the sitemap links and save them to the {xmlsitemap} table.
  foreach ($entities as $entity_type) {
    $link_info = xmlsitemap_get_link_info($entity_type);
    $operations[] = array($link_info['xmlsitemap']['rebuild callback'], array($entity_type));
  }

  // Clear the rebuild flag.
  $operations[] = array(
    'xmlsitemap_batch_variable_set',
    array(array('xmlsitemap_rebuild_needed' => FALSE)),
  );

  // Add the regeneration batch.
  $regenerate_batch = xmlsitemap_regenerate_batch();
  $batch['operations'] = array_merge($operations, $regenerate_batch['operations']);

  return $batch;
}

/**
 * Batch callback; set an array of variables and their values.
 *
 * @param array $variables
 *   Values to store with variable_set(), keyed by variable name.
 */
function xmlsitemap_batch_variable_set(array $variables) {
  foreach (array_keys($variables) as $name) {
    variable_set($name, $variables[$name]);
  }
}

/**
 * Batch callback; clear sitemap links for entities.
 *
 * @param array $entities
 *   The link types to purge; nothing is purged when the array is empty.
 * @param bool $save_custom
 *   Passed through to xmlsitemap_rebuild_clear().
 * @param array $context
 *   The batch context; receives the progress message.
 */
function xmlsitemap_rebuild_batch_clear(array $entities, $save_custom, &$context) {
  $has_types = count($entities) > 0;
  if ($has_types) {
    xmlsitemap_rebuild_clear($entities, $save_custom);
  }

  $context['message'] = t('Purging links.');
}

/**
 * Batch callback; fetch and add the sitemap links for a specific entity.
 *
 * Entities are processed 20 at a time in ascending ID order. The ID of the
 * last processed entity is kept in the sandbox so that the next invocation
 * continues after it.
 *
 * @param string $entity
 *   The entity type whose links should be rebuilt.
 * @param array $context
 *   The batch context; progress is tracked in its sandbox.
 */
function xmlsitemap_rebuild_batch_fetch($entity, &$context) {
  if (!isset($context['sandbox']['info'])) {
    $context['sandbox']['info'] = xmlsitemap_get_link_info($entity);
    $context['sandbox']['progress'] = 0;
    $context['sandbox']['last_id'] = 0;
  }
  $info = $context['sandbox']['info'];

  $query = new EntityFieldQuery();
  $query->entityCondition('entity_type', $entity);
  $query->entityCondition('entity_id', $context['sandbox']['last_id'], '>');
  $query->addTag('xmlsitemap_link_bundle_access');
  $query->addTag('xmlsitemap_rebuild');
  $query->addMetaData('entity', $entity);
  $query->addMetaData('entity_info', $info);
  if ($types = xmlsitemap_get_link_type_enabled_bundles($entity)) {
    $query->entityCondition('bundle', $types, 'IN');
  }
  else {
    // If no enabled bundle types, skip everything else.
    return;
  }

  if (!isset($context['sandbox']['max'])) {
    $count_query = clone $query;
    $count_query->count();
    $context['sandbox']['max'] = $count_query->execute();
    if (!$context['sandbox']['max']) {
      // If there are no items to process, skip everything else.
      return;
    }
  }

  // PostgreSQL cannot have the ORDER BY in the count query, so the ordering
  // is only added after the count query has been cloned.
  $query->entityOrderBy('entity_id');
  $limit = 20;
  $query->range(0, $limit);

  $result = $query->execute();

  // The total was counted on the first pass. If entities have disappeared
  // since then, a pass can come back empty before the progress reaches that
  // total. Stop here, otherwise the operation would be called again forever.
  if (empty($result[$entity])) {
    $context['finished'] = 1;
    return;
  }
  $ids = array_keys($result[$entity]);

  $info['xmlsitemap']['process callback']($ids);
  $context['sandbox']['last_id'] = end($ids);
  $context['sandbox']['progress'] += count($ids);
  $context['message'] = t('Now processing %entity @last_id (@progress of @count).', array(
    '%entity' => $entity,
    '@last_id' => $context['sandbox']['last_id'],
    '@progress' => $context['sandbox']['progress'],
    '@count' => $context['sandbox']['max'],
  ));

  if ($context['sandbox']['progress'] >= $context['sandbox']['max']) {
    $context['finished'] = 1;
  }
  else {
    $context['finished'] = $context['sandbox']['progress'] / $context['sandbox']['max'];
  }
}

/**
 * Batch callback; sitemap rebuild finished.
 *
 * Shows a status message on success. If the batch failed, or the
 * 'xmlsitemap_rebuild_needed' flag is still set, an error message is shown
 * instead.
 *
 * @param bool $success
 *   Whether the batch completed without errors.
 * @param array $results
 *   Results collected by the batch operations (unused).
 * @param array $operations
 *   Operations that remained unprocessed (unused).
 * @param string $elapsed
 *   The elapsed processing time (unused).
 */
function xmlsitemap_rebuild_batch_finished($success, $results, $operations, $elapsed) {
  // A flag that is still set means an operation did not run to completion.
  if (!$success || variable_get('xmlsitemap_rebuild_needed', FALSE)) {
    drupal_set_message(t('The sitemap links were not successfully rebuilt.'), 'error');
    return;
  }

  drupal_set_message(t('The sitemap links were rebuilt.'));
}

/**
 * Get the link types whose sitemap links can be rebuilt.
 *
 * A link type qualifies when it declares a rebuild callback and, if it has a
 * bundle entity key, at least one of its bundles is enabled.
 *
 * @return array
 *   A list of link type names.
 */
function xmlsitemap_get_rebuildable_link_types() {
  $rebuildable = array();

  foreach (xmlsitemap_get_link_info() as $type => $definition) {
    // If the entity is missing a rebuild callback, skip.
    if (empty($definition['xmlsitemap']['rebuild callback'])) {
      continue;
    }

    // If the entity has bundles, but no enabled bundles, skip since
    // rebuilding wouldn't get any links.
    $uses_bundles = !empty($definition['entity keys']['bundle']);
    if ($uses_bundles && !xmlsitemap_get_link_type_enabled_bundles($type)) {
      continue;
    }

    $rebuildable[] = $type;
  }

  return $rebuildable;
}

/**
 * Clear all sitemap links for given entity types.
 *
 * hook_xmlsitemap_rebuild_clear() is invoked before the links are deleted.
 *
 * @param array $types
 *   An array of link types.
 * @param bool $save_custom
 *   A boolean if links with status or priority overridden should not be
 *   removed (and hence overridden values not lost).
 *
 * @return int
 *   The number of deleted links.
 */
function xmlsitemap_rebuild_clear(array $types, $save_custom) {
  // Let other modules respond to the rebuild clearing.
  module_invoke_all('xmlsitemap_rebuild_clear', $types, $save_custom);

  $delete = db_delete('xmlsitemap')->condition('type', $types);

  // If we want to save the custom data, only delete links that still use the
  // default inclusion and priority.
  if ($save_custom) {
    $delete
      ->condition('status_override', 0)
      ->condition('priority_override', 0);
  }

  $deleted = $delete->execute();
  return $deleted;
}