Commit 6303ecb8 authored by Dries

- Patch #303930 by Aron Novak, alex_b, et al: introduced pluggable architecture for aggregator

parent 0c63d9e2
......@@ -42,6 +42,9 @@ Drupal 7.0, xxxx-xx-xx (development version)
* Optionally, RSS feeds may be configured to not automatically generate feed blocks.
- Search:
* Added support for language-aware searches.
- Aggregator:
* Introduced architecture that allows pluggable parsers and processors for
syndicating RSS and Atom feeds.
- Testing:
* Added test framework and tests.
- Improved time zone support:
......
This diff is collapsed.
<?php
// $Id$
/**
* @file
* Documentation for aggregator API.
*/
/**
* @addtogroup hooks
* @{
*/
/**
* Implement this hook to create an alternative fetcher for aggregator module.
*
* A fetcher downloads feed data to a Drupal site. The fetcher is called
* at the first of the three aggregation stages: data is downloaded by the
* active fetcher, it is converted to a common format by the active parser and
* finally, it is passed to all active processors which manipulate or store the
* data.
*
* Modules that define this hook can be set as active fetcher on
* admin/content/aggregator/settings. Only one fetcher can be active at a time.
*
* @param $feed
* The $feed object that describes the resource to be downloaded.
* $feed->url contains the link to the feed. Download the data at the URL
* and expose it to other modules by attaching it to $feed->source_string.
*
* @see hook_aggregator_fetch_info()
* @see hook_aggregator_parse()
* @see hook_aggregator_process()
*
* @ingroup aggregator
*/
function hook_aggregator_fetch($feed) {
  // Download the raw feed from its URL, then attach it to the feed object so
  // that the parsing stage can pick it up from $feed->source_string.
  $raw_data = mymodule_fetch($feed->url);
  $feed->source_string = $raw_data;
}
/**
* Implement this hook to expose the title and a short description of your
* fetcher.
*
* The title and the description provided are shown on
* admin/content/aggregator/settings among other places. Use as title the human
* readable name of the fetcher and as description a brief (40 to 80 characters)
* explanation of the fetcher's functionality.
*
* This hook is only called if your module implements hook_aggregator_fetch().
* If this hook is not implemented aggregator will use your module's file name
* as title and there will be no description.
*
* @return
* An associative array defining a title and a description string.
*
* @see hook_aggregator_fetch()
*
* @ingroup aggregator
*/
function hook_aggregator_fetch_info() {
  // Human readable name and short summary of this fetcher.
  $info = array();
  $info['title'] = t('Default fetcher');
  $info['description'] = t('Default fetcher for resources available by URL.');
  return $info;
}
/**
* Implement this hook to create an alternative parser for aggregator module.
*
* A parser converts feed item data to a common format. The parser is called
* at the second of the three aggregation stages: data is downloaded by the
* active fetcher, it is converted to a common format by the active parser and
* finally, it is passed to all active processors which manipulate or store the
* data.
*
* Modules that define this hook can be set as active parser on
* admin/content/aggregator/settings. Only one parser can be active at a time.
*
* @param $feed
* The $feed object that describes the resource to be parsed.
* $feed->source_string contains the raw feed data as a string. Parse data
* from $feed->source_string and expose it to other modules as an array of
* data items on $feed->items.
*
* By convention, the common format for a single feed item is:
* $item[key-name] = value;
*
* Recognized keys:
* TITLE (string) - the title of a feed item
* DESCRIPTION (string) - the description (body text) of a feed item
* TIMESTAMP (UNIX timestamp) - the feed item's published time as UNIX timestamp
* AUTHOR (string) - the feed item's author
* GUID (string) - RSS/Atom global unique identifier
* LINK (string) - the feed item's URL
*
* @see hook_aggregator_parse_info()
* @see hook_aggregator_fetch()
* @see hook_aggregator_process()
*
* @ingroup aggregator
*/
function hook_aggregator_parse($feed) {
  // Convert the raw feed string into an array of items and expose it on the
  // feed object, where processors look for it.
  $parsed_items = mymodule_parse($feed->source_string);
  $feed->items = $parsed_items;
}
/**
* Implement this hook to expose the title and a short description of your
* parser.
*
* The title and the description provided are shown on
* admin/content/aggregator/settings among other places. Use as title the human
* readable name of the parser and as description a brief (40 to 80 characters)
* explanation of the parser's functionality.
*
* This hook is only called if your module implements hook_aggregator_parse().
* If this hook is not implemented aggregator will use your module's file name
* as title and there will be no description.
*
* @return
* An associative array defining a title and a description string.
*
* @see hook_aggregator_parse()
*
* @ingroup aggregator
*/
function hook_aggregator_parse_info() {
  // Human readable name and short summary of this parser.
  $info = array();
  $info['title'] = t('Default parser');
  $info['description'] = t('Default parser for RSS, Atom and RDF feeds.');
  return $info;
}
/**
* Implement this hook to create a processor for aggregator module.
*
* A processor acts on parsed feed data. Active processors are called at the
* third and last of the aggregation stages: data is downloaded by the active
* fetcher, it is converted to a common format by the active parser and
* finally, it is passed to all active processors which manipulate or store the
* data.
*
* Modules that define this hook can be activated as processor on
* admin/content/aggregator/settings.
*
* @param $feed
* The $feed object that describes the resource to be processed. $feed->items
* contains an array of feed items downloaded and parsed at the parsing
* stage. See hook_aggregator_parse() for the basic format of a single item
* in the $feed->items array. For the exact format refer to the particular
* parser in use.
*
* @see hook_aggregator_process_info()
* @see hook_aggregator_fetch()
* @see hook_aggregator_parse()
*
* @ingroup aggregator
*/
function hook_aggregator_process($feed) {
  // Hand every parsed item to the module's own storage routine, in the order
  // in which the parser delivered them.
  foreach ($feed->items as $parsed_item) {
    mymodule_save($parsed_item);
  }
}
/**
* Implement this hook to expose the title and a short description of your
* processor.
*
* The title and the description provided are shown most importantly on
* admin/content/aggregator/settings. Use as title the human readable name of
* the processor and as description a brief (40 to 80 characters) explanation
* of the processor's functionality.
*
* This hook is only called if your module implements
* hook_aggregator_process(). If this hook is not implemented aggregator
* will use your module's file name as title and there will be no description.
*
* @return
* An associative array defining a title and a description string.
*
* @see hook_aggregator_process()
*
* @ingroup aggregator
*/
function hook_aggregator_process_info() {
  // Like the fetcher and parser info hooks, this hook takes no arguments: it
  // only describes the processor and does not act on a particular feed.
  return array(
    'title' => t('Default processor'),
    'description' => t('Creates lightweight records of feed items.'),
  );
}
/**
* Implement this hook to remove stored data if a feed is being deleted or a
* feed's items are being removed.
*
* Aggregator calls this hook if either a feed is deleted or a user clicks on
* "remove items".
*
* If your module stores feed items, for example in hook_aggregator_process(),
* it is recommended to implement this hook and to remove the data related to
* $feed when it is called.
*
* @param $feed
* The $feed object whose items are being removed.
*
* @ingroup aggregator
*/
function hook_aggregator_remove($feed) {
  // Drop everything this module stored for the feed, identified by its fid.
  $fid = $feed->fid;
  mymodule_remove_items($fid);
}
/**
* @} End of "addtogroup hooks".
*/
<?php
// $Id$
/**
* @file
* Fetcher functions for the aggregator module.
*/
/**
* Implementation of hook_aggregator_fetch_info().
*/
function aggregator_aggregator_fetch_info() {
  // Title and description of aggregator's built-in fetcher.
  $info = array();
  $info['title'] = t('Default fetcher');
  $info['description'] = t('Downloads data from a URL using Drupal\'s HTTP request handler.');
  return $info;
}
/**
* Implementation of hook_aggregator_fetch().
*/
function aggregator_aggregator_fetch($feed) {
  // Until fresh data has actually been downloaded, the feed carries no source.
  $feed->source_string = FALSE;

  // Build conditional GET headers from what is known about the last download.
  $request_headers = array();
  if ($feed->etag) {
    $request_headers['If-None-Match'] = $feed->etag;
  }
  if ($feed->modified) {
    $request_headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed->modified) . ' GMT';
  }

  // Request the feed and branch on the HTTP response code.
  $response = drupal_http_request($feed->url, $request_headers);
  $status = $response->code;

  // 304 Not Modified: only record that the feed has been checked.
  if ($status == 304) {
    db_update('aggregator_feed')
      ->fields(array('checked' => REQUEST_TIME))
      ->condition('fid', $feed->fid)
      ->execute();
    drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed->title)));
    return;
  }

  // A permanent redirect is handled like a successful response, after noting
  // the new location on the feed object.
  $moved_permanently = ($status == 301);
  if ($moved_permanently || $status == 200 || $status == 302 || $status == 307) {
    if ($moved_permanently) {
      $feed->url = $response->redirect_url;
      $feed->redirected = TRUE;
    }

    // Normalize a response that came back without body or headers.
    if (!isset($response->data)) {
      $response->data = '';
    }
    if (!isset($response->headers)) {
      $response->headers = array();
    }

    // The md5 hash of the feed data is kept in the database. If the hash of
    // the downloaded data equals the stored one, the feed is considered
    // unchanged and nothing is passed on.
    $md5 = md5($response->data);
    if ($feed->hash == $md5) {
      db_update('aggregator_feed')
        ->condition('fid', $feed->fid)
        ->fields(array('checked' => REQUEST_TIME))
        ->execute();
      drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed->title)));
      return;
    }

    // New content: expose body and headers to the following stages.
    $feed->source_string = $response->data;
    $feed->http_headers = $response->headers;
    return;
  }

  // Any other response code is reported as a broken feed.
  watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed->title, '%error' => $response->code . ' ' . $response->error), WATCHDOG_WARNING);
  drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed->title, '%error' => $response->code . ' ' . $response->error)));
  module_invoke('system', 'check_http_request');
}
......@@ -8,4 +8,7 @@ core = 7.x
files[] = aggregator.module
files[] = aggregator.admin.inc
files[] = aggregator.pages.inc
files[] = aggregator.fetcher.inc
files[] = aggregator.parser.inc
files[] = aggregator.processor.inc
files[] = aggregator.install
......@@ -20,6 +20,9 @@ function aggregator_uninstall() {
variable_del('aggregator_summary_items');
variable_del('aggregator_clear');
variable_del('aggregator_category_selector');
variable_del('aggregator_fetcher');
variable_del('aggregator_parser');
variable_del('aggregator_processors');
}
/**
......
This diff is collapsed.
......@@ -35,8 +35,7 @@ function aggregator_page_last() {
function aggregator_page_source($arg1, $arg2 = NULL) {
// If there are two arguments then this function is the categorize form, and
// $arg1 is $form_state and $arg2 is $feed. Otherwise, $arg1 is $feed.
$feed = is_array($arg2) ? $arg2 : $arg1;
$feed = (object)$feed;
$feed = is_object($arg2) ? $arg2 : $arg1;
drupal_set_title($feed->title);
$feed_source = theme('aggregator_feed_source', $feed);
......
<?php
// $Id$
/**
* @file
* Parser functions for the aggregator module.
*/
/**
* Implementation of hook_aggregator_parse_info().
*/
function aggregator_aggregator_parse_info() {
  // Title and description of aggregator's built-in parser.
  $info = array();
  $info['title'] = t('Default parser');
  $info['description'] = t('Parses RSS, Atom and RDF feeds.');
  return $info;
}
/**
* Implementation of hook_aggregator_parse().
*/
function aggregator_aggregator_parse($feed) {
  global $channel, $image;

  // Parse the downloaded data; on failure there is nothing to update.
  if (!aggregator_parse_feed($feed->source_string, $feed)) {
    return;
  }

  // Derive the modification time from the Last-Modified header, if present.
  if (empty($feed->http_headers['Last-Modified'])) {
    $modified = 0;
  }
  else {
    $modified = strtotime($feed->http_headers['Last-Modified']);
  }

  // Strip surrounding whitespace from the collected channel and image data.
  $channel = array_map('trim', $channel);
  $image = array_map('trim', $image);

  // Render the feed image as a linked image, but only when link, URL and
  // title are all available.
  $has_image = !empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE']);
  if ($has_image) {
    $image = l(theme('image', $image['URL'], $image['TITLE']), $image['LINK'], array('html' => TRUE));
  }
  else {
    $image = '';
  }

  if (empty($feed->http_headers['ETag'])) {
    $etag = '';
  }
  else {
    $etag = $feed->http_headers['ETag'];
  }

  // Write the refreshed feed metadata back to the database.
  $feed_fields = array(
    'url' => $feed->url,
    'checked' => REQUEST_TIME,
    'link' => !empty($channel['LINK']) ? $channel['LINK'] : '',
    'description' => !empty($channel['DESCRIPTION']) ? $channel['DESCRIPTION'] : '',
    'image' => $image,
    'hash' => md5($feed->source_string),
    'etag' => $etag,
    'modified' => $modified,
  );
  db_merge('aggregator_feed')
    ->key(array('fid' => $feed->fid))
    ->fields($feed_fields)
    ->execute();

  // Clear the cache.
  cache_clear_all();

  // Log a URL change if the fetcher flagged a redirect on the feed object.
  if (isset($feed->redirected)) {
    watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed->title, '%url' => $feed->url));
  }

  watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed->title));
  drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed->title)));
}
/**
* Parse a feed and store its items.
*
* @param $data
* The feed data.
* @param $feed
* An object describing the feed to be parsed.
* @return
* FALSE on error, TRUE otherwise.
*/
function aggregator_parse_feed(&$data, $feed) {
  global $items, $image, $channel;

  // Unset the global variables before we use them. The XML callbacks share
  // their parsing state through these globals.
  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
  $items = array();
  $image = array();
  $channel = array();

  // Parse the data.
  $xml_parser = drupal_xml_parser_create($data);
  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');

  if (!xml_parse($xml_parser, $data, 1)) {
    // Collect the error details first, then release the parser so that it is
    // freed on the failure path as well.
    $error_args = array(
      '%site' => $feed->title,
      '%error' => xml_error_string(xml_get_error_code($xml_parser)),
      '%line' => xml_get_current_line_number($xml_parser),
    );
    xml_parser_free($xml_parser);

    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $error_args, WATCHDOG_WARNING);
    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $error_args), 'error');
    return FALSE;
  }
  xml_parser_free($xml_parser);

  // We reverse the array such that we store the first item last, and the last
  // item first. In the database, the newest item should be at the top.
  $items = array_reverse($items);

  // Initialize items array.
  $feed->items = array();
  foreach ($items as $item) {
    // Prepare the item:
    foreach ($item as $key => $value) {
      $item[$key] = trim($value);
    }

    // Resolve the item's title. If no title is found, we use up to 40
    // characters of the description ending at a word boundary, but not
    // splitting potential entities.
    if (empty($item['TITLE'])) {
      if (!empty($item['DESCRIPTION'])) {
        $item['TITLE'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40));
      }
      else {
        $item['TITLE'] = '';
      }
    }

    // Resolve the item's link, falling back to the link of the feed itself.
    if (empty($item['LINK'])) {
      $item['LINK'] = $feed->link;
    }
    $item['GUID'] = isset($item['GUID']) ? $item['GUID'] : '';

    // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag.
    if (!empty($item['CONTENT:ENCODED'])) {
      $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
    }
    elseif (!empty($item['SUMMARY'])) {
      $item['DESCRIPTION'] = $item['SUMMARY'];
    }
    elseif (!empty($item['CONTENT'])) {
      $item['DESCRIPTION'] = $item['CONTENT'];
    }

    // Try to resolve and parse the item's publication date. The first
    // non-empty candidate key wins.
    $date = '';
    foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) {
      if (!empty($item[$key])) {
        $date = $item[$key];
        break;
      }
    }

    $item['TIMESTAMP'] = strtotime($date);
    if ($item['TIMESTAMP'] === FALSE) {
      // aggregator_parse_w3cdtf() returns FALSE on failure.
      $item['TIMESTAMP'] = aggregator_parse_w3cdtf($date);
    }

    // Make sure the optional keys are always present.
    $item += array('AUTHOR' => '', 'DESCRIPTION' => '');

    // Store on $feed object. This is where processors will look for parsed items.
    $feed->items[] = $item;
  }

  return TRUE;
}
/**
* Callback function used by the XML parser.
*/
function aggregator_element_start($parser, $name, $attributes) {
  global $item, $element, $tag, $items, $channel;

  // Tags that become the current parsing context as soon as they open.
  $context_tags = array('IMAGE', 'TEXTINPUT', 'CONTENT', 'SUMMARY', 'TAGLINE', 'SUBTITLE', 'LOGO', 'INFO');

  if (in_array($name, $context_tags)) {
    $element = $name;
  }
  elseif ($name == 'ID' || $name == 'LINK') {
    // An ID outside of an item becomes the current context.
    if ($name == 'ID' && $element != 'ITEM') {
      $element = $name;
    }
    // ID deliberately shares the link handling below with LINK: a
    // rel="alternate" attribute stores its HREF on the current item or,
    // outside of an item, on the channel.
    if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') {
      if ($element == 'ITEM') {
        $items[$item]['LINK'] = $attributes['HREF'];
      }
      else {
        $channel['LINK'] = $attributes['HREF'];
      }
    }
  }
  elseif ($name == 'ITEM') {
    // A new RSS item starts: switch context and advance the item counter.
    $element = $name;
    $item = $item + 1;
  }
  elseif ($name == 'ENTRY') {
    // Atom entries are tracked under the same context as RSS items.
    $element = 'ITEM';
    $item = $item + 1;
  }

  // Remember the most recently opened tag for the character data handler.
  $tag = $name;
}
/**
* Callback function used by the XML parser.
*/
function aggregator_element_end($parser, $name) {
  global $element;

  // Closing any of these tags always clears the current parsing context.
  $resetting_tags = array('IMAGE', 'TEXTINPUT', 'ITEM', 'ENTRY', 'CONTENT', 'INFO');

  if (in_array($name, $resetting_tags)) {
    $element = '';
  }
  elseif ($name == 'ID' && $element == 'ID') {
    // A closing ID only clears the context if ID itself is the context.
    $element = '';
  }
  // NOTE(review): SUMMARY, TAGLINE, SUBTITLE and LOGO are set as context by
  // aggregator_element_start() but are not cleared here - confirm that this
  // is intended.
}
/**
* Callback function used by the XML parser.
*/
function aggregator_element_data($parser, $data) {
  global $channel, $element, $items, $item, $image, $tag;

  // Make sure there is a bucket for the item currently being parsed.
  $items += array($item => array());

  if ($element == 'ITEM') {
    // Inside an item: append the text to the value of the current tag.
    $items[$item] += array($tag => '');
    $items[$item][$tag] .= $data;
  }
  elseif ($element == 'IMAGE' || $element == 'LOGO') {
    // Feed image data is collected separately, keyed by the current tag.
    $image += array($tag => '');
    $image[$tag] .= $data;
  }
  elseif ($element == 'LINK') {
    // Only non-empty link text is recorded on the item.
    if ($data) {
      $items[$item] += array($tag => '');
      $items[$item][$tag] .= $data;
    }
  }
  elseif ($element == 'CONTENT') {
    // All text inside CONTENT ends up under one key, whatever the inner tag.
    $items[$item] += array('CONTENT' => '');
    $items[$item]['CONTENT'] .= $data;
  }
  elseif ($element == 'SUMMARY') {
    // All text inside SUMMARY ends up under one key, whatever the inner tag.
    $items[$item] += array('SUMMARY' => '');
    $items[$item]['SUMMARY'] .= $data;
  }
  elseif ($element == 'TAGLINE' || $element == 'SUBTITLE') {
    // Tagline and subtitle both feed the channel description.
    $channel += array('DESCRIPTION' => '');
    $channel['DESCRIPTION'] .= $data;
  }
  elseif (!in_array($element, array('INFO', 'ID', 'TEXTINPUT'))) {
    // Everything else belongs to the channel. INFO, ID and TEXTINPUT are not
    // supported, but they must be recognized so that their contents are
    // discarded instead of being stored.
    $channel += array($tag => '');
    $channel[$tag] .= $data;
  }
}
/**
* Parse the W3C date/time format, a subset of ISO 8601.
*
* PHP date parsing functions do not handle this format.
* See http://www.w3.org/TR/NOTE-datetime for more information.
* Originally from MagpieRSS (http://magpierss.sourceforge.net/).
*
* @param $date_str
* A string with a potentially W3C DTF date.
* @return
* A timestamp if parsed successfully or FALSE if not.