<?php
// $Id$
/**
* @file
* Parser functions for the aggregator module.
*/
/**
 * Implements hook_aggregator_parse_info().
 *
 * @return
 *   An associative array with the translated 'title' and 'description' of
 *   this parser.
 */
function aggregator_aggregator_parse_info() {
  $info = array();
  $info['title'] = t('Default parser');
  $info['description'] = t('Parses RSS, Atom and RDF feeds.');
  return $info;
}
/**
 * Implements hook_aggregator_parse().
 *
 * Parses $feed->source_string with aggregator_parse_feed() and, when parsing
 * succeeds, merges the refreshed channel data into the aggregator_feed row
 * keyed by $feed->fid, calls cache_clear_all() and reports the update.
 *
 * @param $feed
 *   The feed object. This function reads its source_string, http_headers,
 *   fid, url and title properties, and checks whether redirected is set.
 */
function aggregator_aggregator_parse($feed) {
  global $channel, $image;

  // Filter the input data. Nothing is stored when the feed cannot be parsed.
  if (!aggregator_parse_feed($feed->source_string, $feed)) {
    return;
  }

  $modified = empty($feed->http_headers['Last-Modified']) ? 0 : strtotime($feed->http_headers['Last-Modified']);
  $etag = empty($feed->http_headers['ETag']) ? '' : $feed->http_headers['ETag'];

  // Prepare the channel data.
  foreach ($channel as $key => $value) {
    $channel[$key] = trim($value);
  }

  // Prepare the image data (if any).
  foreach ($image as $key => $value) {
    $image[$key] = trim($value);
  }

  // The image is only stored when link, url and title are all present; it is
  // stored as rendered markup, replacing the global array with a string.
  if (!empty($image['link']) && !empty($image['url']) && !empty($image['title'])) {
    $image = l(theme('image', $image['url'], $image['title']), $image['link'], array('html' => TRUE));
  }
  else {
    $image = '';
  }

  // Update the feed data.
  db_merge('aggregator_feed')
    ->key(array('fid' => $feed->fid))
    ->fields(array(
      'url' => $feed->url,
      'checked' => REQUEST_TIME,
      'link' => !empty($channel['link']) ? $channel['link'] : '',
      'description' => !empty($channel['description']) ? $channel['description'] : '',
      'image' => $image,
      'hash' => md5($feed->source_string),
      'etag' => $etag,
      'modified' => $modified,
    ))
    ->execute();

  // Clear the cache.
  cache_clear_all();

  if (isset($feed->redirected)) {
    watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed->title, '%url' => $feed->url));
  }

  watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed->title));
  drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed->title)));
}
/**
 * Parse a feed and store its items.
 *
 * Resets the parser globals, runs the XML parser over the feed data with the
 * aggregator_element_*() callbacks, then normalizes every collected item and
 * appends it to $feed->items.
 *
 * @param $data
 *   The feed data.
 * @param $feed
 *   An object describing the feed to be parsed. Its title is used in error
 *   messages, its link is the fallback for items without a link, and its
 *   items property is overwritten with the parsed items.
 * @return
 *   FALSE on error, TRUE otherwise.
 */
function aggregator_parse_feed(&$data, $feed) {
  global $items, $image, $channel;

  // Unset the global variables before we use them.
  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
  $items = array();
  $image = array();
  $channel = array();

  // Parse the data.
  $xml_parser = drupal_xml_parser_create($data);
  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');

  if (!xml_parse($xml_parser, $data, TRUE)) {
    // Read the error details first, then release the parser so that the
    // failure path does not leak it.
    $error_args = array(
      '%site' => $feed->title,
      '%error' => xml_error_string(xml_get_error_code($xml_parser)),
      '%line' => xml_get_current_line_number($xml_parser),
    );
    xml_parser_free($xml_parser);

    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $error_args, WATCHDOG_WARNING);
    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $error_args), 'error');
    return FALSE;
  }
  xml_parser_free($xml_parser);

  // We reverse the array such that we store the first item last, and the last
  // item first. In the database, the newest item should be at the top.
  $items = array_reverse($items);

  // Initialize items array.
  $feed->items = array();
  foreach ($items as $item) {
    // Prepare the item:
    foreach ($item as $key => $value) {
      $item[$key] = trim($value);
    }

    // Resolve the item's title. If no title is found, we use up to 40
    // characters of the description ending at a word boundary, but not
    // splitting potential entities.
    if (empty($item['title'])) {
      if (!empty($item['description'])) {
        $item['title'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['description'], 40));
      }
      else {
        $item['title'] = '';
      }
    }

    // Resolve the item's link, falling back to the feed's own link.
    if (empty($item['link'])) {
      $item['link'] = $feed->link;
    }

    $item['guid'] = isset($item['guid']) ? $item['guid'] : '';

    // Atom feeds have a content and/or summary tag instead of a description tag.
    if (!empty($item['content:encoded'])) {
      $item['description'] = $item['content:encoded'];
    }
    elseif (!empty($item['summary'])) {
      $item['description'] = $item['summary'];
    }
    elseif (!empty($item['content'])) {
      $item['description'] = $item['content'];
    }

    // Try to resolve and parse the item's publication date. The first
    // non-empty candidate wins.
    $date = '';
    foreach (array('pubdate', 'dc:date', 'dcterms:issued', 'dcterms:created', 'dcterms:modified', 'issued', 'created', 'modified', 'published', 'updated') as $key) {
      if (!empty($item[$key])) {
        $date = $item[$key];
        break;
      }
    }

    $item['timestamp'] = strtotime($date);
    if ($item['timestamp'] === FALSE) {
      // aggregator_parse_w3cdtf() returns FALSE on failure.
      $item['timestamp'] = aggregator_parse_w3cdtf($date);
    }

    // Make sure the optional keys always exist.
    $item += array('author' => '', 'description' => '');

    // Store on $feed object. This is where processors will look for parsed items.
    $feed->items[] = $item;
  }

  return TRUE;
}
/**
 * Callback function used by the XML parser when an element opens.
 *
 * Tracks the current context in the $element global, the current tag in the
 * $tag global, and the index of the current item in the $item global.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   The element name. It is lowercased before being compared.
 * @param $attributes
 *   An associative array of the element's attributes. Keys are lowercased
 *   before being looked up.
 */
function aggregator_element_start($parser, $name, $attributes) {
  global $item, $element, $tag, $items, $channel;

  $name = strtolower($name);
  // Attribute names get the same treatment as the element name, so that the
  // 'rel' and 'href' lookups below work whether or not the parser upper-cases
  // names (PHP's XML parser does so by default).
  $attributes = array_change_key_case($attributes, CASE_LOWER);

  switch ($name) {
    case 'image':
    case 'textinput':
    case 'content':
    case 'summary':
    case 'tagline':
    case 'subtitle':
    case 'logo':
    case 'info':
      $element = $name;
      break;

    case 'id':
      if ($element != 'item') {
        $element = $name;
      }
      // No break: execution continues into the 'link' case, which only acts
      // on elements carrying a rel="alternate" attribute.

    case 'link':
      if (!empty($attributes['rel']) && $attributes['rel'] == 'alternate' && isset($attributes['href'])) {
        if ($element == 'item') {
          $items[$item]['link'] = $attributes['href'];
        }
        else {
          $channel['link'] = $attributes['href'];
        }
      }
      break;

    case 'item':
      $element = $name;
      $item += 1;
      break;

    case 'entry':
      // Atom entries are handled as items.
      $element = 'item';
      $item += 1;
      break;
  }

  $tag = $name;
}
/**
 * Callback function used by the XML parser when an element closes.
 *
 * Clears the $element global when one of the listed container elements ends.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   The element name. It is lowercased before being compared, matching what
 *   aggregator_element_start() does.
 */
function aggregator_element_end($parser, $name) {
  global $element;

  // Without this the lowercase cases below cannot match names that the
  // parser delivers in upper case.
  $name = strtolower($name);

  // NOTE(review): aggregator_element_start() also sets $element for
  // 'summary', 'tagline', 'subtitle' and 'logo', which are not reset here.
  switch ($name) {
    case 'image':
    case 'textinput':
    case 'item':
    case 'entry':
    case 'content':
    case 'info':
      $element = '';
      break;

    case 'id':
      // An id only owns the context when it was opened outside an item.
      if ($element == 'id') {
        $element = '';
      }
      break;
  }
}
/**
 * Callback function used by the XML parser for character data.
 *
 * Appends the data to the item, image or channel array, depending on the
 * context stored in the $element global. Data may arrive in several chunks
 * for one element, so values are always appended, never overwritten.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $data
 *   The chunk of character data.
 */
function aggregator_element_data($parser, $data) {
  global $channel, $element, $items, $item, $image, $tag;

  // Make sure an entry for the current item index exists.
  $items += array($item => array());

  switch ($element) {
    case 'item':
      $items[$item] += array($tag => '');
      $items[$item][$tag] .= $data;
      break;

    case 'image':
    case 'logo':
      $image += array($tag => '');
      $image[$tag] .= $data;
      break;

    case 'link':
      // NOTE(review): aggregator_element_start() never sets $element to
      // 'link', so this branch looks unreachable from that handler.
      if ($data) {
        $items[$item] += array($tag => '');
        $items[$item][$tag] .= $data;
      }
      break;

    case 'content':
      $items[$item] += array('content' => '');
      $items[$item]['content'] .= $data;
      break;

    case 'summary':
      $items[$item] += array('summary' => '');
      $items[$item]['summary'] .= $data;
      break;

    case 'tagline':
    case 'subtitle':
      // Both are stored as the channel description.
      $channel += array('description' => '');
      $channel['description'] .= $data;
      break;

    case 'info':
    case 'id':
    case 'textinput':
      // The sub-element is not supported. However, we must recognize
      // it or its contents will end up in the item array.
      break;

    default:
      // Anything else is treated as channel-level data.
      $channel += array($tag => '');
      $channel[$tag] .= $data;
  }
}
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Seconds and the timezone designator are optional. A missing designator and
 * "Z" are both treated as GMT.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function aggregator_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":SS",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
    return FALSE;
  }

  // Trailing optional groups that did not take part in the match are absent
  // from $match, so every one of them is checked before use. The seconds are
  // in group 7; group 6 still carries the leading colon.
  $seconds = isset($match[7]) ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // An explicit numeric offset is the only case that needs an adjustment.
  $tz_mod = isset($match[8]) ? $match[8] : '';
  if ($tz_mod !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($tz_mod == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}