summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Mitchell Tannenbaum, 2011-02-15 17:32:42 (GMT)
committer: Mitchell Tannenbaum, 2011-02-15 17:32:42 (GMT)
commit: d64ba7b6f27f433b58a6c26052d2b41cd076edf8 (patch)
tree: bebddb1d0a1672c383a146e8a27704208b6c5011
Initial commit of example_web_scraper. This example demonstrates how to build a Drupal-native web scraper. (refs: HEAD, 7.x-1.0-alpha1, master)
-rw-r--r-- example_web_scraper.features.field.inc 290
-rw-r--r-- example_web_scraper.features.inc 39
-rw-r--r-- example_web_scraper.feeds_importer_default.inc 176
-rw-r--r-- example_web_scraper.feeds_tamper_default.inc 24
-rw-r--r-- example_web_scraper.info 21
-rw-r--r-- example_web_scraper.module 3
6 files changed, 553 insertions, 0 deletions
diff --git a/example_web_scraper.features.field.inc b/example_web_scraper.features.field.inc
new file mode 100644
index 0000000..6951741
--- /dev/null
+++ b/example_web_scraper.features.field.inc
@@ -0,0 +1,290 @@
+<?php
+
+/**
+ * Implements hook_field_default_fields().
+ *
+ * Returns the field definitions exported with this feature. Each entry is
+ * keyed by '<entity type>-<bundle>-<field name>' and has two parts:
+ * 'field_config' (the field itself) and 'field_instance' (its attachment to
+ * one bundle, including display and widget settings).
+ *
+ * Four entries are defined: 'body', 'field_when' and 'field_where' on the
+ * 'event' bundle, and 'body' on the 'stanford_schedule_page' bundle.
+ *
+ * @return array
+ *   The exported field definitions, keyed as described above.
+ */
+function example_web_scraper_field_default_fields() {
+ $fields = array();
+
+ // Exported field: 'node-event-body'
+ $fields['node-event-body'] = array(
+ // 'field_config' describes the 'body' field; 'field_instance' further down
+ // describes its use on the 'event' bundle.
+ 'field_config' => array(
+ 'active' => '1',
+ 'cardinality' => '1',
+ 'deleted' => '0',
+ 'entity_types' => array(
+ '0' => 'node',
+ ),
+ 'field_name' => 'body',
+ 'foreign keys' => array(
+ 'format' => array(
+ 'columns' => array(
+ 'format' => 'format',
+ ),
+ 'table' => 'filter_format',
+ ),
+ ),
+ 'indexes' => array(
+ 'format' => array(
+ '0' => 'format',
+ ),
+ ),
+ 'module' => 'text',
+ 'settings' => array(),
+ 'translatable' => '1',
+ 'type' => 'text_with_summary',
+ ),
+ 'field_instance' => array(
+ 'bundle' => 'event',
+ 'default_value' => NULL,
+ 'deleted' => '0',
+ 'description' => '',
+ // One display configuration per view mode ('default' and 'teaser').
+ 'display' => array(
+ 'default' => array(
+ 'label' => 'hidden',
+ 'module' => 'text',
+ 'settings' => array(),
+ 'type' => 'text_default',
+ 'weight' => 0,
+ ),
+ 'teaser' => array(
+ 'label' => 'hidden',
+ 'module' => 'text',
+ 'settings' => array(
+ 'trim_length' => 600,
+ ),
+ 'type' => 'text_summary_or_trimmed',
+ 'weight' => 0,
+ ),
+ ),
+ 'entity_type' => 'node',
+ 'field_name' => 'body',
+ 'label' => 'Body',
+ 'required' => FALSE,
+ 'settings' => array(
+ 'display_summary' => TRUE,
+ 'text_processing' => 1,
+ 'user_register_form' => FALSE,
+ ),
+ 'widget' => array(
+ 'module' => 'text',
+ 'settings' => array(
+ 'rows' => 20,
+ 'summary_rows' => 5,
+ ),
+ 'type' => 'text_textarea_with_summary',
+ // NOTE(review): the weight is the string '-4' here but the integer -4 in
+ // the 'node-stanford_schedule_page-body' entry below; the export is
+ // inconsistent about the type.
+ 'weight' => '-4',
+ ),
+ 'widget_type' => 'text_textarea_with_summary',
+ ),
+ );
+
+ // Exported field: 'node-event-field_when'
+ $fields['node-event-field_when'] = array(
+ 'field_config' => array(
+ 'active' => '1',
+ 'cardinality' => '1',
+ 'deleted' => '0',
+ 'entity_types' => array(),
+ 'field_name' => 'field_when',
+ 'foreign keys' => array(
+ 'format' => array(
+ 'columns' => array(
+ 'format' => 'format',
+ ),
+ 'table' => 'filter_format',
+ ),
+ ),
+ 'indexes' => array(
+ 'format' => array(
+ '0' => 'format',
+ ),
+ ),
+ 'module' => 'text',
+ 'settings' => array(
+ 'max_length' => 255,
+ ),
+ 'translatable' => '1',
+ 'type' => 'text',
+ ),
+ 'field_instance' => array(
+ 'bundle' => 'event',
+ 'default_value' => NULL,
+ 'deleted' => '0',
+ 'description' => '',
+ 'display' => array(
+ 'default' => array(
+ 'label' => 'above',
+ 'module' => 'text',
+ 'settings' => array(),
+ 'type' => 'text_default',
+ 'weight' => 1,
+ ),
+ // The teaser display type is 'hidden' (presumably the field is left out
+ // of teasers; confirm against the Field UI).
+ 'teaser' => array(
+ 'label' => 'above',
+ 'settings' => array(),
+ 'type' => 'hidden',
+ 'weight' => 0,
+ ),
+ ),
+ 'entity_type' => 'node',
+ 'field_name' => 'field_when',
+ 'label' => 'When',
+ 'required' => FALSE,
+ 'settings' => array(
+ 'text_processing' => 0,
+ 'user_register_form' => FALSE,
+ ),
+ 'widget' => array(
+ 'module' => 'text',
+ 'settings' => array(
+ 'size' => 60,
+ ),
+ 'type' => 'text_textfield',
+ 'weight' => '-3',
+ ),
+ ),
+ );
+
+ // Exported field: 'node-event-field_where'
+ // Same shape as 'node-event-field_when'; only the field name, label, default
+ // display weight and widget weight differ.
+ $fields['node-event-field_where'] = array(
+ 'field_config' => array(
+ 'active' => '1',
+ 'cardinality' => '1',
+ 'deleted' => '0',
+ 'entity_types' => array(),
+ 'field_name' => 'field_where',
+ 'foreign keys' => array(
+ 'format' => array(
+ 'columns' => array(
+ 'format' => 'format',
+ ),
+ 'table' => 'filter_format',
+ ),
+ ),
+ 'indexes' => array(
+ 'format' => array(
+ '0' => 'format',
+ ),
+ ),
+ 'module' => 'text',
+ 'settings' => array(
+ 'max_length' => 255,
+ ),
+ 'translatable' => '1',
+ 'type' => 'text',
+ ),
+ 'field_instance' => array(
+ 'bundle' => 'event',
+ 'default_value' => NULL,
+ 'deleted' => '0',
+ 'description' => '',
+ 'display' => array(
+ 'default' => array(
+ 'label' => 'above',
+ 'module' => 'text',
+ 'settings' => array(),
+ 'type' => 'text_default',
+ 'weight' => 2,
+ ),
+ 'teaser' => array(
+ 'label' => 'above',
+ 'settings' => array(),
+ 'type' => 'hidden',
+ 'weight' => 0,
+ ),
+ ),
+ 'entity_type' => 'node',
+ 'field_name' => 'field_where',
+ 'label' => 'Where',
+ 'required' => FALSE,
+ 'settings' => array(
+ 'text_processing' => 0,
+ 'user_register_form' => FALSE,
+ ),
+ 'widget' => array(
+ 'module' => 'text',
+ 'settings' => array(
+ 'size' => 60,
+ ),
+ 'type' => 'text_textfield',
+ 'weight' => '-2',
+ ),
+ ),
+ );
+
+ // Exported field: 'node-stanford_schedule_page-body'
+ // The 'field_config' part repeats the 'body' definition from
+ // 'node-event-body'; only the instance is bound to a different bundle.
+ $fields['node-stanford_schedule_page-body'] = array(
+ 'field_config' => array(
+ 'active' => '1',
+ 'cardinality' => '1',
+ 'deleted' => '0',
+ 'entity_types' => array(
+ '0' => 'node',
+ ),
+ 'field_name' => 'body',
+ 'foreign keys' => array(
+ 'format' => array(
+ 'columns' => array(
+ 'format' => 'format',
+ ),
+ 'table' => 'filter_format',
+ ),
+ ),
+ 'indexes' => array(
+ 'format' => array(
+ '0' => 'format',
+ ),
+ ),
+ 'module' => 'text',
+ 'settings' => array(),
+ 'translatable' => '1',
+ 'type' => 'text_with_summary',
+ ),
+ 'field_instance' => array(
+ 'bundle' => 'stanford_schedule_page',
+ 'default_value' => NULL,
+ 'deleted' => '0',
+ 'description' => '',
+ 'display' => array(
+ 'default' => array(
+ 'label' => 'hidden',
+ 'module' => 'text',
+ 'settings' => array(),
+ 'type' => 'text_default',
+ 'weight' => 0,
+ ),
+ 'teaser' => array(
+ 'label' => 'hidden',
+ 'module' => 'text',
+ 'settings' => array(
+ 'trim_length' => 600,
+ ),
+ 'type' => 'text_summary_or_trimmed',
+ 'weight' => 0,
+ ),
+ ),
+ 'entity_type' => 'node',
+ 'field_name' => 'body',
+ 'label' => 'Body',
+ 'required' => FALSE,
+ 'settings' => array(
+ 'display_summary' => TRUE,
+ 'text_processing' => 1,
+ 'user_register_form' => FALSE,
+ ),
+ 'widget' => array(
+ 'module' => 'text',
+ 'settings' => array(
+ 'rows' => 20,
+ 'summary_rows' => 5,
+ ),
+ 'type' => 'text_textarea_with_summary',
+ 'weight' => -4,
+ ),
+ 'widget_type' => 'text_textarea_with_summary',
+ ),
+ );
+
+ return $fields;
+}
diff --git a/example_web_scraper.features.inc b/example_web_scraper.features.inc
new file mode 100644
index 0000000..994d563
--- /dev/null
+++ b/example_web_scraper.features.inc
@@ -0,0 +1,39 @@
+<?php
+
+/**
+ * Implements hook_ctools_plugin_api().
+ *
+ * Reports which version of each exportable API this module implements.
+ *
+ * @param string|null $module
+ *   Name of the module that owns the API being queried.
+ * @param string|null $api
+ *   Name of the API being queried.
+ *
+ * @return array|null
+ *   array('version' => 1) for feeds / feeds_importer_default,
+ *   array('version' => 2) for feeds_tamper / feeds_tamper_default, and NULL
+ *   (no return value) for every other combination.
+ */
+function example_web_scraper_ctools_plugin_api($module = NULL, $api = NULL) {
+  // Declared parameters with NULL defaults avoid the undefined-offset notices
+  // that unpacking func_get_args() raises when fewer than two arguments are
+  // passed. Strict comparison keeps non-string arguments such as 0 from
+  // loosely matching the API names on older PHP versions.
+  if ($module === 'feeds' && $api === 'feeds_importer_default') {
+    return array('version' => 1);
+  }
+  if ($module === 'feeds_tamper' && $api === 'feeds_tamper_default') {
+    return array('version' => 2);
+  }
+}
+
+/**
+ * Implements hook_node_info().
+ *
+ * Declares the two content types used by this example: 'event' and
+ * 'stanford_schedule_page'. Both use the 'node_content' base, have a title
+ * labelled "Title", and carry empty description and help strings.
+ *
+ * @return array
+ *   Node type definitions keyed by machine name.
+ */
+function example_web_scraper_node_info() {
+  $types = array();
+
+  // Content type that holds a single event.
+  $types['event'] = array(
+    'name' => t('Event'),
+    'base' => 'node_content',
+    'description' => '',
+    'has_title' => '1',
+    'title_label' => t('Title'),
+    'help' => '',
+  );
+
+  // Content type for a schedule page.
+  $types['stanford_schedule_page'] = array(
+    'name' => t('Stanford schedule page'),
+    'base' => 'node_content',
+    'description' => '',
+    'has_title' => '1',
+    'title_label' => t('Title'),
+    'help' => '',
+  );
+
+  return $types;
+}
diff --git a/example_web_scraper.feeds_importer_default.inc b/example_web_scraper.feeds_importer_default.inc
new file mode 100644
index 0000000..ac3628d
--- /dev/null
+++ b/example_web_scraper.feeds_importer_default.inc
@@ -0,0 +1,176 @@
+<?php
+
+/**
+ * Implements hook_feeds_importer_default().
+ *
+ * Defines two default Feeds importers:
+ * - 'stanford_event' (top-level 'content_type' is 'event'): fetches with
+ *   FeedsHTTPFetcher, parses with FeedsXPathParserHTML inside the
+ *   //div[@id='record'] context, and maps the four XPath results onto title,
+ *   field_when, field_where and body through FeedsSelfNodeProcessor.
+ * - 'stanford_schedule_page' (top-level 'content_type' is
+ *   'stanford_schedule_page'): fetches with FeedsCrawler, parses each
+ *   //div[@class='eventItemText'] context, and maps the link text and href
+ *   onto the title and feeds_source of 'event' nodes through
+ *   FeedsNodeProcessor.
+ *
+ * @return array
+ *   The importer objects, keyed by importer id.
+ */
+function example_web_scraper_feeds_importer_default() {
+ $export = array();
+
+ // Importer 1: 'stanford_event'.
+ $feeds_importer = new stdClass;
+ $feeds_importer->disabled = FALSE; /* Edit this to true to make a default feeds_importer disabled initially */
+ $feeds_importer->api_version = 1;
+ $feeds_importer->id = 'stanford_event';
+ $feeds_importer->config = array(
+ 'name' => 'stanford event',
+ 'description' => '',
+ 'fetcher' => array(
+ 'plugin_key' => 'FeedsHTTPFetcher',
+ 'config' => array(
+ 'auto_detect_feeds' => FALSE,
+ 'use_pubsubhubbub' => FALSE,
+ 'designated_hub' => '',
+ ),
+ ),
+ 'parser' => array(
+ 'plugin_key' => 'FeedsXPathParserHTML',
+ 'config' => array(
+ // XPath expressions for each source; the 'context' query is defined
+ // further down in this config.
+ 'sources' => array(
+ 'xpathparser:0' => 'h1',
+ 'xpathparser:1' => 'dl/dd[1]',
+ 'xpathparser:2' => 'dl/dd[2]',
+ 'xpathparser:3' => 'p',
+ ),
+ 'rawXML' => array(
+ 'xpathparser:0' => 0,
+ 'xpathparser:1' => 0,
+ 'xpathparser:2' => 0,
+ 'xpathparser:3' => 0,
+ ),
+ 'context' => '//div[@id=\'record\']',
+ 'exp' => array(
+ 'errors' => 0,
+ 'debug' => array(
+ 'context' => 0,
+ 'xpathparser:0' => 0,
+ 'xpathparser:1' => 0,
+ 'xpathparser:2' => 0,
+ 'xpathparser:3' => 0,
+ ),
+ 'tidy_encoding' => 'UTF8',
+ ),
+ ),
+ ),
+ 'processor' => array(
+ 'plugin_key' => 'FeedsSelfNodeProcessor',
+ 'config' => array(
+ 'content_type' => 'event',
+ 'expire' => -1,
+ 'author' => 0,
+ // Source-to-target mapping; none of the targets is marked unique.
+ 'mappings' => array(
+ 0 => array(
+ 'source' => 'xpathparser:0',
+ 'target' => 'title',
+ 'unique' => FALSE,
+ ),
+ 1 => array(
+ 'source' => 'xpathparser:1',
+ 'target' => 'field_when',
+ 'unique' => FALSE,
+ ),
+ 2 => array(
+ 'source' => 'xpathparser:2',
+ 'target' => 'field_where',
+ 'unique' => FALSE,
+ ),
+ 3 => array(
+ 'source' => 'xpathparser:3',
+ 'target' => 'body',
+ 'unique' => FALSE,
+ ),
+ ),
+ 'update_existing' => 2,
+ 'input_format' => NULL,
+ ),
+ ),
+ 'content_type' => 'event',
+ 'update' => 0,
+ 'import_period' => '0',
+ 'expire_period' => 3600,
+ 'import_on_create' => 1,
+ 'process_in_background' => 1,
+ );
+ $export['stanford_event'] = $feeds_importer;
+
+ // Importer 2: 'stanford_schedule_page'.
+ $feeds_importer = new stdClass;
+ $feeds_importer->disabled = FALSE; /* Edit this to true to make a default feeds_importer disabled initially */
+ $feeds_importer->api_version = 1;
+ $feeds_importer->id = 'stanford_schedule_page';
+ $feeds_importer->config = array(
+ 'name' => 'stanford schedule page',
+ 'description' => '',
+ 'fetcher' => array(
+ 'plugin_key' => 'FeedsCrawler',
+ 'config' => array(
+ 'auto_detect_feeds' => 0,
+ 'use_pubsubhubbub' => 0,
+ 'designated_hub' => '',
+ 'crawler' => array(
+ 'num_pages' => '5',
+ 'delay' => '1',
+ 'first_run' => '0',
+ 'auto' => '0',
+ 'xpath' => '',
+ 'url' => array(
+ // '$index' sits inside single quotes, so PHP keeps it as literal text.
+ // Presumably Feeds Crawler substitutes the page number, starting at
+ // 'initial' and stepping by 'increment' -- verify against the plugin.
+ 'url_pattern' => 'http://events.stanford.edu/2011/January/$index/',
+ 'initial' => '2',
+ 'increment' => '1',
+ ),
+ ),
+ 'crawled' => FALSE,
+ ),
+ ),
+ 'parser' => array(
+ 'plugin_key' => 'FeedsXPathParserHTML',
+ 'config' => array(
+ 'sources' => array(
+ 'xpathparser:1' => 'a',
+ 'xpathparser:2' => 'a/@href',
+ ),
+ 'rawXML' => array(
+ 'xpathparser:1' => 0,
+ 'xpathparser:2' => 0,
+ ),
+ 'context' => '//div[@class=\'eventItemText\']',
+ // NOTE(review): unlike 'stanford_event', 'errors' is 1 and every debug
+ // entry is set to a non-zero value here, so error/debug output looks
+ // switched on in this export -- confirm that is intended.
+ 'exp' => array(
+ 'errors' => 1,
+ 'debug' => array(
+ 'context' => 'context',
+ 'xpathparser:1' => 'xpathparser:1',
+ 'xpathparser:2' => 'xpathparser:2',
+ ),
+ ),
+ ),
+ ),
+ 'processor' => array(
+ 'plugin_key' => 'FeedsNodeProcessor',
+ 'config' => array(
+ 'content_type' => 'event',
+ 'expire' => '-1',
+ 'author' => 0,
+ 'mappings' => array(
+ 0 => array(
+ 'source' => 'xpathparser:1',
+ 'target' => 'title',
+ 'unique' => FALSE,
+ ),
+ // The href is mapped to 'feeds_source' and is the only unique target.
+ 1 => array(
+ 'source' => 'xpathparser:2',
+ 'target' => 'feeds_source',
+ 'unique' => 1,
+ ),
+ ),
+ 'update_existing' => '0',
+ 'input_format' => 'plain_text',
+ ),
+ ),
+ 'content_type' => 'stanford_schedule_page',
+ 'update' => 0,
+ 'import_period' => '-1',
+ 'expire_period' => 3600,
+ 'import_on_create' => 1,
+ 'process_in_background' => 0,
+ );
+ $export['stanford_schedule_page'] = $feeds_importer;
+
+ return $export;
+}
diff --git a/example_web_scraper.feeds_tamper_default.inc b/example_web_scraper.feeds_tamper_default.inc
new file mode 100644
index 0000000..d3960b7
--- /dev/null
+++ b/example_web_scraper.feeds_tamper_default.inc
@@ -0,0 +1,24 @@
+<?php
+
+/**
+ * Implements hook_feeds_tamper_default().
+ *
+ * Exports a single Feeds Tamper plugin instance, 'Add domain'. It applies the
+ * 'rewrite' plugin to the 'xpathparser:2' source of the
+ * 'stanford_schedule_page' importer, using the text template
+ * 'http://events.stanford.edu/[xpathparser:2]'.
+ *
+ * @return array
+ *   The tamper object, keyed by its id.
+ */
+function example_web_scraper_feeds_tamper_default() {
+  // Casting the array to an object yields a stdClass whose properties appear
+  // in the order listed here.
+  $add_domain = (object) array(
+    // Edit this to true to make a default feeds_tamper disabled initially.
+    'disabled' => FALSE,
+    'api_version' => 2,
+    'id' => 'Add domain',
+    'importer' => 'stanford_schedule_page',
+    'source' => 'xpathparser:2',
+    'plugin_id' => 'rewrite',
+    'settings' => array(
+      'text' => 'http://events.stanford.edu/[xpathparser:2]',
+    ),
+    'weight' => 0,
+    'description' => 'add domain',
+  );
+
+  return array('Add domain' => $add_domain);
+}
diff --git a/example_web_scraper.info b/example_web_scraper.info
new file mode 100644
index 0000000..730ae49
--- /dev/null
+++ b/example_web_scraper.info
@@ -0,0 +1,21 @@
+core = "7.x"
+dependencies[] = "features"
+dependencies[] = "feeds"
+dependencies[] = "feeds_crawler"
+dependencies[] = "feeds_selfnode_processor"
+dependencies[] = "feeds_tamper"
+dependencies[] = "feeds_xpathparser"
+description = "An example of a Drupal-native web scraper."
+features[ctools][] = "feeds:feeds_importer_default:1"
+features[ctools][] = "feeds_tamper:feeds_tamper_default:2"
+features[feeds_importer][] = "stanford_event"
+features[feeds_importer][] = "stanford_schedule_page"
+features[feeds_tamper][] = "Add domain"
+features[field][] = "node-event-body"
+features[field][] = "node-event-field_when"
+features[field][] = "node-event-field_where"
+features[field][] = "node-stanford_schedule_page-body"
+features[node][] = "event"
+features[node][] = "stanford_schedule_page"
+name = "Example Web Scraper"
+package = "Features"
diff --git a/example_web_scraper.module b/example_web_scraper.module
new file mode 100644
index 0000000..8d6369a
--- /dev/null
+++ b/example_web_scraper.module
@@ -0,0 +1,3 @@
+<?php
+
+/**
+ * @file
+ * Module entry point for Example Web Scraper.
+ *
+ * Pulls in example_web_scraper.features.inc when the module file is loaded.
+ */
+
+include_once 'example_web_scraper.features.inc';