Skip to content
Snippets Groups Projects
Commit ac06a5ce authored by Chris Leppanen's avatar Chris Leppanen
Browse files

Added: Support for QueryPath. Broke out the parsers into separate include files.

parent 48d4d5a6
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,8 @@
define('FEEDS_XPATH_PARSER_HTML', 0);
define('FEEDS_XPATH_PARSER_XML', 1);
define('FEEDS_XPATH_PARSER_REGEX', 2);
define('FEEDS_XPATH_PARSER_QUERYPATH', 3);
define('FEEDS_XPATH_PREG', '/(\/{0,2})(?<!@)(?<!:)(?<!\s)(?<!-)(?<!\')(?<!=)(?<!\[)(?<!")\b(\w+)\b(?!:)(?!\()/i');
/**
* @file
......@@ -13,7 +15,7 @@ define('FEEDS_XPATH_PARSER_REGEX', 2);
*/
/**
* Takes a document and runs user provided XPath queries against it.
* Takes a document and runs user provided XPath or REGEX queries against it.
*/
class FeedsXPathParser extends FeedsParser {
......@@ -21,95 +23,48 @@ class FeedsXPathParser extends FeedsParser {
* Implementation of FeedsParser::parse().
*/
public function parse(FeedsImportBatch $batch, FeedsSource $source) {
$raw = $batch->getRaw();
if (!function_exists('simplexml_import_dom')) {
throw new Exception(t('Feeds XML Parser requires SimpleXML PHP extension.'));
}
$this->config = $source->getConfigFor($this);
$this->rawXML = array_keys(array_filter($this->config['rawXML']));
$mappings = feeds_importer($this->id)->processor->config['mappings'];
$source_config = $source->getConfigFor($this);
if ($this->config['parser_type'] == FEEDS_XPATH_PARSER_XML) {
$output = $this->parseXML($raw);
}
elseif ($this->config['parser_type'] == FEEDS_XPATH_PARSER_HTML) {
$output = $this->parseHTML($raw);
}
elseif ($this->config['parser_type'] == FEEDS_XPATH_PARSER_REGEX) {
$output = $this->parseREGEX($raw);
if ($source_config['parser_type'] == FEEDS_XPATH_PARSER_XML) {
require_once 'FeedsXPathParserHTML.inc';
$this->parser = new FeedsXPathParserXML;
$output = $this->parser->parse($batch->getRaw(), $source_config, $mappings);
}
if ($output) {
$batch->setItems($output);
elseif ($source_config['parser_type'] == FEEDS_XPATH_PARSER_HTML) {
require_once 'FeedsXPathParserHTML.inc';
$this->parser = new FeedsXPathParserHTML;
$output = $this->parser->parse($batch->getRaw(), $source_config, $mappings);
}
}
private function parseXML($raw) {
$dom = new DOMDocument();
$success = @$dom->loadXML($raw);
return $this->query($success, $dom);
}
private function parseHTML($raw) {
$dom = new DOMDocument();
$success = @$dom->loadHTML($raw);
return $this->query($success, $dom);
}
private function query($success, $dom) {
if (!$success) {
drupal_set_message(t('There was an error parsing the document.
Please make sure you have selected the correct parser.'), 'error');
return;
elseif ($source_config['parser_type'] == FEEDS_XPATH_PARSER_REGEX) {
$output = $this->parseREGEX($batch->getRaw());
}
$xml = simplexml_import_dom($dom);
unset($dom);
$mappings = feeds_importer($this->id)->processor->config['mappings'];
list($sources, $queries) = $this->getSourcesQueries($mappings);
$results = array();
foreach ($queries as $key => $query) {
$result = @$xml->xpath($query);
if (!$result) {
drupal_set_message(t('There was an error with one of your XPath queries.
Make sure the syntax is valid.'), 'error');
return;
}
$results[$sources[$key]] = $result;
elseif ($source_config['parser_type'] == FEEDS_XPATH_PARSER_QUERYPATH && module_exists('querypath')) {
require_once 'FeedsXPathParserQueryPath.inc';
$this->parser = new FeedsXPathParserQueryPath;
$output = $this->parser->parse($batch->getRaw(), $source_config, $mappings);
}
unset($xml);
$output = array();
foreach ($results as $source => $items) {
if (!isset($count) || $count == count($items)) {
foreach ($items as $key => $item) {
if (in_array($source, $this->rawXML)) {
$item = $item->asXML();
}
else {
$item = (string) $item;
}
$output[$key] = array_merge((array)$output[$key], array($source => $item));
}
$count = count($items);
}
else {
drupal_set_message(t('Mismatching results.
Queries must produce the same number of items.'), 'error');
return;
}
if ($output) {
$batch->setItems($output);
}
return $output;
}
private function parseREGEX($raw) {
$mappings = feeds_importer($this->id)->processor->config['mappings'];
list($sources, $queries) = $this->getSourcesQueries($mappings);
$this->setSourcesQueries($mappings);
$results = array();
foreach ($queries as $key => $query) {
foreach ($this->queries as $key => $query) {
$success = @preg_match_all($query, $raw, $matches);
if ($success !== FALSE) {
......@@ -125,17 +80,15 @@ class FeedsXPathParser extends FeedsParser {
foreach ($merged as $k => $values) {
$merged[$k] = implode(' ', $values);
}
$results[$sources[$key]] = $merged;
$results[$this->sources[$key]] = $merged;
}
else {
$results[$sources[$key]] = $matches[0];
$results[$this->sources[$key]] = $matches[0];
}
}
else {
drupal_set_message(t('There was an error with one of your regex queries.
Make sure the syntax is valid.'), 'error');
return;
drupal_set_message(t('There was an error with the regex: %query', array('%query' => $query)));
}
}
......@@ -156,16 +109,8 @@ class FeedsXPathParser extends FeedsParser {
return $output;
}
private function getSourcesQueries($mappings) {
foreach ($mappings as $mapping) {
$source = $mapping['source'];
if (trim($this->config[$source])) {
$sources[] = $source;
$queries[] = $this->config[$source];
}
}
return array($sources, $queries);
function getSourceElement($item, $element_key) {
return $this->parser->getSourceElement($item, $element_key);
}
/**
......@@ -192,22 +137,18 @@ class FeedsXPathParser extends FeedsParser {
}
}
$items = array(
format_plural(count($uniques),
t('Field <strong>!column</strong> is mandatory and considered unique: only one item per !column value will be created.',
array('!column' => implode(', ', $uniques))),
t('Fields <strong>!columns</strong> are mandatory and values in these columns are considered unique: only one entry per value in one of these columns will be created.',
array('!columns' => implode(', ', $uniques)))),
);
ctools_include('dependent');
$form['help']['#value'] = '<div class="help">' . theme('item_list', $items) . '</div>';
$supported = array('HTML', 'XML', 'REGEX');
ctools_include('dependent');
if (module_exists('querypath')) {
$supported[] = 'QueryPath';
}
$form['parser_type'] = array(
'#title' => t('Select the parsing engine to use'),
'#type' => 'radios',
'#options' => array('HTML', 'XML', 'REGEX'),
'#options' => $supported,
'#default_value' => isset($source_config['parser_type']) ?
$source_config['parser_type'] : FEEDS_XPATH_PARSER_HTML,
);
......@@ -235,18 +176,59 @@ class FeedsXPathParser extends FeedsParser {
theme('item_list', $regex_help) .
'</div>';
$form['context'] = array(
'#type' => 'textfield',
'#title' => t('Context'),
'#required' => TRUE,
'#description' => t('This is the base query, all other queries will run in this context.'),
'#default_value' => isset($source_config['context']) ? $source_config['context'] : '',
);
$form['sources'] = array(
'#type' => 'fieldset',
);
$items = array(
format_plural(count($uniques),
t('Field <strong>!column</strong> is mandatory and considered unique: only one item per !column value will be created.',
array('!column' => implode(', ', $uniques))),
t('Fields <strong>!columns</strong> are mandatory and values in these columns are considered unique: only one entry per value in one of these columns will be created.',
array('!columns' => implode(', ', $uniques)))),
);
$form['sources']['help']['#value'] = '<div class="help">' . theme('item_list', $items) . '</div>';
$form['attrs'] = array(
'#type' => 'fieldset',
'#input' => TRUE,
'#prefix' => '<div id="edit-feeds-FeedsXPathParser-attrs-wrapper"><div id="edit-feeds-FeedsXPathParser-attrs">',
'#suffix' => '</div></div>',
'#process' => array('ctools_dependent_process'),
'#dependency' => array(
'radio:feeds[FeedsXPathParser][parser_type]' => array(
FEEDS_XPATH_PARSER_QUERYPATH,
)
),
);
foreach ($sources as $source) {
$form[$source] = array(
$form['sources'][$source] = array(
'#type' => 'textfield',
'#title' => $source,
'#description' => t('The query string to run.'),
'#default_value' => isset($source_config[$source]) ? $source_config[$source] : '',
'#default_value' => isset($source_config['sources'][$source]) ? $source_config['sources'][$source] : '',
);
$form['attrs'][$source] = array(
'#type' => 'textfield',
'#title' => $source . ' Attribute',
'#description' => t('The attribute to return.'),
'#default_value' => isset($source_config['attrs'][$source]) ? $source_config['attrs'][$source] : '',
);
}
$form['rawXML'] = array(
'#type' => 'checkboxes',
'#title' => t('Select the queries you would like to return raw XML/HTML'),
'#title' => t('Select the queries you would like to return raw XML or HTML'),
'#options' => array_combine($sources, $sources),
'#process' => array('ctools_dependent_process', 'expand_checkboxes'),
'#prefix' => '<div id="edit-feeds-FeedsXPathParser-rawXML-wrapper"><div id="edit-feeds-FeedsXPathParser-rawXML">',
......@@ -256,6 +238,7 @@ class FeedsXPathParser extends FeedsParser {
'radio:feeds[FeedsXPathParser][parser_type]' => array(
FEEDS_XPATH_PARSER_HTML,
FEEDS_XPATH_PARSER_XML,
FEEDS_XPATH_PARSER_QUERYPATH,
)
),
);
......
<?php
/**
 * Runs XPath queries against an HTML document.
 *
 * parse() evaluates the configured context query and returns every match
 * serialized as an XML string. getSourceElement() later re-parses one of those
 * strings and evaluates the query configured for a single mapping source
 * against it.
 */
class FeedsXPathParserHTML {

  /**
   * Parser configuration as handed to parse(). The keys read by this class
   * are 'context', 'sources' and 'rawXML'.
   */
  public $config = array();

  /**
   * Holds the context query that is actually executed, i.e. after the default
   * namespace rewrite done by setNamespaces().
   */
  public $sourceConfig = array();

  /**
   * Mapping sources that have a non-empty query, index-aligned with $queries.
   */
  public $sources = array();

  /**
   * Trimmed query strings, index-aligned with $sources.
   */
  public $queries = array();

  /**
   * Namespaces found in the parsed document, keyed by prefix.
   */
  public $namespaces = array();

  /**
   * Parses an HTML string and returns the items matched by the context query.
   *
   * @param $raw
   *   The HTML document as a string.
   * @param $config
   *   The parser configuration array.
   * @param $mappings
   *   An array of mappings, each holding a 'source' key.
   *
   * @return
   *   An array of XML strings, one per node matched by the context query.
   *
   * @throws Exception
   *   When the document cannot be parsed or the context query fails.
   */
  public function parse($raw, $config, $mappings) {
    $this->config = $config;
    $this->setSourcesQueries($mappings);

    // An empty document can never be parsed; report it the same way as any
    // other parse failure instead of handing it to DOMDocument.
    if ((string) $raw === '') {
      throw new Exception(t('There was an error parsing the HTML document.'));
    }

    $dom = new DOMDocument();
    // Real world HTML is rarely well formed, so parser warnings are silenced.
    $success = @$dom->loadHTML($raw);
    if (!$success) {
      throw new Exception(t('There was an error parsing the HTML document.'));
    }

    $xml = simplexml_import_dom($dom);
    unset($dom);
    // Test the type rather than truthiness: an empty SimpleXMLElement casts to
    // FALSE even though the import succeeded.
    if (!($xml instanceof SimpleXMLElement)) {
      throw new Exception(t('There was an error parsing the HTML document.'));
    }

    return $this->query($xml);
  }

  /**
   * Collects the mapping sources that have a query configured.
   *
   * Protected rather than private so that subclasses that override parse()
   * are able to call it.
   *
   * @param $mappings
   *   An array of mappings, each holding a 'source' key.
   */
  protected function setSourcesQueries($mappings) {
    $this->sources = array();
    $this->queries = array();

    foreach ($mappings as $mapping) {
      $source = $mapping['source'];
      $query = isset($this->config['sources'][$source]) ? trim($this->config['sources'][$source]) : '';
      if ($query !== '') {
        $this->sources[] = $source;
        $this->queries[] = $query;
      }
    }
  }

  /**
   * Runs the context query and serializes each matched node.
   *
   * Protected rather than private so that subclasses that override parse()
   * are able to call it.
   *
   * @param $xml
   *   The SimpleXMLElement to query.
   *
   * @return
   *   An array of XML strings, one per matched node.
   *
   * @throws Exception
   *   When the context query cannot be evaluated.
   */
  protected function query($xml) {
    $xml = $this->setNamespaces($xml);

    // Execute the namespace-aware version of the context query; it equals the
    // configured query when the document has no default namespace.
    $context = $this->sourceConfig['context'];
    $results = @$xml->xpath($context);
    if (!is_array($results)) {
      throw new Exception(t('There was an error with the XPath query: %query',
        array('%query' => $context)));
    }
    unset($xml);

    $items = array();
    foreach ($results as $result) {
      $items[] = $result->asXML();
    }
    return $items;
  }

  /**
   * Registers the document namespaces for XPath and prepares the context query.
   *
   * A default (prefix-less) namespace cannot be addressed from XPath, so it is
   * registered under the '__default__' prefix and the context query is
   * rewritten with FEEDS_XPATH_PREG to use that prefix.
   *
   * @param $xml
   *   The SimpleXMLElement that will be queried.
   *
   * @return
   *   The same element, with its namespaces registered.
   */
  protected function setNamespaces($xml) {
    $context = isset($this->config['context']) ? $this->config['context'] : '';
    $this->sourceConfig['context'] = $context;

    $this->namespaces = $xml->getNamespaces(TRUE);
    foreach ($this->namespaces as $prefix => $namespace) {
      if ($prefix === '') {
        $xml->registerXPathNamespace('__default__', $namespace);
        $this->sourceConfig['context'] = preg_replace(FEEDS_XPATH_PREG,
          '$1__default__:$2', $context
        );
      }
      else {
        $xml->registerXPathNamespace($prefix, $namespace);
      }
    }
    return $xml;
  }

  /**
   * Evaluates the query of one mapping source against a parsed item.
   *
   * @param $item
   *   One XML string out of the array returned by parse().
   * @param $element_key
   *   The mapping source whose query should be evaluated.
   *
   * @return
   *   The matches joined by a single space, or NULL when the source has no
   *   query or the query matched nothing. Matches are returned as markup when
   *   the source is enabled in the 'rawXML' configuration, as text otherwise.
   */
  public function getSourceElement($item, $element_key) {
    // array_search() returns FALSE for an unknown source. Used as an array
    // index that would silently select the first query, so bail out instead.
    $index = array_search($element_key, $this->sources, TRUE);
    if ($index === FALSE) {
      return NULL;
    }

    // The "@" only hides parser warnings; a fragment that cannot be parsed at
    // all still raises an exception.
    $xml = @new SimpleXMLElement($item);
    $results = $xml->xpath($this->queries[$index]);
    unset($xml);

    if (!is_array($results) || empty($results)) {
      return NULL;
    }

    $as_markup = !empty($this->config['rawXML'][$element_key]);
    $values = array();
    foreach ($results as $result) {
      $values[] = $as_markup ? $result->asXML() : (string) $result;
    }
    return implode(' ', $values);
  }
}
/**
 * Runs XPath queries against an XML document.
 *
 * Only the way the document is loaded differs from the HTML parser; source
 * collection and querying are inherited from FeedsXPathParserHTML through its
 * setSourcesQueries() and query() methods.
 */
class FeedsXPathParserXML extends FeedsXPathParserHTML {

  /**
   * Parses an XML string and returns the items matched by the context query.
   *
   * @param $raw
   *   The XML document as a string.
   * @param $config
   *   The parser configuration array.
   * @param $mappings
   *   An array of mappings, each holding a 'source' key.
   *
   * @return
   *   Whatever the inherited query() returns for the parsed document.
   *
   * @throws Exception
   *   When the document is empty or is not well formed XML.
   */
  public function parse($raw, $config, $mappings) {
    $this->config = $config;
    $this->setSourcesQueries($mappings);

    if ((string) $raw === '') {
      throw new Exception(t('There was an error parsing the XML document.'));
    }

    // Collect libxml errors internally so that a malformed feed does not print
    // raw PHP warnings; the previous setting is restored on every path.
    $previous = libxml_use_internal_errors(TRUE);
    try {
      $xml = new SimpleXMLElement($raw);
    }
    catch (Exception $e) {
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
      throw new Exception(t('There was an error parsing the XML document.'));
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $this->query($xml);
  }
}
<?php
/**
 * Runs QueryPath selectors against a document.
 *
 * parse() returns the markup of every element matched by the context
 * selector. getSourceElement() later evaluates the selector and optional
 * attribute configured for a single mapping source against one such item.
 */
class FeedsXPathParserQueryPath {

  /**
   * Parser configuration as handed to parse(). The keys read by this class
   * are 'context', 'sources', 'attrs' and 'rawXML'.
   */
  public $config = array();

  /**
   * Mapping sources, index-aligned with $queries and $attrs.
   */
  public $sources = array();

  /**
   * Trimmed selector per source; an empty string means "no selector".
   */
  public $queries = array();

  /**
   * Trimmed attribute name per source; an empty string means "no attribute".
   */
  public $attrs = array();

  /**
   * Parses a document and returns the items matched by the context selector.
   *
   * @param $raw
   *   The document as a string.
   * @param $config
   *   The parser configuration array.
   * @param $mappings
   *   An array of mappings, each holding a 'source' key.
   *
   * @return
   *   An array with the result of html() for every matched element.
   */
  public function parse($raw, $config, $mappings) {
    $this->config = $config;
    $this->setSourcesQueries($mappings);

    $output = array();
    foreach (qp($raw, $config['context']) as $child) {
      $output[] = $child->html();
    }
    return $output;
  }

  /**
   * Evaluates the selector of one mapping source against a parsed item.
   *
   * @param $item
   *   One entry out of the array returned by parse().
   * @param $element_key
   *   The mapping source whose selector should be evaluated.
   *
   * @return
   *   NULL when the source is unknown. Otherwise the configured attribute when
   *   one is set, the result of html() when the source is enabled in the
   *   'rawXML' configuration, and the result of text() in every other case.
   */
  public function getSourceElement($item, $element_key) {
    // array_search() returns FALSE for an unknown source. Used as an array
    // index that would silently select the first source, so bail out instead.
    $key = array_search($element_key, $this->sources, TRUE);
    if ($key === FALSE) {
      return NULL;
    }

    $query = $this->queries[$key];
    $attr = $this->attrs[$key];

    // Without a selector the item itself is the element of interest.
    $qp = ($query !== '') ? qp($item, $query) : qp($item);

    if ($attr !== '') {
      return $qp->attr($attr);
    }
    if (!empty($this->config['rawXML'][$element_key])) {
      return $qp->html();
    }
    return $qp->text();
  }

  /**
   * Collects selector and attribute for every mapping source.
   *
   * Unlike the XPath parsers, sources without a selector are kept, because an
   * empty selector is meaningful here (see getSourceElement()).
   *
   * @param $mappings
   *   An array of mappings, each holding a 'source' key.
   */
  private function setSourcesQueries($mappings) {
    $this->sources = array();
    $this->queries = array();
    $this->attrs = array();

    foreach ($mappings as $mapping) {
      $source = $mapping['source'];
      $this->sources[] = $source;
      // Either setting may be missing from stored configuration; treat a
      // missing value as empty instead of reading an undefined index.
      $this->queries[] = isset($this->config['sources'][$source]) ? trim($this->config['sources'][$source]) : '';
      $this->attrs[] = isset($this->config['attrs'][$source]) ? trim($this->config['attrs'][$source]) : '';
    }
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment