<?php

namespace Drupal\migrate_plus\Plugin\migrate_plus\data_parser;

use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
use Drupal\migrate\MigrateException;
use Drupal\migrate_plus\DataParserPluginBase;

/**
 * Obtain XML data for migration.
 *
 * Streams the source with \XMLReader and only builds a \SimpleXMLElement for
 * elements whose path matches the configured item selector.
 *
 * Brought in from the migrate_source_xml project (issue #2729581).
 *
 * @DataParser(
 *   id = "xml",
 *   title = @Translation("XML")
 * )
 */
class Xml extends DataParserPluginBase implements ContainerFactoryPluginInterface {

  /**
   * The XMLReader we are encapsulating.
   *
   * @var \XMLReader
   */
  protected $reader;

  /**
   * Array of the element names from the query.
   *
   * 0-based from the first (root) element. For example, '/file/article' would
   * be stored as [0 => 'file', 1 => 'article'].
   *
   * @var array
   */
  protected $elementsToMatch = [];

  /**
   * An optional xpath predicate.
   *
   * Restricts the matching elements based on values in their children. Parsed
   * from the element query at construct time.
   *
   * @var string|null
   */
  protected $xpathPredicate = NULL;

  /**
   * Array representing the path to the current element as we traverse the XML.
   *
   * For example, if in an XML string like '<file><article>...</article></file>'
   * we are positioned within the article element, currentPath will be
   * [0 => 'file', 1 => 'article'].
   *
   * @var array
   */
  protected $currentPath = [];

  /**
   * Retains all elements with a given name to support extraction from parents.
   *
   * This is a hack to support field extraction of values in parents
   * of the 'context node' - ie, if $this->fields() has something like '..\nid'.
   * Since we are using a streaming xml processor, it is too late to snoop
   * around parent elements again once we've located an element of interest. So,
   * grab elements with matching names and their depths, and refer back to it
   * when building the source row.
   *
   * Keyed by depth, then by element name; each entry is a list of the values
   * returned by getSimpleXml() for that name at that depth.
   *
   * @var array
   */
  protected $parentXpathCache = [];

  /**
   * List of the element names that should be captured into $parentXpathCache.
   *
   * Filled in the constructor from field selectors starting with '..\'.
   *
   * @var string[]
   */
  protected $parentElementsOfInterest = [];

  /**
   * Element name matching mode.
   *
   * When matching element names, whether to compare to the namespace-prefixed
   * name (TRUE), or the local name (FALSE).
   *
   * @var bool
   */
  protected $prefixedName = FALSE;

  /**
   * {@inheritdoc}
   */
  public function __construct(array $configuration, $plugin_id, $plugin_definition) {
    parent::__construct($configuration, $plugin_id, $plugin_definition);

    $this->reader = new \XMLReader();

    // Suppress errors during parsing, so we can pick them up after.
    libxml_use_internal_errors(TRUE);

    // Parse the element query. First capture group is the element path, second
    // (if present) is the xpath predicate found between square brackets.
    preg_match_all('|^/([^\[]+)\[?(.*?)]?$|', $configuration['item_selector'], $matches);
    $element_path = $matches[1][0];
    $this->elementsToMatch = explode('/', $element_path);
    $predicate = $matches[2][0];
    if ($predicate) {
      $this->xpathPredicate = $predicate;
    }

    // If the element path contains any colons, it must be specifying
    // namespaces, so we need to compare using the prefixed element
    // name in fetchNextRow(). Compare against FALSE explicitly, because
    // strpos() returns 0 (falsy) for a colon at the very start.
    if (strpos($element_path, ':') !== FALSE) {
      $this->prefixedName = TRUE;
    }

    // Remember which element names are referenced through a parent selector,
    // so they can be cached while streaming.
    foreach ($this->fieldSelectors() as $xpath) {
      if (substr($xpath, 0, 3) === '..\\') {
        $this->parentElementsOfInterest[] = str_replace('..\\', '', $xpath);
      }
    }
  }

  /**
   * Builds a \SimpleXmlElement rooted at the iterator's current location.
   *
   * The resulting SimpleXmlElement also contains any child nodes of the current
   * element.
   *
   * @return \SimpleXmlElement|false
   *   A \SimpleXmlElement when the document is parseable, or false if the
   *   node could not be expanded and libxml recorded no error.
   *
   * @throws \Drupal\migrate\MigrateException
   *   When the node could not be expanded and libxml recorded an error; the
   *   message describes the first recorded error.
   */
  protected function getSimpleXml() {
    $node = $this->reader->expand();
    if ($node) {
      // We must associate the DOMNode with a DOMDocument to be able to import
      // it into SimpleXML. Despite appearances, this is almost twice as fast as
      // simplexml_load_string($this->readOuterXML());
      $dom = new \DOMDocument();
      $node = $dom->importNode($node, TRUE);
      $dom->appendChild($node);
      $sxml_elem = simplexml_import_dom($node);
      $this->registerNamespaces($sxml_elem);
      return $sxml_elem;
    }

    $errors = libxml_get_errors();
    // Empty the buffer so the same error is not picked up again by a later,
    // unrelated failure.
    libxml_clear_errors();
    if ($errors) {
      // Cast to string because the exception message must be a plain string.
      throw new MigrateException((string) self::parseLibXmlError(reset($errors)));
    }
    return FALSE;
  }

  /**
   * {@inheritdoc}
   */
  public function rewind() {
    // Reset our path tracker and the ancestors cached during the previous
    // pass, so nothing stale survives into the next traversal.
    $this->currentPath = [];
    $this->parentXpathCache = [];
    parent::rewind();
  }

  /**
   * {@inheritdoc}
   */
  protected function openSourceUrl($url) {
    // (Re)open the provided URL.
    $this->reader->close();
    return $this->reader->open($url, NULL, \LIBXML_NOWARNING);
  }

  /**
   * {@inheritdoc}
   */
  protected function fetchNextRow() {
    $target_element = NULL;

    // Loop over each node in the XML file, looking for elements at a path
    // matching the input query string (represented in $this->elementsToMatch).
    while ($this->reader->read()) {
      if ($this->reader->nodeType == \XMLReader::ELEMENT) {
        $depth = $this->reader->depth;
        // Use the same flavour of name (prefixed or local) for path tracking,
        // for the parent-element check and as the cache key, so that
        // getAncestorElements() can find what was stored here.
        $name = $this->prefixedName ? $this->reader->name : $this->reader->localName;
        $this->currentPath[$depth] = $name;
        if (in_array($name, $this->parentElementsOfInterest, TRUE)) {
          $this->parentXpathCache[$depth][$name][] = $this->getSimpleXml();
        }
        if ($this->currentPath == $this->elementsToMatch) {
          // We're positioned to the right element path - build the SimpleXML
          // object to enable proper xpath predicate evaluation.
          $candidate = $this->getSimpleXml();
          if ($candidate !== FALSE && (empty($this->xpathPredicate) || $this->predicateMatches($candidate))) {
            // Only remember the element once it is accepted; otherwise a
            // rejected candidate would leak out as a row when the document
            // ends.
            $target_element = $candidate;
            break;
          }
        }
      }
      elseif ($this->reader->nodeType == \XMLReader::END_ELEMENT) {
        // Remove this element and any deeper ones from the current path.
        foreach ($this->currentPath as $depth => $name) {
          if ($depth >= $this->reader->depth) {
            unset($this->currentPath[$depth]);
          }
        }
        // Drop cached elements that are deeper than the element being closed.
        foreach ($this->parentXpathCache as $depth => $elements) {
          if ($depth > $this->reader->depth) {
            unset($this->parentXpathCache[$depth]);
          }
        }
      }
    }

    // If we've found the desired element, populate the currentItem with its
    // data. Compare against NULL explicitly: an empty SimpleXMLElement without
    // attributes evaluates to FALSE in a boolean context.
    if ($target_element !== NULL) {
      foreach ($this->fieldSelectors() as $field_name => $xpath) {
        $values = $target_element->xpath($xpath);
        // xpath() does not return an array when the expression cannot be
        // evaluated (as with '..\name' selectors); skip empty or failed
        // results.
        if (!$values) {
          continue;
        }
        // When several nodes match, the last one wins.
        foreach ($values as $value) {
          $this->currentItem[$field_name] = (string) $value;
        }
      }
    }
  }

  /**
   * Tests whether the iterator's xpath predicate matches the provided element.
   *
   * Has some limitations, especially in that it is easy to write predicates
   * that reference things outside this SimpleXmlElement's tree, but "simpler"
   * predicates should work as expected.
   *
   * @param \SimpleXMLElement $elem
   *   The element to test.
   *
   * @return bool
   *   True if the element matches the predicate, false if not.
   */
  protected function predicateMatches(\SimpleXMLElement $elem) {
    return !empty($elem->xpath('/*[' . $this->xpathPredicate . ']'));
  }

  /**
   * Gets cached ancestor elements, if the element name was registered.
   *
   * Looks up the elements cached under the given name at a depth computed
   * relative to the reader's current depth. Only element names referenced by
   * a field selector starting with '..\' are retained for later lookup.
   *
   * @param int $levels_up
   *   The number of levels back towards the root of the DOM tree to ascend
   *   before searching for the named element. A positive value is negated
   *   before use; the depth looked up is the reader's depth plus this
   *   (negative) value plus one.
   * @param string $name
   *   The name of the desired element.
   *
   * @return array|false
   *   The list of entries cached for that depth and name (the values returned
   *   by getSimpleXml() when they were encountered), or false if the depth is
   *   negative or nothing was retained.
   */
  public function getAncestorElements($levels_up, $name) {
    if ($levels_up > 0) {
      $levels_up *= -1;
    }
    $ancestor_depth = $this->reader->depth + $levels_up + 1;
    if ($ancestor_depth < 0) {
      return FALSE;
    }

    if (array_key_exists($ancestor_depth, $this->parentXpathCache) && array_key_exists($name, $this->parentXpathCache[$ancestor_depth])) {
      return $this->parentXpathCache[$ancestor_depth][$name];
    }
    else {
      return FALSE;
    }
  }

  /**
   * Registers the iterator's namespaces to a SimpleXMLElement.
   *
   * Does nothing when the 'namespaces' configuration key is missing or is not
   * an array.
   *
   * @param \SimpleXMLElement $xml
   *   The element to apply namespace registrations to.
   */
  protected function registerNamespaces(\SimpleXMLElement $xml) {
    if (isset($this->configuration['namespaces']) && is_array($this->configuration['namespaces'])) {
      foreach ($this->configuration['namespaces'] as $prefix => $ns) {
        $xml->registerXPathNamespace($prefix, $ns);
      }
    }
  }

  /**
   * Parses a LibXMLError to an error message string.
   *
   * @param \LibXMLError $error
   *   Error thrown by the XML.
   *
   * @return string
   *   The error message, as produced by t(): severity name, error code and
   *   message, followed by the line, column and file of the error.
   */
  public static function parseLibXmlError(\LibXMLError $error) {
    switch ($error->level) {
      case LIBXML_ERR_WARNING:
        $error_code_name = t('Warning');
        break;

      case LIBXML_ERR_ERROR:
        $error_code_name = t('Error');
        break;

      case LIBXML_ERR_FATAL:
        $error_code_name = t('Fatal Error');
        break;

      default:
        // Translated like the other severity names.
        $error_code_name = t('Unknown Error');
    }

    return t(
      "@libxmlerrorcodename @libxmlerrorcode: @libxmlerrormessage\n" .
      "Line: @libxmlerrorline\n" .
      "Column: @libxmlerrorcolumn\n" .
      "File: @libxmlerrorfile",
      [
        '@libxmlerrorcodename' => $error_code_name,
        '@libxmlerrorcode' => $error->code,
        '@libxmlerrormessage' => trim($error->message),
        '@libxmlerrorline' => $error->line,
        '@libxmlerrorcolumn' => $error->column,
        '@libxmlerrorfile' => $error->file ?: NULL,
      ]
    );
  }

}