Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
migrate_plus
Manage
Activity
Members
Labels
Plan
Wiki
Custom issue tracker
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Model registry
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
project
migrate_plus
Commits
df6855e8
Commit
df6855e8
authored
8 years ago
by
Mike Ryan
Browse files
Options
Downloads
Patches
Plain Diff
Issue
#2729581
by mikeryan: Bring XML parser in from migrate_source_xml
parent
c289b008
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/Plugin/migrate_plus/data_parser/Xml.php
+341
-0
341 additions, 0 deletions
src/Plugin/migrate_plus/data_parser/Xml.php
with
341 additions
and
0 deletions
src/Plugin/migrate_plus/data_parser/Xml.php
0 → 100644
+
341
−
0
View file @
df6855e8
<?php
namespace
Drupal\migrate_plus\Plugin\migrate_plus\data_parser
;
use
Drupal\Core\Plugin\ContainerFactoryPluginInterface
;
use
Drupal\migrate\MigrateException
;
use
Drupal\migrate_plus
\DataParserPluginBase
;
/**
* Obtain XML data for migration.
*
* @DataParser(
* id = "xml",
* title = @Translation("XML")
* )
*/
class
Xml
extends
DataParserPluginBase
implements
ContainerFactoryPluginInterface
{
/**
* The XMLReader we are encapsulating.
*
* @var \XMLReader
*/
protected
$reader
;
/**
* Array of the element names from the query.
*
* 0-based from the first (root) element. For example, '//file/article' would
* be stored as [0 => 'file', 1 => 'article'].
*
* @var array
*/
protected
$elementsToMatch
=
[];
/**
* An optional xpath predicate.
*
* Restricts the matching elements based on values in their children. Parsed
* from the element query at construct time.
*
* @var string
*/
protected
$xpathPredicate
=
NULL
;
/**
* Array representing the path to the current element as we traverse the XML.
*
* For example, if in an XML string like '<file><article>...</article></file>'
* we are positioned within the article element, currentPath will be
* [0 => 'file', 1 => 'article'].
*
* @var array
*/
protected
$currentPath
=
[];
/**
* Retains all elements with a given name to support extraction from parents.
*
* This is a hack to support field extraction of values in parents
* of the 'context node' - ie, if $this->fields() has something like '..\nid'.
* Since we are using a streaming xml processor, it is too late to snoop
* around parent elements again once we've located an element of interest. So,
* grab elements with matching names and their depths, and refer back to it
* when building the source row.
*
* @var array
*/
protected
$parentXpathCache
=
[];
/**
* Hash of the element names that should be captured into $parentXpathCache.
*
* @var array
*/
protected
$parentElementsOfInterest
=
[];
/**
* Element name matching mode.
*
* When matching element names, whether to compare to the namespace-prefixed
* name, or the local name.
*
* @var bool
*/
protected
$prefixedName
=
FALSE
;
/**
* {@inheritdoc}
*/
public
function
__construct
(
array
$configuration
,
$plugin_id
,
$plugin_definition
)
{
parent
::
__construct
(
$configuration
,
$plugin_id
,
$plugin_definition
);
$this
->
reader
=
new
\XMLReader
();
// Suppress errors during parsing, so we can pick them up after.
libxml_use_internal_errors
(
TRUE
);
// Parse the element query. First capture group is the element path, second
// (if present) is the attribute.
preg_match_all
(
'|^/([^\[]+)\[?(.*?)]?$|'
,
$configuration
[
'item_selector'
],
$matches
);
$element_path
=
$matches
[
1
][
0
];
$this
->
elementsToMatch
=
explode
(
'/'
,
$element_path
);
$predicate
=
$matches
[
2
][
0
];
if
(
$predicate
)
{
$this
->
xpathPredicate
=
$predicate
;
}
// If the element path contains any colons, it must be specifying
// namespaces, so we need to compare using the prefixed element
// name in next().
if
(
strpos
(
$element_path
,
':'
))
{
$this
->
prefixedName
=
TRUE
;
}
foreach
(
$this
->
fieldSelectors
()
as
$field_name
=>
$xpath
)
{
if
(
substr
(
$xpath
,
0
,
3
)
===
'..\\'
)
{
$this
->
parentElementsOfInterest
[]
=
str_replace
(
'..\\'
,
''
,
$xpath
);
}
}
}
/**
* Builds a \SimpleXmlElement rooted at the iterator's current location.
*
* The resulting SimpleXmlElement also contains any child nodes of the current
* element.
*
* @return \SimpleXmlElement|false
* A \SimpleXmlElement when the document is parseable, or false if a
* parsing error occurred.
*
* @throws MigrateException
*/
protected
function
getSimpleXml
()
{
$node
=
$this
->
reader
->
expand
();
if
(
$node
)
{
// We must associate the DOMNode with a DOMDocument to be able to import
// it into SimpleXML. Despite appearances, this is almost twice as fast as
// simplexml_load_string($this->readOuterXML());
$dom
=
new
\DOMDocument
();
$node
=
$dom
->
importNode
(
$node
,
TRUE
);
$dom
->
appendChild
(
$node
);
$sxml_elem
=
simplexml_import_dom
(
$node
);
$this
->
registerNamespaces
(
$sxml_elem
);
return
$sxml_elem
;
}
else
{
foreach
(
libxml_get_errors
()
as
$error
)
{
$error_string
=
self
::
parseLibXmlError
(
$error
);
throw
new
MigrateException
(
$error_string
);
}
return
FALSE
;
}
}
/**
* {@inheritdoc}
*/
public
function
rewind
()
{
// Reset our path tracker.
$this
->
currentPath
=
[];
parent
::
rewind
();
}
/**
* {@inheritdoc}
*/
protected
function
openSourceUrl
(
$url
)
{
// (Re)open the provided URL.
$this
->
reader
->
close
();
return
$this
->
reader
->
open
(
$url
,
NULL
,
\LIBXML_NOWARNING
);
}
/**
* {@inheritdoc}
*/
protected
function
fetchNextRow
()
{
$target_element
=
NULL
;
// Loop over each node in the XML file, looking for elements at a path
// matching the input query string (represented in $this->elementsToMatch).
while
(
$this
->
reader
->
read
())
{
if
(
$this
->
reader
->
nodeType
==
\XMLReader
::
ELEMENT
)
{
if
(
$this
->
prefixedName
)
{
$this
->
currentPath
[
$this
->
reader
->
depth
]
=
$this
->
reader
->
name
;
if
(
array_key_exists
(
$this
->
reader
->
name
,
$this
->
parentElementsOfInterest
))
{
$this
->
parentXpathCache
[
$this
->
reader
->
depth
][
$this
->
reader
->
name
][]
=
$this
->
getSimpleXml
();
}
}
else
{
$this
->
currentPath
[
$this
->
reader
->
depth
]
=
$this
->
reader
->
localName
;
if
(
array_key_exists
(
$this
->
reader
->
localName
,
$this
->
parentElementsOfInterest
))
{
$this
->
parentXpathCache
[
$this
->
reader
->
depth
][
$this
->
reader
->
name
][]
=
$this
->
getSimpleXml
();
}
}
if
(
$this
->
currentPath
==
$this
->
elementsToMatch
)
{
// We're positioned to the right element path - build the SimpleXML
// object to enable proper xpath predicate evaluation.
$target_element
=
$this
->
getSimpleXml
();
if
(
$target_element
!==
FALSE
)
{
if
(
empty
(
$this
->
xpathPredicate
)
||
$this
->
predicateMatches
(
$target_element
))
{
break
;
}
}
}
}
elseif
(
$this
->
reader
->
nodeType
==
\XMLReader
::
END_ELEMENT
)
{
// Remove this element and any deeper ones from the current path.
foreach
(
$this
->
currentPath
as
$depth
=>
$name
)
{
if
(
$depth
>=
$this
->
reader
->
depth
)
{
unset
(
$this
->
currentPath
[
$depth
]);
}
}
foreach
(
$this
->
parentXpathCache
as
$depth
=>
$elements
)
{
if
(
$depth
>
$this
->
reader
->
depth
)
{
unset
(
$this
->
parentXpathCache
[
$depth
]);
}
}
}
}
// If we've found the desired element, populate the currentItem and
// currentId with its data.
if
(
$target_element
)
{
foreach
(
$this
->
fieldSelectors
()
as
$field_name
=>
$xpath
)
{
foreach
(
$target_element
->
xpath
(
$xpath
)
as
$value
)
{
$this
->
currentItem
[
$field_name
]
=
(
string
)
$value
;
}
}
}
}
/**
* Tests whether the iterator's xpath predicate matches the provided element.
*
* Has some limitations esp. in that it is easy to write predicates that
* reference things outside this SimpleXmlElement's tree, but "simpler"
* predicates should work as expected.
*
* @param \SimpleXMLElement $elem
* The element to test.
*
* @return bool
* True if the element matches the predicate, false if not.
*/
protected
function
predicateMatches
(
\SimpleXMLElement
$elem
)
{
return
!
empty
(
$elem
->
xpath
(
'/*['
.
$this
->
xpathPredicate
.
']'
));
}
/**
* Gets an ancestor SimpleXMLElement, if the element name was registered.
*
* Gets the SimpleXMLElement some number of levels above the iterator
* having the given name, but only for element names that this
* Xml data parser was told to retain for future reference through the
* constructor's $parent_elements_of_interest.
*
* @param int $levels_up
* The number of levels back towards the root of the DOM tree to ascend
* before searching for the named element.
* @param string $name
* The name of the desired element.
*
* @return \SimpleXMLElement|false
* The element matching the level and name requirements, or false if it is
* not present or was not retained.
*/
public
function
getAncestorElements
(
$levels_up
,
$name
)
{
if
(
$levels_up
>
0
)
{
$levels_up
*=
-
1
;
}
$ancestor_depth
=
$this
->
reader
->
depth
+
$levels_up
+
1
;
if
(
$ancestor_depth
<
0
)
{
return
FALSE
;
}
if
(
array_key_exists
(
$ancestor_depth
,
$this
->
parentXpathCache
)
&&
array_key_exists
(
$name
,
$this
->
parentXpathCache
[
$ancestor_depth
]))
{
return
$this
->
parentXpathCache
[
$ancestor_depth
][
$name
];
}
else
{
return
FALSE
;
}
}
/**
* Registers the iterator's namespaces to a SimpleXMLElement.
*
* @param \SimpleXMLElement $xml
* The element to apply namespace registrations to.
*/
protected
function
registerNamespaces
(
\SimpleXMLElement
$xml
)
{
if
(
is_array
(
$this
->
configuration
[
'namespaces'
]))
{
foreach
(
$this
->
configuration
[
'namespaces'
]
as
$prefix
=>
$ns
)
{
$xml
->
registerXPathNamespace
(
$prefix
,
$ns
);
}
}
}
/**
* Parses a LibXMLError to a error message string.
*
* @param \LibXMLError $error
* Error thrown by the XML.
*
* @return string
* Error message
*/
public
static
function
parseLibXmlError
(
\LibXMLError
$error
)
{
$error_code_name
=
'Unknown Error'
;
switch
(
$error
->
level
)
{
case
LIBXML_ERR_WARNING
:
$error_code_name
=
t
(
'Warning'
);
break
;
case
LIBXML_ERR_ERROR
:
$error_code_name
=
t
(
'Error'
);
break
;
case
LIBXML_ERR_FATAL
:
$error_code_name
=
t
(
'Fatal Error'
);
break
;
}
return
t
(
"@libxmlerrorcodename @libxmlerrorcode: @libxmlerrormessage
\n
"
.
"Line: @libxmlerrorline
\n
"
.
"Column: @libxmlerrorcolumn
\n
"
.
"File: @libxmlerrorfile"
,
[
'@libxmlerrorcodename'
=>
$error_code_name
,
'@libxmlerrorcode'
=>
$error
->
code
,
'@libxmlerrormessage'
=>
trim
(
$error
->
message
),
'@libxmlerrorline'
=>
$error
->
line
,
'@libxmlerrorcolumn'
=>
$error
->
column
,
'@libxmlerrorfile'
=>
((
$error
->
file
))
?
$error
->
file
:
NULL
,
]
);
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment