From ce552ed8b5a31c6e2c5d070107c5dc4401f56dba Mon Sep 17 00:00:00 2001
From: Matroskeen <matroskeen@3426249.no-reply.drupal.org>
Date: Fri, 15 Apr 2022 15:02:58 +0000
Subject: [PATCH] Issue #3096393 by marvil07, Matroskeen: Support html5 parsing
 on dom plugin

---
 src/Plugin/migrate/process/Dom.php | 22 +++++++++++++++++++++-
 tests/src/Unit/process/DomTest.php | 23 +++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/Plugin/migrate/process/Dom.php b/src/Plugin/migrate/process/Dom.php
index ca538a61..aa3051d8 100644
--- a/src/Plugin/migrate/process/Dom.php
+++ b/src/Plugin/migrate/process/Dom.php
@@ -10,6 +10,7 @@ use Drupal\migrate\MigrateExecutableInterface;
 use Drupal\migrate\Plugin\MigrationInterface;
 use Drupal\migrate\ProcessPluginBase;
 use Drupal\migrate\Row;
+use Masterminds\HTML5;
 
 /**
  * Handles string to DOM and back conversions.
@@ -30,6 +31,9 @@ use Drupal\migrate\Row;
  *   declaration. Defaults to '1.0'.
  * - encoding: (optional) The encoding of the document as part of the XML
  *   declaration. Defaults to 'UTF-8'.
+ * - import_method: (optional) Which parser to use. Possible values:
+ *   - 'html': (default) use PHP's DOM extension (\DOMDocument::loadHTML()).
+ *   - 'html5': use the Masterminds HTML5 parser.
  *
  * @codingStandardsIgnoreStart
  *
@@ -95,6 +99,10 @@ class Dom extends ProcessPluginBase {
     if (!in_array($configuration['method'], ['import', 'export'])) {
       throw new \InvalidArgumentException('The "method" must be "import" or "export".');
     }
+    $configuration['import_method'] = $configuration['import_method'] ?? 'html';
+    if (!in_array($configuration['import_method'], ['html', 'html5'])) {
+      throw new \InvalidArgumentException('The "import_method" must be "html" or "html5".');
+    }
     parent::__construct($configuration, $plugin_id, $plugin_definition);
     $this->configuration += $this->defaultValues();
     $this->logMessages = (bool) $this->configuration['log_messages'];
@@ -159,7 +167,19 @@ class Dom extends ProcessPluginBase {
     }
 
     $document = new \DOMDocument($this->configuration['version'], $this->configuration['encoding']);
-    $document->loadHTML($html);
+    switch ($this->configuration['import_method']) {
+      case 'html5':
+        $html5 = new HTML5([
+          'target_document' => $document,
+          'disable_html_ns' => TRUE,
+        ]);
+        $html5->loadHTML($html);
+        break;
+
+      case 'html':
+      default:
+        $document->loadHTML($html);
+    }
 
     if ($this->logMessages) {
       restore_error_handler();
diff --git a/tests/src/Unit/process/DomTest.php b/tests/src/Unit/process/DomTest.php
index d87be093..6fb9f383 100644
--- a/tests/src/Unit/process/DomTest.php
+++ b/tests/src/Unit/process/DomTest.php
@@ -42,6 +42,17 @@ final class DomTest extends MigrateProcessTestCase {
       ->transform($value, $this->migrateExecutable, $this->row, 'destinationproperty');
   }
 
+  /**
+   * Tests that an unsupported "import_method" is rejected.
+   *
+   * @covers ::__construct
+   */
+  public function testInvalidImportMethod(): void {
+    $this->expectException(\InvalidArgumentException::class);
+    $this->expectExceptionMessage('The "import_method" must be "html" or "html5".');
+    new Dom(['method' => 'import', 'import_method' => 'invalid'], 'dom', []);
+  }
+
   /**
    * @covers ::import
    */
@@ -54,6 +65,18 @@ final class DomTest extends MigrateProcessTestCase {
     $this->assertTrue($document instanceof \DOMDocument);
   }
 
+  /**
+   * Tests importing markup with the "html5" import method.
+   *
+   * @covers ::import
+   */
+  public function testImportMethodHtml5(): void {
+    $configuration = ['method' => 'import', 'import_method' => 'html5'];
+    $document = (new Dom($configuration, 'dom', []))
+      ->transform('<p>A simple paragraph.</p>', $this->migrateExecutable, $this->row, 'destinationproperty');
+    $this->assertInstanceOf(\DOMDocument::class, $document);
+  }
+
   /**
    * @covers ::import
    */
-- 
GitLab