From 3ae37397adf8a74bcee19dd03935ea536ff2b9ac Mon Sep 17 00:00:00 2001
From: Lee Rowlands <lee.rowlands@previousnext.com.au>
Date: Tue, 2 Jan 2024 08:31:58 +1000
Subject: [PATCH] Issue #3410303 by longwave, Luke.Leber, Wim Leers, quietone,
 dslatkin: FilterHtml data loss when iframe and/or textarea is allowed

---
 .../filter/src/Plugin/Filter/FilterHtml.php    | 18 +++++++++++++++++-
 .../tests/src/Kernel/FilterKernelTest.php      | 11 +++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/core/modules/filter/src/Plugin/Filter/FilterHtml.php b/core/modules/filter/src/Plugin/Filter/FilterHtml.php
index 345ae193dea9..0718b73092e0 100644
--- a/core/modules/filter/src/Plugin/Filter/FilterHtml.php
+++ b/core/modules/filter/src/Plugin/Filter/FilterHtml.php
@@ -7,6 +7,9 @@
 use Drupal\Component\Utility\Html;
 use Drupal\filter\FilterProcessResult;
 use Drupal\filter\Plugin\FilterBase;
+use Masterminds\HTML5\Parser\DOMTreeBuilder;
+use Masterminds\HTML5\Parser\Scanner;
+use Masterminds\HTML5\Parser\Tokenizer;
 
 /**
  * Provides a filter to limit allowed HTML tags.
@@ -258,7 +261,20 @@ public function getHTMLRestrictions() {
     $star_protector = '__zqh6vxfbk3cg__';
     $html = str_replace('*', $star_protector, $html);
 
-    $dom = Html::load($html);
+    // Use the HTML5 parser with a custom tokenizer to correctly parse tags that
+    // normally use text mode, such as iframe.
+    $events = new DOMTreeBuilder(FALSE, ['disable_html_ns' => TRUE]);
+    $scanner = new Scanner('<body>' . $html);
+    $parser = new class($scanner, $events) extends Tokenizer {
+
+      public function setTextMode($textMode, $untilTag = NULL) {
+        // Intentionally a no-op so the tokenizer never enters text mode.
+      }
+
+    };
+    $parser->parse();
+
+    $dom = $events->document();
     $xpath = new \DOMXPath($dom);
     foreach ($xpath->query('//body//*') as $node) {
       $tag = $node->tagName;
diff --git a/core/modules/filter/tests/src/Kernel/FilterKernelTest.php b/core/modules/filter/tests/src/Kernel/FilterKernelTest.php
index 660ba730da9f..357437c33446 100644
--- a/core/modules/filter/tests/src/Kernel/FilterKernelTest.php
+++ b/core/modules/filter/tests/src/Kernel/FilterKernelTest.php
@@ -579,6 +579,17 @@ public function testHtmlFilter() {
     $this->assertNormalized($f, '<a>link</a>', 'HTML filter removes allowed attributes that have a not explicitly allowed value.');
     $f = (string) $filter->process('<a href="/beautiful-animals" kitten="cute" llama="epic majestical">link</a>', Language::LANGCODE_NOT_SPECIFIED);
     $this->assertSame('<a href="/beautiful-animals" llama="epic majestical">link</a>', $f, 'HTML filter keeps explicitly allowed attributes with an attribute value that is also explicitly allowed.');
+
+    // Allow iframes and check that the subsequent tags are parsed correctly.
+    $filter->setConfiguration([
+      'settings' => [
+        'allowed_html' => '<iframe> <a href llama>',
+        'filter_html_help' => 1,
+        'filter_html_nofollow' => 0,
+      ],
+    ]);
+    $f = (string) $filter->process('<a kitten="cute" llama="awesome">link</a>', Language::LANGCODE_NOT_SPECIFIED);
+    $this->assertNormalized($f, '<a llama="awesome">link</a>');
   }
 
   /**
-- 
GitLab