Issue #3347610 by drunken monkey, itaran: Fixed error in HTML filter with very long attributes. (542e798b) · Commits · project / search_api

CHANGELOG.txt

+2 −0

Original line number	Diff line number	Diff line
		Search API 1.x, dev (xxxx-xx-xx):
		---------------------------------
		- #3347610 by drunken monkey, itaran: Fixed error in HTML filter with very long
		attributes.
		- #3317569 by drunken monkey: Added tests for indexing multi-valued reverse
		entity references.
		- #3363257 by drunken monkey: Fixed failing tests against Drupal 10.1 HEAD.

src/Plugin/search_api/processor/HtmlFilter.php

+92 −7

Original line number	Diff line number	Diff line
		@@ -195,16 +195,11 @@ class HtmlFilter extends FieldsProcessorPluginBase {
		protected function processFieldValue(&$value, $type) {
		// Remove invisible content.
		$text = preg_replace('@<(applet\|audio\|canvas\|command\|embed\|iframe\|map\|menu\|noembed\|noframes\|noscript\|script\|style\|svg\|video)[^>]>.</\1>@siU', ' ', $value);
		// Let removed tags still delimit words.
		$is_text_type = $this->getDataTypeHelper()->isTextType($type);
		if ($is_text_type) {
		// Let removed tags still delimit words.
		$text = str_replace(['<', '>'], [' <', '> '], $text);
		if ($this->configuration['title']) {
		$text = preg_replace('/(<[-a-z_]+[^>]["\s])title\s=\s("([^"]+)"\|\'([^\']+)\')([^>]>)/i', '$1 $5 $3$4 ', $text);
		}
		if ($this->configuration['alt']) {
		$text = preg_replace('/<[-a-z_]+[^>]["\s]alt\s=\s("([^"]+)"\|\'([^\']+)\')[^>]>/i', ' <img>$2$3</img> ', $text);
		}
		$text = $this->handleAttributes($text);
		}
		if ($this->configuration['tags'] && $is_text_type) {
		$text = strip_tags($text, '<' . implode('><', array_keys($this->configuration['tags'])) . '>');
		@@ -216,6 +211,96 @@ class HtmlFilter extends FieldsProcessorPluginBase {
		}
		}

		/**
		* Copies configured attributes out of HTML tags so they are indexed.
		*
		* @param string $text
		* The text to process, with spaces added around all HTML tags.
		*
		* @return string
		* The same text, with the contents of attributes "alt" and/or "title" (as
		* configured) copied into their element contents so they can be indexed.
		*/
		protected function handleAttributes(string $text): string {
		// Determine which attributes should be indexed and bail early if it's none.
		$handled_attributes = [];
		foreach (['alt', 'title'] as $attr) {
		if ($this->configuration[$attr]) {
		$handled_attributes[] = $attr;
		}
		}
		if (!$handled_attributes) {
		return $text;
		}

		$processed_text = '';
		$pos = 0;
		$text_len = mb_strlen($text);
		// Go through the whole text, looking for HTML tags.
		while ($pos < $text_len) {
		// Find start of HTML tag.
		// Since there is always a space in front of a "<" character, we do not
		// need to write "$start_pos === FALSE" explicitly to check for a match.
		$start_pos = mb_strpos($text, '<', $pos);
		// Add everything from the last position to this start tag (or the end of
		// the string, if we found none) to the processed text.
		$processed_text .= mb_substr($text, $pos, $start_pos ? $start_pos - $pos : NULL);
		if (!$start_pos) {
		break;
		}

		// Find end of HTML tag.
		// As above for $start_pos, $end_pos cannot be 0 since it must be greater
		// than $start_pos. So, no need to check for FALSE strictly.
		$end_pos = mb_strpos($text, '>', $start_pos + 1);
		// Extract the contents of the tag, and add it to the processed text.
		$tag_contents = mb_substr($text, $start_pos, $end_pos ? $end_pos + 1 - $start_pos : NULL);
		$processed_text .= $tag_contents;
		if (!$end_pos) {
		break;
		}
		// Next, we want to begin searching right after the end of this HTML tag.
		$pos = $end_pos + 1;

		// Split the tag contents, without the angle brackets, into the element
		// name and the rest.
		$tag_contents = trim($tag_contents, '<> ');
		[$element_name, $tag_contents] = explode(' ', $tag_contents, 2) + [1 => NULL];
		// If there is just the element name, no need to look for attributes.
		if (!$tag_contents) {
		continue;
		}

		// This will match all the attributes we're looking for.
		$attr_regex = '(?:' . implode('\|', $handled_attributes) . ')';
		$pattern = "/(?:^\|\s)$attr_regex\s+=\s+(['\"])/Su";
		$flags = PREG_OFFSET_CAPTURE \| PREG_SET_ORDER;
		if (preg_match_all($pattern, $tag_contents, $matches, $flags)) {
		foreach ($matches as $match) {
		// Now just extract the attribute value as everything between the
		// matched quote character and the next such character.
		// Unfortunately, preg_match_all() reports positions in bytes, not
		// characters, so we need to use a bit of magic to reconcile this with
		// our usual handling of Unicode.
		$quote_char = $match[1][0];
		/** @var int $quote_pos */
		$quote_pos = $match[1][1];
		$tag_contents_from_quote = substr($tag_contents, $quote_pos + 1);
		$length = mb_strpos($tag_contents_from_quote, $quote_char);
		$attr_value = mb_substr($tag_contents_from_quote, 0, $length);
		// Take care of self-closing tags, so users are still able to set a
		// boost for, for instance, the "alt" attribute from an "img" tag.
		if ($tag_contents[-1] === '/') {
		$attr_value = " <$element_name> $attr_value </$element_name>";
		}
		$processed_text .= ' ' . $attr_value;
		}
		}
		}

		return $processed_text;
		}

		/**
		* {@inheritdoc}
		*/

tests/src/Unit/Processor/HtmlFilterTest.php

+10 −1

Original line number	Diff line number	Diff line
		@@ -141,6 +141,15 @@ class HtmlFilterTest extends UnitTestCase {
		],
		TRUE,
		],
		// Test handling of very long tags.
		[
		'<img alt="ALT" src="image/png;base64,' . str_repeat('1', 1000000) . '" /> word </a>',
		[
		Utility::createTextToken('ALT', 2),
		Utility::createTextToken('word'),
		],
		TRUE,
		],
		// Test fault tolerance.
		[
		'a < b',
		@@ -187,7 +196,7 @@ class HtmlFilterTest extends UnitTestCase {
		'<h2>Foo Bar <em>Baz</em></h2>

		<p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p>
		<img src="/foo.png" alt="Some picture" />
		<img src="image/png;base64,' . str_repeat('1', 1000000) . '" alt="Some picture" />
		<span>This is hidden</span>',
		[
		Utility::createTextToken('Foo Bar', 3.0),