Issue #3388678 by drunken monkey, benjifisher: Fixed performance regression in... (e970f06f) · Commits · project / search_api

CHANGELOG.txt

+2 −0

Original line number	Diff line number	Diff line
		Search API 1.x, dev (xxxx-xx-xx):
		---------------------------------
		- #3388678 by drunken monkey, benjifisher: Fixed performance regression in HTML
		Filter processor.
		- #3398192 by drunken monkey, BenStallings, admirlju: Fixed warning for
		undefined array key "type" in SearchApiDate::opBetween().
		- #3394189 by drunken monkey: Dropped support for Drupal 9.

src/Plugin/search_api/processor/HtmlFilter.php

+11 −63

Original line number	Diff line number	Diff line
		@@ -223,82 +223,30 @@ class HtmlFilter extends FieldsProcessorPluginBase {
		*/
		protected function handleAttributes(string $text): string {
		// Determine which attributes should be indexed and bail early if it's none.
		$handled_attributes = [];
		$handled_attributes = $xpath_expr = [];
		foreach (['alt', 'title'] as $attr) {
		if ($this->configuration[$attr]) {
		$handled_attributes[] = $attr;
		$xpath_expr[] = "//*[@$attr]";
		}
		}
		if (!$handled_attributes) {
		return $text;
		}

		$processed_text = '';
		$pos = 0;
		$text_len = mb_strlen($text);
		// Go through the whole text, looking for HTML tags.
		while ($pos < $text_len) {
		// Find start of HTML tag.
		// Since there is always a space in front of a "<" character, we do not
		// need to write "$start_pos === FALSE" explicitly to check for a match.
		$start_pos = mb_strpos($text, '<', $pos);
		// Add everything from the last position to this start tag (or the end of
		// the string, if we found none) to the processed text.
		$processed_text .= mb_substr($text, $pos, $start_pos ? $start_pos - $pos : NULL);
		if (!$start_pos) {
		break;
		}

		// Find end of HTML tag.
		// As above for $start_pos, $end_pos cannot be 0 since it must be greater
		// than $start_pos. So, no need to check for FALSE strictly.
		$end_pos = mb_strpos($text, '>', $start_pos + 1);
		// Extract the contents of the tag, and add it to the processed text.
		$tag_contents = mb_substr($text, $start_pos, $end_pos ? $end_pos + 1 - $start_pos : NULL);
		$processed_text .= $tag_contents;
		if (!$end_pos) {
		break;
		}
		// Next, we want to begin searching right after the end of this HTML tag.
		$pos = $end_pos + 1;

		// Split the tag contents, without the angle brackets, into the element
		// name and the rest.
		$tag_contents = trim($tag_contents, '<> ');
		[$element_name, $tag_contents] = explode(' ', $tag_contents, 2) + [1 => NULL];
		// If there is just the element name, no need to look for attributes.
		if (!$tag_contents) {
		continue;
		}

		// This will match all the attributes we're looking for.
		$attr_regex = '(?:' . implode('\|', $handled_attributes) . ')';
		$pattern = "/(?:^\|\s)$attr_regex\s+=\s+(['\"])/Su";
		$flags = PREG_OFFSET_CAPTURE \| PREG_SET_ORDER;
		if (preg_match_all($pattern, $tag_contents, $matches, $flags)) {
		foreach ($matches as $match) {
		// Now just extract the attribute value as everything between the
		// matched quote character and the next such character.
		// Unfortunately, preg_match_all() reports positions in bytes, not
		// characters, so we need to use a bit of magic to reconcile this with
		// our usual handling of Unicode.
		$quote_char = $match[1][0];
		/** @var int $quote_pos */
		$quote_pos = $match[1][1];
		$tag_contents_from_quote = substr($tag_contents, $quote_pos + 1);
		$length = mb_strpos($tag_contents_from_quote, $quote_char);
		$attr_value = mb_substr($tag_contents_from_quote, 0, $length);
		// Take care of self-closing tags, so users are still able to set a
		// boost for, for instance, the "alt" attribute from an "img" tag.
		if ($tag_contents[-1] === '/') {
		$attr_value = " <$element_name> $attr_value </$element_name>";
		}
		$processed_text .= ' ' . $attr_value;
		$dom = Html::load($text);
		$xpath = new \DOMXPath($dom);
		/** @var \DOMElement $node */
		foreach ($xpath->query(implode('\|', $xpath_expr)) as $node) {
		foreach ($handled_attributes as $attr_name) {
		$attr = $node->attributes?->getNamedItem($attr_name);
		if ($attr !== NULL) {
		$node->prepend(" {$attr->textContent} ");
		}
		}
		}

		return $processed_text;
		return Html::serialize($dom);
		}

		/**

tests/src/Unit/Processor/HtmlFilterTest.php

+71 −39

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@

		namespace Drupal\Tests\search_api\Unit\Processor;

		use Drupal\Component\Utility\Random;
		use Drupal\search_api\IndexInterface;
		use Drupal\search_api\Item\Field;
		use Drupal\search_api\Plugin\search_api\data_type\value\TextToken;
		@@ -53,8 +54,7 @@ class HtmlFilterTest extends UnitTestCase {
		'alt' => FALSE,
		];
		$this->processor->setConfiguration($configuration);
		$type = 'text';
		$this->invokeMethod('processFieldValue', [&$passed_value, $type]);
		$this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
		$this->assertEquals($expected_value, $passed_value);
		}

		@@ -95,8 +95,7 @@ class HtmlFilterTest extends UnitTestCase {
		'alt' => $alt_config,
		];
		$this->processor->setConfiguration($configuration);
		$type = 'text';
		$this->invokeMethod('processFieldValue', [&$passed_value, $type]);
		$this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
		$this->assertEquals($expected_value, $passed_value);
		}

		@@ -150,14 +149,6 @@ class HtmlFilterTest extends UnitTestCase {
		],
		TRUE,
		],
		// Test fault tolerance.
		[
		'a < b',
		[
		Utility::createTextToken('a < b'),
		],
		TRUE,
		],
		];
		}

		@@ -180,8 +171,7 @@ class HtmlFilterTest extends UnitTestCase {
		'alt' => TRUE,
		];
		$this->processor->setConfiguration($configuration);
		$type = 'text';
		$this->invokeMethod('processFieldValue', [&$passed_value, $type]);
		$this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
		$this->assertEquals($expected_value, $passed_value);
		}

		@@ -192,7 +182,32 @@ class HtmlFilterTest extends UnitTestCase {
		* An array of argument arrays for testTagConfiguration().
		*/
		public function tagConfigurationDataProvider() {
		$complex_test = [
		$tags_config = ['h2' => '2'];
		return [
		['h2word', 'h2word', []],
		['h2word', [Utility::createTextToken('h2word')], $tags_config],
		[
		'foo bar <h2> h2word </h2>',
		[
		Utility::createTextToken('foo bar'),
		Utility::createTextToken('h2word', 2.0),
		],
		$tags_config,
		],
		[
		'foo bar <h2>h2word</h2>',
		[
		Utility::createTextToken('foo bar'),
		Utility::createTextToken('h2word', 2.0),
		],
		$tags_config,
		],
		[
		'<div>word</div>',
		[Utility::createTextToken('word', 2)],
		['div' => 2],
		],
		[
		'<h2>Foo Bar <em>Baz</em></h2>

		<p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p>
		@@ -213,33 +228,18 @@ class HtmlFilterTest extends UnitTestCase {
		'img' => 0.5,
		'span' => 0,
		],
		];
		$tags_config = ['h2' => '2'];
		return [
		['h2word', 'h2word', []],
		['h2word', [Utility::createTextToken('h2word')], $tags_config],
		[
		'foo bar <h2> h2word </h2>',
		[
		Utility::createTextToken('foo bar'),
		Utility::createTextToken('h2word', 2.0),
		],
		$tags_config,
		],
		[
		'foo bar <h2>h2word</h2>',
		'foo <img src="img.png" alt="image" title = "check this out" /> bar',
		[
		Utility::createTextToken('foo bar'),
		Utility::createTextToken('h2word', 2.0),
		],
		$tags_config,
		Utility::createTextToken('foo', 1.0),
		Utility::createTextToken('check this out image', 0.5),
		Utility::createTextToken('bar', 1.0),
		],
		[
		'<div>word</div>',
		[Utility::createTextToken('word', 2)],
		['div' => 2],
		'img' => 0.5,
		],
		],
		$complex_test,
		];
		}

		@@ -263,8 +263,7 @@ class HtmlFilterTest extends UnitTestCase {
		<span>This is hidden</span>';
		$expected_value = preg_replace('/\s+/', ' ', strip_tags($passed_value));

		$type = 'string';
		$this->invokeMethod('processFieldValue', [&$passed_value, $type]);
		$this->invokeMethod('processFieldValue', [&$passed_value, 'string']);
		$this->assertEquals($expected_value, $passed_value);
		}

		@@ -352,4 +351,37 @@ class HtmlFilterTest extends UnitTestCase {
		$this->assertEquals([], $field->getValues());
		}

		/**
		* Tests that attribute handling is still fast even for large text values.
		*
		* @see https://www.drupal.org/project/search_api/issues/3388678
		*/
		public function testLargeTextAttributesHandling(): void {
		$this->processor->setConfiguration([
		'tags' => [
		'em' => 1.5,
		'strong' => 2.0,
		'h2' => 3.0,
		'img' => 0.5,
		'span' => 0,
		],
		'title' => TRUE,
		'alt' => TRUE,
		]);
		$text = '';
		$random = new Random();
		for ($i = 0; $i < 2000; ++$i) {
		$text .= ' ' . htmlspecialchars($random->sentences(10));
		$tag = $random->name();
		$attr = $random->name();
		$value = htmlspecialchars($random->word(12));
		$contents = htmlspecialchars($random->sentences(10));
		$text .= " <$tag $attr=\"$value\">$contents</$tag>";
		}
		$start = microtime(TRUE);
		$this->invokeMethod('processFieldValue', [&$text, 'text']);
		$took = microtime(TRUE) - $start;
		$this->assertLessThan(1.0, $took, 'Processing large field value took too long.');
		}

		}