Loading CHANGELOG.txt +2 −0 Original line number Diff line number Diff line Search API 1.x, dev (xxxx-xx-xx): --------------------------------- - #3388678 by drunken monkey, benjifisher: Fixed performance regression in HTML Filter processor. - #3398192 by drunken monkey, BenStallings, admirlju: Fixed warning for undefined array key "type" in SearchApiDate::opBetween(). - #3394189 by drunken monkey: Dropped support for Drupal 9. Loading src/Plugin/search_api/processor/HtmlFilter.php +11 −63 Original line number Diff line number Diff line Loading @@ -223,82 +223,30 @@ class HtmlFilter extends FieldsProcessorPluginBase { */ protected function handleAttributes(string $text): string { // Determine which attributes should be indexed and bail early if it's none. $handled_attributes = []; $handled_attributes = $xpath_expr = []; foreach (['alt', 'title'] as $attr) { if ($this->configuration[$attr]) { $handled_attributes[] = $attr; $xpath_expr[] = "//*[@$attr]"; } } if (!$handled_attributes) { return $text; } $processed_text = ''; $pos = 0; $text_len = mb_strlen($text); // Go through the whole text, looking for HTML tags. while ($pos < $text_len) { // Find start of HTML tag. // Since there is always a space in front of a "<" character, we do not // need to write "$start_pos === FALSE" explicitly to check for a match. $start_pos = mb_strpos($text, '<', $pos); // Add everything from the last position to this start tag (or the end of // the string, if we found none) to the processed text. $processed_text .= mb_substr($text, $pos, $start_pos ? $start_pos - $pos : NULL); if (!$start_pos) { break; } // Find end of HTML tag. // As above for $start_pos, $end_pos cannot be 0 since it must be greater // than $start_pos. So, no need to check for FALSE strictly. $end_pos = mb_strpos($text, '>', $start_pos + 1); // Extract the contents of the tag, and add it to the processed text. $tag_contents = mb_substr($text, $start_pos, $end_pos ? $end_pos + 1 - $start_pos : NULL); $processed_text .= $tag_contents; if (!$end_pos) { break; } // Next, we want to begin searching right after the end of this HTML tag. $pos = $end_pos + 1; // Split the tag contents, without the angle brackets, into the element // name and the rest. $tag_contents = trim($tag_contents, '<> '); [$element_name, $tag_contents] = explode(' ', $tag_contents, 2) + [1 => NULL]; // If there is just the element name, no need to look for attributes. if (!$tag_contents) { continue; } // This will match all the attributes we're looking for. $attr_regex = '(?:' . implode('|', $handled_attributes) . ')'; $pattern = "/(?:^|\s)$attr_regex\s*+=\s*+(['\"])/Su"; $flags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER; if (preg_match_all($pattern, $tag_contents, $matches, $flags)) { foreach ($matches as $match) { // Now just extract the attribute value as everything between the // matched quote character and the next such character. // Unfortunately, preg_match_all() reports positions in bytes, not // characters, so we need to use a bit of magic to reconcile this with // our usual handling of Unicode. $quote_char = $match[1][0]; /** @var int $quote_pos */ $quote_pos = $match[1][1]; $tag_contents_from_quote = substr($tag_contents, $quote_pos + 1); $length = mb_strpos($tag_contents_from_quote, $quote_char); $attr_value = mb_substr($tag_contents_from_quote, 0, $length); // Take care of self-closing tags, so users are still able to set a // boost for, for instance, the "alt" attribute from an "img" tag. if ($tag_contents[-1] === '/') { $attr_value = " <$element_name> $attr_value </$element_name>"; } $processed_text .= ' ' . $attr_value; $dom = Html::load($text); $xpath = new \DOMXPath($dom); /** @var \DOMElement $node */ foreach ($xpath->query(implode('|', $xpath_expr)) as $node) { foreach ($handled_attributes as $attr_name) { $attr = $node->attributes?->getNamedItem($attr_name); if ($attr !== NULL) { $node->prepend(" {$attr->textContent} "); } } } return $processed_text; return Html::serialize($dom); } /** Loading tests/src/Unit/Processor/HtmlFilterTest.php +71 −39 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ namespace Drupal\Tests\search_api\Unit\Processor; use Drupal\Component\Utility\Random; use Drupal\search_api\IndexInterface; use Drupal\search_api\Item\Field; use Drupal\search_api\Plugin\search_api\data_type\value\TextToken; Loading Loading @@ -53,8 +54,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => FALSE, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -95,8 +95,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => $alt_config, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -150,14 +149,6 @@ class HtmlFilterTest extends UnitTestCase { ], TRUE, ], // Test fault tolerance. [ 'a < b', [ Utility::createTextToken('a < b'), ], TRUE, ], ]; } Loading @@ -180,8 +171,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => TRUE, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading @@ -192,7 +182,32 @@ class HtmlFilterTest extends UnitTestCase { * An array of argument arrays for testTagConfiguration(). */ public function tagConfigurationDataProvider() { $complex_test = [ $tags_config = ['h2' => '2']; return [ ['h2word', 'h2word', []], ['h2word', [Utility::createTextToken('h2word')], $tags_config], [ 'foo bar <h2> h2word </h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ 'foo bar <h2>h2word</h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ '<div>word</div>', [Utility::createTextToken('word', 2)], ['div' => 2], ], [ '<h2>Foo Bar <em>Baz</em></h2> <p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p> Loading @@ -213,33 +228,18 @@ class HtmlFilterTest extends UnitTestCase { 'img' => 0.5, 'span' => 0, ], ]; $tags_config = ['h2' => '2']; return [ ['h2word', 'h2word', []], ['h2word', [Utility::createTextToken('h2word')], $tags_config], [ 'foo bar <h2> h2word </h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ 'foo bar <h2>h2word</h2>', 'foo <img src="img.png" alt="image" title = "check this out" /> bar', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, Utility::createTextToken('foo', 1.0), Utility::createTextToken('check this out image', 0.5), Utility::createTextToken('bar', 1.0), ], [ '<div>word</div>', [Utility::createTextToken('word', 2)], ['div' => 2], 'img' => 0.5, ], ], $complex_test, ]; } Loading @@ -263,8 +263,7 @@ class HtmlFilterTest extends UnitTestCase { <span>This is hidden</span>'; $expected_value = preg_replace('/\s+/', ' ', strip_tags($passed_value)); $type = 'string'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'string']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -352,4 +351,37 @@ class HtmlFilterTest extends UnitTestCase { $this->assertEquals([], $field->getValues()); } /** * Tests that attribute handling is still fast even for large text values. * * @see https://www.drupal.org/project/search_api/issues/3388678 */ public function testLargeTextAttributesHandling(): void { $this->processor->setConfiguration([ 'tags' => [ 'em' => 1.5, 'strong' => 2.0, 'h2' => 3.0, 'img' => 0.5, 'span' => 0, ], 'title' => TRUE, 'alt' => TRUE, ]); $text = ''; $random = new Random(); for ($i = 0; $i < 2000; ++$i) { $text .= ' ' . htmlspecialchars($random->sentences(10)); $tag = $random->name(); $attr = $random->name(); $value = htmlspecialchars($random->word(12)); $contents = htmlspecialchars($random->sentences(10)); $text .= " <$tag $attr=\"$value\">$contents</$tag>"; } $start = microtime(TRUE); $this->invokeMethod('processFieldValue', [&$text, 'text']); $took = microtime(TRUE) - $start; $this->assertLessThan(1.0, $took, 'Processing large field value took too long.'); } } Loading
CHANGELOG.txt +2 −0 Original line number Diff line number Diff line Search API 1.x, dev (xxxx-xx-xx): --------------------------------- - #3388678 by drunken monkey, benjifisher: Fixed performance regression in HTML Filter processor. - #3398192 by drunken monkey, BenStallings, admirlju: Fixed warning for undefined array key "type" in SearchApiDate::opBetween(). - #3394189 by drunken monkey: Dropped support for Drupal 9. Loading
src/Plugin/search_api/processor/HtmlFilter.php +11 −63 Original line number Diff line number Diff line Loading @@ -223,82 +223,30 @@ class HtmlFilter extends FieldsProcessorPluginBase { */ protected function handleAttributes(string $text): string { // Determine which attributes should be indexed and bail early if it's none. $handled_attributes = []; $handled_attributes = $xpath_expr = []; foreach (['alt', 'title'] as $attr) { if ($this->configuration[$attr]) { $handled_attributes[] = $attr; $xpath_expr[] = "//*[@$attr]"; } } if (!$handled_attributes) { return $text; } $processed_text = ''; $pos = 0; $text_len = mb_strlen($text); // Go through the whole text, looking for HTML tags. while ($pos < $text_len) { // Find start of HTML tag. // Since there is always a space in front of a "<" character, we do not // need to write "$start_pos === FALSE" explicitly to check for a match. $start_pos = mb_strpos($text, '<', $pos); // Add everything from the last position to this start tag (or the end of // the string, if we found none) to the processed text. $processed_text .= mb_substr($text, $pos, $start_pos ? $start_pos - $pos : NULL); if (!$start_pos) { break; } // Find end of HTML tag. // As above for $start_pos, $end_pos cannot be 0 since it must be greater // than $start_pos. So, no need to check for FALSE strictly. $end_pos = mb_strpos($text, '>', $start_pos + 1); // Extract the contents of the tag, and add it to the processed text. $tag_contents = mb_substr($text, $start_pos, $end_pos ? $end_pos + 1 - $start_pos : NULL); $processed_text .= $tag_contents; if (!$end_pos) { break; } // Next, we want to begin searching right after the end of this HTML tag. $pos = $end_pos + 1; // Split the tag contents, without the angle brackets, into the element // name and the rest. $tag_contents = trim($tag_contents, '<> '); [$element_name, $tag_contents] = explode(' ', $tag_contents, 2) + [1 => NULL]; // If there is just the element name, no need to look for attributes. if (!$tag_contents) { continue; } // This will match all the attributes we're looking for. $attr_regex = '(?:' . implode('|', $handled_attributes) . ')'; $pattern = "/(?:^|\s)$attr_regex\s*+=\s*+(['\"])/Su"; $flags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER; if (preg_match_all($pattern, $tag_contents, $matches, $flags)) { foreach ($matches as $match) { // Now just extract the attribute value as everything between the // matched quote character and the next such character. // Unfortunately, preg_match_all() reports positions in bytes, not // characters, so we need to use a bit of magic to reconcile this with // our usual handling of Unicode. $quote_char = $match[1][0]; /** @var int $quote_pos */ $quote_pos = $match[1][1]; $tag_contents_from_quote = substr($tag_contents, $quote_pos + 1); $length = mb_strpos($tag_contents_from_quote, $quote_char); $attr_value = mb_substr($tag_contents_from_quote, 0, $length); // Take care of self-closing tags, so users are still able to set a // boost for, for instance, the "alt" attribute from an "img" tag. if ($tag_contents[-1] === '/') { $attr_value = " <$element_name> $attr_value </$element_name>"; } $processed_text .= ' ' . $attr_value; $dom = Html::load($text); $xpath = new \DOMXPath($dom); /** @var \DOMElement $node */ foreach ($xpath->query(implode('|', $xpath_expr)) as $node) { foreach ($handled_attributes as $attr_name) { $attr = $node->attributes?->getNamedItem($attr_name); if ($attr !== NULL) { $node->prepend(" {$attr->textContent} "); } } } return $processed_text; return Html::serialize($dom); } /** Loading
tests/src/Unit/Processor/HtmlFilterTest.php +71 −39 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ namespace Drupal\Tests\search_api\Unit\Processor; use Drupal\Component\Utility\Random; use Drupal\search_api\IndexInterface; use Drupal\search_api\Item\Field; use Drupal\search_api\Plugin\search_api\data_type\value\TextToken; Loading Loading @@ -53,8 +54,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => FALSE, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -95,8 +95,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => $alt_config, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -150,14 +149,6 @@ class HtmlFilterTest extends UnitTestCase { ], TRUE, ], // Test fault tolerance. [ 'a < b', [ Utility::createTextToken('a < b'), ], TRUE, ], ]; } Loading @@ -180,8 +171,7 @@ class HtmlFilterTest extends UnitTestCase { 'alt' => TRUE, ]; $this->processor->setConfiguration($configuration); $type = 'text'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'text']); $this->assertEquals($expected_value, $passed_value); } Loading @@ -192,7 +182,32 @@ class HtmlFilterTest extends UnitTestCase { * An array of argument arrays for testTagConfiguration(). */ public function tagConfigurationDataProvider() { $complex_test = [ $tags_config = ['h2' => '2']; return [ ['h2word', 'h2word', []], ['h2word', [Utility::createTextToken('h2word')], $tags_config], [ 'foo bar <h2> h2word </h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ 'foo bar <h2>h2word</h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ '<div>word</div>', [Utility::createTextToken('word', 2)], ['div' => 2], ], [ '<h2>Foo Bar <em>Baz</em></h2> <p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p> Loading @@ -213,33 +228,18 @@ class HtmlFilterTest extends UnitTestCase { 'img' => 0.5, 'span' => 0, ], ]; $tags_config = ['h2' => '2']; return [ ['h2word', 'h2word', []], ['h2word', [Utility::createTextToken('h2word')], $tags_config], [ 'foo bar <h2> h2word </h2>', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, ], [ 'foo bar <h2>h2word</h2>', 'foo <img src="img.png" alt="image" title = "check this out" /> bar', [ Utility::createTextToken('foo bar'), Utility::createTextToken('h2word', 2.0), ], $tags_config, Utility::createTextToken('foo', 1.0), Utility::createTextToken('check this out image', 0.5), Utility::createTextToken('bar', 1.0), ], [ '<div>word</div>', [Utility::createTextToken('word', 2)], ['div' => 2], 'img' => 0.5, ], ], $complex_test, ]; } Loading @@ -263,8 +263,7 @@ class HtmlFilterTest extends UnitTestCase { <span>This is hidden</span>'; $expected_value = preg_replace('/\s+/', ' ', strip_tags($passed_value)); $type = 'string'; $this->invokeMethod('processFieldValue', [&$passed_value, $type]); $this->invokeMethod('processFieldValue', [&$passed_value, 'string']); $this->assertEquals($expected_value, $passed_value); } Loading Loading @@ -352,4 +351,37 @@ class HtmlFilterTest extends UnitTestCase { $this->assertEquals([], $field->getValues()); } /** * Tests that attribute handling is still fast even for large text values. * * @see https://www.drupal.org/project/search_api/issues/3388678 */ public function testLargeTextAttributesHandling(): void { $this->processor->setConfiguration([ 'tags' => [ 'em' => 1.5, 'strong' => 2.0, 'h2' => 3.0, 'img' => 0.5, 'span' => 0, ], 'title' => TRUE, 'alt' => TRUE, ]); $text = ''; $random = new Random(); for ($i = 0; $i < 2000; ++$i) { $text .= ' ' . htmlspecialchars($random->sentences(10)); $tag = $random->name(); $attr = $random->name(); $value = htmlspecialchars($random->word(12)); $contents = htmlspecialchars($random->sentences(10)); $text .= " <$tag $attr=\"$value\">$contents</$tag>"; } $start = microtime(TRUE); $this->invokeMethod('processFieldValue', [&$text, 'text']); $took = microtime(TRUE) - $start; $this->assertLessThan(1.0, $took, 'Processing large field value took too long.'); } }