Commit e970f06f authored by Thomas Seidl's avatar Thomas Seidl
Browse files

Issue #3388678 by drunken monkey, benjifisher: Fixed performance regression in...

Issue #3388678 by drunken monkey, benjifisher: Fixed performance regression in HTML Filter processor.
parent a094dd61
Loading
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
Search API 1.x, dev (xxxx-xx-xx):
---------------------------------
- #3388678 by drunken monkey, benjifisher: Fixed performance regression in HTML
  Filter processor.
- #3398192 by drunken monkey, BenStallings, admirlju: Fixed warning for
  undefined array key "type" in SearchApiDate::opBetween().
- #3394189 by drunken monkey: Dropped support for Drupal 9.
+11 −63
Original line number Diff line number Diff line
@@ -223,82 +223,30 @@ class HtmlFilter extends FieldsProcessorPluginBase {
   */
  protected function handleAttributes(string $text): string {
    // Determine which attributes should be indexed and bail early if it's none.
    $handled_attributes = [];
    $handled_attributes = $xpath_expr = [];
    foreach (['alt', 'title'] as $attr) {
      if ($this->configuration[$attr]) {
        $handled_attributes[] = $attr;
        $xpath_expr[] = "//*[@$attr]";
      }
    }
    if (!$handled_attributes) {
      return $text;
    }

    $processed_text = '';
    $pos = 0;
    $text_len = mb_strlen($text);
    // Go through the whole text, looking for HTML tags.
    while ($pos < $text_len) {
      // Find start of HTML tag.
      // Since there is always a space in front of a "<" character, we do not
      // need to write "$start_pos === FALSE" explicitly to check for a match.
      $start_pos = mb_strpos($text, '<', $pos);
      // Add everything from the last position to this start tag (or the end of
      // the string, if we found none) to the processed text.
      $processed_text .= mb_substr($text, $pos, $start_pos ? $start_pos - $pos : NULL);
      if (!$start_pos) {
        break;
      }

      // Find end of HTML tag.
      // As above for $start_pos, $end_pos cannot be 0 since it must be greater
      // than $start_pos. So, no need to check for FALSE strictly.
      $end_pos = mb_strpos($text, '>', $start_pos + 1);
      // Extract the contents of the tag, and add it to the processed text.
      $tag_contents = mb_substr($text, $start_pos, $end_pos ? $end_pos + 1 - $start_pos : NULL);
      $processed_text .= $tag_contents;
      if (!$end_pos) {
        break;
      }
      // Next, we want to begin searching right after the end of this HTML tag.
      $pos = $end_pos + 1;

      // Split the tag contents, without the angle brackets, into the element
      // name and the rest.
      $tag_contents = trim($tag_contents, '<> ');
      [$element_name, $tag_contents] = explode(' ', $tag_contents, 2) + [1 => NULL];
      // If there is just the element name, no need to look for attributes.
      if (!$tag_contents) {
        continue;
      }

      // This will match all the attributes we're looking for.
      $attr_regex = '(?:' . implode('|', $handled_attributes) . ')';
      $pattern = "/(?:^|\s)$attr_regex\s*+=\s*+(['\"])/Su";
      $flags = PREG_OFFSET_CAPTURE | PREG_SET_ORDER;
      if (preg_match_all($pattern, $tag_contents, $matches, $flags)) {
        foreach ($matches as $match) {
          // Now just extract the attribute value as everything between the
          // matched quote character and the next such character.
          // Unfortunately, preg_match_all() reports positions in bytes, not
          // characters, so we need to use a bit of magic to reconcile this with
          // our usual handling of Unicode.
          $quote_char = $match[1][0];
          /** @var int $quote_pos */
          $quote_pos = $match[1][1];
          $tag_contents_from_quote = substr($tag_contents, $quote_pos + 1);
          $length = mb_strpos($tag_contents_from_quote, $quote_char);
          $attr_value = mb_substr($tag_contents_from_quote, 0, $length);
          // Take care of self-closing tags, so users are still able to set a
          // boost for, for instance, the "alt" attribute from an "img" tag.
          if ($tag_contents[-1] === '/') {
            $attr_value = " <$element_name> $attr_value </$element_name>";
          }
          $processed_text .= ' ' . $attr_value;
    $dom = Html::load($text);
    $xpath = new \DOMXPath($dom);
    /** @var \DOMElement $node */
    foreach ($xpath->query(implode('|', $xpath_expr)) as $node) {
      foreach ($handled_attributes as $attr_name) {
        $attr = $node->attributes?->getNamedItem($attr_name);
        if ($attr !== NULL) {
          $node->prepend(" {$attr->textContent} ");
        }
      }
    }

    return $processed_text;
    return Html::serialize($dom);
  }

  /**
+71 −39
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

namespace Drupal\Tests\search_api\Unit\Processor;

use Drupal\Component\Utility\Random;
use Drupal\search_api\IndexInterface;
use Drupal\search_api\Item\Field;
use Drupal\search_api\Plugin\search_api\data_type\value\TextToken;
@@ -53,8 +54,7 @@ class HtmlFilterTest extends UnitTestCase {
      'alt' => FALSE,
    ];
    $this->processor->setConfiguration($configuration);
    $type = 'text';
    $this->invokeMethod('processFieldValue', [&$passed_value, $type]);
    $this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
    $this->assertEquals($expected_value, $passed_value);
  }

@@ -95,8 +95,7 @@ class HtmlFilterTest extends UnitTestCase {
      'alt' => $alt_config,
    ];
    $this->processor->setConfiguration($configuration);
    $type = 'text';
    $this->invokeMethod('processFieldValue', [&$passed_value, $type]);
    $this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
    $this->assertEquals($expected_value, $passed_value);
  }

@@ -150,14 +149,6 @@ class HtmlFilterTest extends UnitTestCase {
        ],
        TRUE,
      ],
      // Test fault tolerance.
      [
        'a < b',
        [
          Utility::createTextToken('a < b'),
        ],
        TRUE,
      ],
    ];
  }

@@ -180,8 +171,7 @@ class HtmlFilterTest extends UnitTestCase {
      'alt' => TRUE,
    ];
    $this->processor->setConfiguration($configuration);
    $type = 'text';
    $this->invokeMethod('processFieldValue', [&$passed_value, $type]);
    $this->invokeMethod('processFieldValue', [&$passed_value, 'text']);
    $this->assertEquals($expected_value, $passed_value);
  }

@@ -192,7 +182,32 @@ class HtmlFilterTest extends UnitTestCase {
   *   An array of argument arrays for testTagConfiguration().
   */
  public function tagConfigurationDataProvider() {
    $complex_test = [
    $tags_config = ['h2' => '2'];
    return [
      ['h2word', 'h2word', []],
      ['h2word', [Utility::createTextToken('h2word')], $tags_config],
      [
        'foo bar <h2> h2word </h2>',
        [
          Utility::createTextToken('foo bar'),
          Utility::createTextToken('h2word', 2.0),
        ],
        $tags_config,
      ],
      [
        'foo bar <h2>h2word</h2>',
        [
          Utility::createTextToken('foo bar'),
          Utility::createTextToken('h2word', 2.0),
        ],
        $tags_config,
      ],
      [
        '<div>word</div>',
        [Utility::createTextToken('word', 2)],
        ['div' => 2],
      ],
      [
        '<h2>Foo Bar <em>Baz</em></h2>

          <p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p>
@@ -213,33 +228,18 @@ class HtmlFilterTest extends UnitTestCase {
          'img' => 0.5,
          'span' => 0,
        ],
    ];
    $tags_config = ['h2' => '2'];
    return [
      ['h2word', 'h2word', []],
      ['h2word', [Utility::createTextToken('h2word')], $tags_config],
      [
        'foo bar <h2> h2word </h2>',
        [
          Utility::createTextToken('foo bar'),
          Utility::createTextToken('h2word', 2.0),
        ],
        $tags_config,
      ],
      [
        'foo bar <h2>h2word</h2>',
        'foo <img src="img.png" alt="image" title = "check this out" /> bar',
        [
          Utility::createTextToken('foo bar'),
          Utility::createTextToken('h2word', 2.0),
        ],
        $tags_config,
          Utility::createTextToken('foo', 1.0),
          Utility::createTextToken('check this out image', 0.5),
          Utility::createTextToken('bar', 1.0),
        ],
        [
        '<div>word</div>',
        [Utility::createTextToken('word', 2)],
        ['div' => 2],
          'img' => 0.5,
        ],
      ],
      $complex_test,
    ];
  }

@@ -263,8 +263,7 @@ class HtmlFilterTest extends UnitTestCase {
<span>This is hidden</span>';
    $expected_value = preg_replace('/\s+/', ' ', strip_tags($passed_value));

    $type = 'string';
    $this->invokeMethod('processFieldValue', [&$passed_value, $type]);
    $this->invokeMethod('processFieldValue', [&$passed_value, 'string']);
    $this->assertEquals($expected_value, $passed_value);
  }

@@ -352,4 +351,37 @@ class HtmlFilterTest extends UnitTestCase {
    $this->assertEquals([], $field->getValues());
  }

  /**
   * Tests that attribute handling is still fast even for large text values.
   *
   * @see https://www.drupal.org/project/search_api/issues/3388678
   */
  public function testLargeTextAttributesHandling(): void {
    $this->processor->setConfiguration([
      'tags' => [
        'em' => 1.5,
        'strong' => 2.0,
        'h2' => 3.0,
        'img' => 0.5,
        'span' => 0,
      ],
      'title' => TRUE,
      'alt' => TRUE,
    ]);
    $text = '';
    $random = new Random();
    for ($i = 0; $i < 2000; ++$i) {
      $text .= ' ' . htmlspecialchars($random->sentences(10));
      $tag = $random->name();
      $attr = $random->name();
      $value = htmlspecialchars($random->word(12));
      $contents = htmlspecialchars($random->sentences(10));
      $text .= " <$tag $attr=\"$value\">$contents</$tag>";
    }
    $start = microtime(TRUE);
    $this->invokeMethod('processFieldValue', [&$text, 'text']);
    $took = microtime(TRUE) - $start;
    $this->assertLessThan(1.0, $took, 'Processing large field value took too long.');
  }

}