<?php

namespace Drupal\search;

use Drupal\Component\Utility\Unicode;
use Drupal\Core\Database\Query\Condition;
use Drupal\Core\Database\Query\SelectExtender;
use Drupal\Core\Database\Query\SelectInterface;

/**
 * Search query extender and helper functions.
 *
 * Performs a query on the full-text search index for a word or words.
 *
 * This query is used by search plugins that use the search index (not all
 * search plugins do, as some use a different searching mechanism). It
 * assumes you have set up a query on the {search_index} table with alias 'i',
 * and will only work if the user is searching for at least one "positive"
 * keyword or phrase.
 *
 * For efficiency, users of this query can run the prepareAndNormalize()
 * method to figure out if there are any search results, before fully setting
 * up and calling execute() to execute the query. The scoring expressions are
 * not needed until the execute() step. However, it's not really necessary
 * to do this, because this class's execute() method does that anyway.
 *
 * During both the prepareAndNormalize() and execute() steps, there can be
 * problems. Call getStatus() to figure out if the query is OK or not.
 *
 * The query object is given the tag 'search_$type' and can be further
 * extended with hook_query_alter().
 */
class SearchQuery extends SelectExtender {

  /**
   * Indicates no positive keywords were in the search expression.
   *
   * Positive keywords are words that are searched for, as opposed to negative
   * keywords, which are words that are excluded. To count as a keyword, a
   * word must be at least
   * \Drupal::config('search.settings')->get('index.minimum_word_size')
   * characters.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_POSITIVE_KEYWORDS = 1;

  /**
   * Indicates that part of the search expression was ignored.
   *
   * To prevent Denial of Service attacks, only
   * \Drupal::config('search.settings')->get('and_or_limit') expressions
   * (positive keywords, phrases, negative keywords) are allowed; this flag
   * indicates that expressions existed past that limit and they were removed.
   *
   * @see SearchQuery::getStatus()
   */
  const EXPRESSIONS_IGNORED = 2;

  /**
   * Indicates that lower-case "or" was in the search expression.
   *
   * The word "or" in lower case was found in the search expression. This
   * probably means someone was trying to do an OR search but used lower-case
   * instead of upper-case.
   *
   * @see SearchQuery::getStatus()
   */
  const LOWER_CASE_OR = 4;

  /**
   * Indicates that no positive keyword matches were found.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_KEYWORD_MATCHES = 8;

  /**
   * The keywords and advanced search options that are entered by the user.
   *
   * @var string
   */
  protected $searchExpression;

  /**
   * The type of search (search type).
   *
   * This maps to the value of the type column in search_index, and is usually
   * equal to the machine-readable name of the plugin or the search page.
   *
   * @var string
   */
  protected $type;

  /**
   * Parsed-out positive and negative search keys.
   *
   * Each entry of 'positive' is either a single keyword/phrase string (ANDed)
   * or an array of keyword/phrase strings (a group of ORed terms).
   *
   * @var array
   */
  protected $keys = ['positive' => [], 'negative' => []];

  /**
   * Indicates whether the query conditions are simple or complex (LIKE).
   *
   * @var bool
   */
  protected $simple = TRUE;

  /**
   * Conditions that are used for exact searches.
   *
   * This is always used for the second step in the query, but is not part of
   * the preparation step unless $this->simple is FALSE.
   *
   * @var \Drupal\Core\Database\Query\Condition
   */
  protected $conditions;

  /**
   * Indicates how many matches for a search query are necessary.
   *
   * @var int
   */
  protected $matches = 0;

  /**
   * Array of positive search words.
   *
   * These words have to match against {search_index}.word. The array is keyed
   * by the word itself, so each distinct word is stored once.
   *
   * @var array
   */
  protected $words = [];

  /**
   * Multiplier to normalize the keyword score.
   *
   * This value is calculated by the preparation step, and is used as a
   * multiplier of the word scores to make sure they are between 0 and 1.
   *
   * @var float
   */
  protected $normalize = 0;

  /**
   * Indicates whether the preparation step has been executed.
   *
   * @var bool
   */
  protected $executedPrepare = FALSE;

  /**
   * A bitmap of status conditions, described in getStatus().
   *
   * @var int
   *
   * @see SearchQuery::getStatus()
   */
  protected $status = 0;

  /**
   * The word score expressions.
   *
   * @var array
   *
   * @see SearchQuery::addScore()
   */
  protected $scores = [];

  /**
   * Arguments for the score expressions.
   *
   * @var array
   */
  protected $scoresArguments = [];

  /**
   * The number of 'i.relevance' occurrences in score expressions.
   *
   * @var int
   */
  protected $relevance_count = 0;

  /**
   * Multipliers for score expressions.
   *
   * @var array
   */
  protected $multiply = [];

  /**
   * Sets the search query expression.
   *
   * Besides storing the expression and type, this tags the query with
   * 'search_' . $type and resets the exact-match conditions and the status
   * bitmap.
   *
   * @param string $expression
   *   A search string, which can contain keywords and options.
   * @param string $type
   *   The search type. This maps to {search_index}.type in the database.
   *
   * @return $this
   */
  public function searchExpression($expression, $type) {
    $this->searchExpression = $expression;
    $this->type = $type;

    // Add query tag.
    $this->addTag('search_' . $type);

    // Initialize conditions and status.
    $this->conditions = new Condition('AND');
    $this->status = 0;

    return $this;
  }

  /**
   * Parses the search query into SQL conditions.
   *
   * Sets up the following variables:
   * - $this->keys
   * - $this->words
   * - $this->conditions
   * - $this->simple
   * - $this->matches
   *
   * May also set the EXPRESSIONS_IGNORED and LOWER_CASE_OR flags in
   * $this->status.
   */
  protected function parseSearchExpression() {
    // Matches words optionally prefixed by a - sign. A word in this case is
    // something between two spaces, optionally quoted.
    preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression, $keywords, PREG_SET_ORDER);

    if (count($keywords) == 0) {
      return;
    }

    // Classify tokens.
    $in_or = FALSE;
    $limit_combinations = \Drupal::config('search.settings')->get('and_or_limit');
    // The first search expression does not count as AND.
    $and_count = -1;
    $or_count = 0;
    foreach ($keywords as $match) {
      if ($or_count && $and_count + $or_count >= $limit_combinations) {
        // Ignore all further search expressions to prevent Denial-of-Service
        // attacks using a high number of AND/OR combinations.
        $this->status |= SearchQuery::EXPRESSIONS_IGNORED;
        break;
      }

      // Strip off phrase quotes. The regular expression above guarantees that
      // $match[2] is non-empty, so reading offset 0 is safe. Square brackets
      // are used because curly-brace string offsets are not valid in PHP 8.
      $phrase = FALSE;
      if ($match[2][0] == '"') {
        $match[2] = substr($match[2], 1, -1);
        $phrase = TRUE;
        $this->simple = FALSE;
      }

      // Simplify keyword according to indexing rules and external
      // preprocessors. Use same process as during search indexing, so it
      // will match search index.
      $words = search_simplify($match[2]);
      // Re-explode in case simplification added more words, except when
      // matching a phrase.
      $words = $phrase ? [$words] : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
      // Negative matches.
      if ($match[1] == '-') {
        $this->keys['negative'] = array_merge($this->keys['negative'], $words);
      }
      // OR operator: instead of a single keyword, we store an array of all
      // OR'd keywords.
      elseif ($match[2] == 'OR' && count($this->keys['positive'])) {
        $last = array_pop($this->keys['positive']);
        // Starting a new OR?
        if (!is_array($last)) {
          $last = [$last];
        }
        $this->keys['positive'][] = $last;
        $in_or = TRUE;
        $or_count++;
        continue;
      }
      // AND operator: implied, so just ignore it.
      elseif ($match[2] == 'AND' || $match[2] == 'and') {
        continue;
      }

      // Plain keyword.
      else {
        if ($match[2] == 'or') {
          // Lower-case "or" instead of "OR" is a warning condition.
          $this->status |= SearchQuery::LOWER_CASE_OR;
        }
        if ($in_or) {
          // Add to last element (which is an array).
          $last_index = count($this->keys['positive']) - 1;
          $this->keys['positive'][$last_index] = array_merge($this->keys['positive'][$last_index], $words);
        }
        else {
          $this->keys['positive'] = array_merge($this->keys['positive'], $words);
          $and_count++;
        }
      }
      $in_or = FALSE;
    }

    // Convert keywords into SQL statements.
    $has_and = FALSE;
    $has_or = FALSE;
    // Positive matches.
    foreach ($this->keys['positive'] as $key) {
      // Group of ORed terms.
      if (is_array($key) && count($key)) {
        // If we had already found one OR, this is another one AND-ed with the
        // first, meaning it is not a simple query.
        if ($has_or) {
          $this->simple = FALSE;
        }
        $has_or = TRUE;
        $has_new_scores = FALSE;
        $queryor = new Condition('OR');
        foreach ($key as $or) {
          list($num_new_scores) = $this->parseWord($or);
          // Remember whether any term of the group contributed a new word.
          $has_new_scores = $has_new_scores || $num_new_scores > 0;
          $queryor->condition('d.data', "% $or %", 'LIKE');
        }
        if (count($queryor)) {
          $this->conditions->condition($queryor);
          // A group of OR keywords only needs to match once.
          if ($has_new_scores) {
            $this->matches++;
          }
        }
      }
      // Single ANDed term.
      else {
        $has_and = TRUE;
        list($num_new_scores, $num_valid_words) = $this->parseWord($key);
        $this->conditions->condition('d.data', "% $key %", 'LIKE');
        if (!$num_valid_words) {
          $this->simple = FALSE;
        }
        // Each AND keyword needs to match at least once.
        $this->matches += $num_new_scores;
      }
    }
    if ($has_and && $has_or) {
      $this->simple = FALSE;
    }

    // Negative matches.
    foreach ($this->keys['negative'] as $key) {
      $this->conditions->condition('d.data', "% $key %", 'NOT LIKE');
      $this->simple = FALSE;
    }
  }

  /**
   * Parses a word or phrase for parseSearchExpression().
   *
   * Splits a phrase into words on single spaces. A word is valid if it is
   * numeric or at least 'index.minimum_word_size' (from search.settings)
   * characters long. Valid words are added to $this->words if not already
   * there.
   *
   * @param string $word
   *   The word or phrase to parse.
   *
   * @return array
   *   A two-element list: the number of valid words newly added to
   *   $this->words, and the total number of valid words in the phrase.
   */
  protected function parseWord($word) {
    $num_new_scores = 0;
    $num_valid_words = 0;
    // The minimum size does not change while looping, so read it only once.
    $minimum_word_size = \Drupal::config('search.settings')->get('index.minimum_word_size');

    // Determine the scorewords of this word/phrase.
    $split = explode(' ', $word);
    foreach ($split as $s) {
      $num = is_numeric($s);
      if ($num || Unicode::strlen($s) >= $minimum_word_size) {
        if (!isset($this->words[$s])) {
          $this->words[$s] = $s;
          $num_new_scores++;
        }
        $num_valid_words++;
      }
    }

    // Return the number of added words and the number of valid words.
    return [$num_new_scores, $num_valid_words];
  }

  /**
   * Prepares the query and calculates the normalization factor.
   *
   * After the query is normalized the keywords are weighted to give the results
   * a relevancy score. The query is ready for execution after this.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return bool
   *   TRUE if at least one keyword matched the search index; FALSE if not.
   */
  public function prepareAndNormalize() {
    $this->parseSearchExpression();
    $this->executedPrepare = TRUE;

    if (count($this->words) == 0) {
      // Although the query could proceed, there is no point in joining
      // with other tables and attempting to normalize if there are no
      // keywords present.
      $this->status |= SearchQuery::NO_POSITIVE_KEYWORDS;
      return FALSE;
    }

    // Build the basic search query: match the entered keywords.
    $or = new Condition('OR');
    foreach ($this->words as $word) {
      $or->condition('i.word', $word);
    }
    $this->condition($or);

    // Add keyword normalization information to the query.
    $this->join('search_total', 't', 'i.word = t.word');
    $this
      ->condition('i.type', $this->type)
      ->groupBy('i.type')
      ->groupBy('i.sid');

    // If the query is simple, we should have calculated the number of
    // matching words we need to find, so impose that criterion. For non-
    // simple queries, this condition could lead to incorrectly deciding not
    // to continue with the full query.
    if ($this->simple) {
      $this->having('COUNT(*) >= :matches', [':matches' => $this->matches]);
    }

    // Clone the query object to calculate normalization.
    $normalize_query = clone $this->query;

    // For complex search queries, add the LIKE conditions; if the query is
    // simple, we do not need them for normalization.
    if (!$this->simple) {
      $normalize_query->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
      if (count($this->conditions)) {
        $normalize_query->condition($this->conditions);
      }
    }

    // Calculate normalization, which is the max of all the search scores for
    // positive keywords in the query. And note that the query could have other
    // fields added to it by the user of this extension.
    $normalize_query->addExpression('SUM(i.score * t.count)', 'calculated_score');
    $result = $normalize_query
      ->range(0, 1)
      ->orderBy('calculated_score', 'DESC')
      ->execute()
      ->fetchObject();
    if (isset($result->calculated_score)) {
      $this->normalize = (float) $result->calculated_score;
    }

    if ($this->normalize) {
      return TRUE;
    }

    // If the normalization value was zero, that indicates there were no
    // matches to the supplied positive keywords.
    $this->status |= SearchQuery::NO_KEYWORD_MATCHES;
    return FALSE;
  }

  /**
   * {@inheritdoc}
   *
   * Runs prepareAndNormalize() first if it has not been run yet, and returns
   * FALSE without calling the parent if the normalization factor is zero.
   */
  public function preExecute(SelectInterface $query = NULL) {
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    if (!$this->normalize) {
      return FALSE;
    }

    return parent::preExecute($query);
  }

  /**
   * Adds a custom score expression to the search query.
   *
   * Score expressions are used to order search results. If no calls to
   * addScore() have taken place, a default keyword relevance score will be
   * used. However, if at least one call to addScore() has taken place, the
   * keyword relevance score is not automatically added.
   *
   * Note that you must use this method to add ordering to your searches, and
   * not call orderBy() directly, when using the SearchQuery extender. This is
   * because of the two-pass system the SearchQuery class uses to normalize
   * scores.
   *
   * @param string $score
   *   The score expression, which should evaluate to a number between 0 and 1.
   *   The string 'i.relevance' in a score expression will be replaced by a
   *   measure of keyword relevance between 0 and 1.
   * @param array $arguments
   *   Query arguments needed to provide values to the score expression.
   * @param float $multiply
   *   If set, the score is multiplied with this value. However, all scores
   *   with multipliers are then divided by the total of all multipliers, so
   *   that overall, the normalization is maintained.
   *
   * @return $this
   */
  public function addScore($score, $arguments = [], $multiply = FALSE) {
    if ($multiply) {
      $i = count($this->multiply);
      // Modify the score expression so it is multiplied by the multiplier,
      // with a divisor to renormalize. Note that the ROUND here is necessary
      // for PostgreSQL and SQLite in order to ensure that the :multiply_* and
      // :total_* arguments are treated as a numeric type, because the
      // PostgreSQL PDO driver sometimes puts values in as strings instead of
      // numbers in complex expressions like this.
      $score = "(ROUND(:multiply_$i, 4)) * COALESCE(($score), 0) / (ROUND(:total_$i, 4))";
      // Add an argument for the multiplier. The :total_$i argument is taken
      // care of in the execute() method, which is when the total divisor is
      // calculated.
      $arguments[':multiply_' . $i] = $multiply;
      $this->multiply[] = $multiply;
    }

    // Search scoring needs a way to include a keyword relevance in the score.
    // For historical reasons, this is done by putting 'i.relevance' into the
    // search expression. So, use string replacement to change this to a
    // calculated query expression, counting the number of occurrences so
    // in the execute() method we can add arguments. Each occurrence gets its
    // own :normalization_N placeholder, so they are replaced one at a time.
    while (strpos($score, 'i.relevance') !== FALSE) {
      $pieces = explode('i.relevance', $score, 2);
      $score = implode('((ROUND(:normalization_' . $this->relevance_count . ', 4)) * i.score * t.count)', $pieces);
      $this->relevance_count++;
    }

    $this->scores[] = $score;
    // Array union: arguments already registered under the same placeholder
    // name are kept.
    $this->scoresArguments += $arguments;

    return $this;
  }

  /**
   * Executes the search.
   *
   * The complex conditions are applied to the query including score
   * expressions and ordering.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return \Drupal\Core\Database\StatementInterface|null
   *   A query result set containing the results of the query, or NULL if
   *   preExecute() returned a false value.
   */
  public function execute() {
    if (!$this->preExecute($this)) {
      return NULL;
    }

    // Add conditions to the query.
    $this->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $this->condition($this->conditions);
    }

    // Add default score (keyword relevance) if there are not any defined.
    if (empty($this->scores)) {
      $this->addScore('i.relevance');
    }

    $multiplier_count = count($this->multiply);
    if ($multiplier_count) {
      // Re-normalize scores with multipliers by dividing by the total of all
      // multipliers. The expressions were altered in addScore(), so here just
      // add the arguments for the total.
      $sum = array_sum($this->multiply);
      for ($i = 0; $i < $multiplier_count; $i++) {
        $this->scoresArguments[':total_' . $i] = $sum;
      }
    }

    // Add arguments for the keyword relevance normalization number.
    // $this->normalize is non-zero here; preExecute() returns FALSE otherwise.
    $normalization = 1.0 / $this->normalize;
    for ($i = 0; $i < $this->relevance_count; $i++) {
      $this->scoresArguments[':normalization_' . $i] = $normalization;
    }

    // Add all scores together to form a query field.
    $this->addExpression('SUM(' . implode(' + ', $this->scores) . ')', 'calculated_score', $this->scoresArguments);

    // If an order has not yet been set for this query, add a default order
    // that sorts by the calculated sum of scores.
    if (count($this->getOrderBy()) == 0) {
      $this->orderBy('calculated_score', 'DESC');
    }

    // Add query metadata.
    $this
      ->addMetaData('normalize', $this->normalize)
      ->fields('i', ['type', 'sid']);
    return $this->query->execute();
  }

  /**
   * Builds the default count query for SearchQuery.
   *
   * Since SearchQuery always uses GROUP BY, we can default to a subquery. We
   * also add the same conditions as execute() because countQuery() is called
   * first. The join on {search_dataset} uses the same sid, type and langcode
   * columns as execute(), so that the count is taken over the same rows.
   *
   * @return \Drupal\Core\Database\Query\SelectInterface
   *   A query that counts the rows of the grouped inner search query.
   */
  public function countQuery() {
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    // Clone the inner query.
    $inner = clone $this->query;

    // Add conditions to query.
    $inner->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $inner->condition($this->conditions);
    }

    // Remove existing fields and expressions, they are not needed for a count
    // query.
    $fields =& $inner->getFields();
    $fields = [];
    $expressions =& $inner->getExpressions();
    $expressions = [];

    // Add sid as the only field and count them as a subquery.
    $count = db_select($inner->fields('i', ['sid']), NULL, ['target' => 'replica']);

    // Add the COUNT() expression.
    $count->addExpression('COUNT(*)');

    return $count;
  }

  /**
   * Returns the query status bitmap.
   *
   * @return int
   *   A bitmap indicating query status. Zero indicates there were no problems.
   *   A non-zero value is a combination of one or more of the following flags:
   *   - SearchQuery::NO_POSITIVE_KEYWORDS
   *   - SearchQuery::EXPRESSIONS_IGNORED
   *   - SearchQuery::LOWER_CASE_OR
   *   - SearchQuery::NO_KEYWORD_MATCHES
   */
  public function getStatus() {
    return $this->status;
  }

}