<?php

/**
 * @file
 * Definition of Drupal\search\SearchQuery.
 *
 * Search query extender and helper functions.
 */

namespace Drupal\search;

use Drupal\Core\Database\Query\SelectExtender;
use Drupal\Core\Database\StatementEmpty;

/**
 * Performs a query on the full-text search index for a word or words.
 *
 * This query is used by search plugins that use the search index (not all
 * search plugins do, as some use a different searching mechanism). It
 * assumes you have set up a query on the {search_index} table with alias 'i',
 * and will only work if the user is searching for at least one "positive"
 * keyword or phrase.
 *
 * For efficiency, users of this query can run the prepareAndNormalize()
 * method to figure out if there are any search results, before fully setting
 * up and calling execute() to execute the query. The scoring expressions are
 * not needed until the execute() step. However, it's not really necessary
 * to do this, because this class's execute() method does that anyway.
 *
 * During both the prepareAndNormalize() and execute() steps, there can be
 * problems. Call getStatus() to figure out if the query is OK or not.
 *
 * The query object is given the tag 'search_$type' and can be further
 * extended with hook_query_alter().
 */
class SearchQuery extends SelectExtender {

  /**
   * Indicates no positive keywords were in the search expression.
   *
   * Positive keywords are words that are searched for, as opposed to negative
   * keywords, which are words that are excluded. To count as a keyword, a
   * word must be numeric or at least
   * \Drupal::config('search.settings')->get('index.minimum_word_size')
   * characters long.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_POSITIVE_KEYWORDS = 1;

  /**
   * Indicates that part of the search expression was ignored.
   *
   * To prevent Denial of Service attacks, only
   * \Drupal::config('search.settings')->get('and_or_limit') expressions
   * (positive keywords, phrases, negative keywords) are allowed; this flag
   * indicates that expressions existed past that limit and they were removed.
   *
   * @see SearchQuery::getStatus()
   */
  const EXPRESSIONS_IGNORED = 2;

  /**
   * Indicates that lower-case "or" was in the search expression.
   *
   * The word "or" in lower case was found in the search expression. This
   * probably means someone was trying to do an OR search but used lower-case
   * instead of upper-case.
   *
   * @see SearchQuery::getStatus()
   */
  const LOWER_CASE_OR = 4;

  /**
   * Indicates that no positive keyword matches were found.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_KEYWORD_MATCHES = 8;

  /**
   * The keywords and advanced search options that are entered by the user.
   *
   * @var string
   */
  protected $searchExpression;

  /**
   * The type of search (search type).
   *
   * This maps to the value of the type column in search_index, and is usually
   * equal to the machine-readable name of the plugin or the search page.
   *
   * @var string
   */
  protected $type;

  /**
   * Parsed-out positive and negative search keys.
   *
   * Each entry of 'positive' is either a single keyword/phrase (ANDed) or an
   * array of keywords/phrases that are ORed together.
   *
   * @var array
   */
  protected $keys = array('positive' => array(), 'negative' => array());

  /**
   * Indicates whether the query conditions are simple or complex (LIKE).
   *
   * @var bool
   */
  protected $simple = TRUE;

  /**
   * Conditions that are used for exact searches.
   *
   * This is always used for the second step in the query, but is not part of
   * the preparation step unless $this->simple is FALSE. It is initialized
   * with db_and() in searchExpression().
   *
   * @var DatabaseCondition
   */
  protected $conditions;

  /**
   * Indicates how many matches for a search query are necessary.
   *
   * @var int
   */
  protected $matches = 0;

  /**
   * Array of positive search words.
   *
   * These words have to match against {search_index}.word. The array is keyed
   * by the word itself, so each word appears only once.
   *
   * @var array
   */
  protected $words = array();

  /**
   * The highest keyword score found, used to normalize the keyword score.
   *
   * This value is calculated by the preparation step. In execute(), word
   * scores are multiplied by its inverse to make sure they are between 0 and
   * 1. A value of zero means that no keyword matches were found.
   *
   * @var float
   */
  protected $normalize = 0;

  /**
   * Indicates whether the preparation step has been executed.
   *
   * @var bool
   */
  protected $executedPrepare = FALSE;

  /**
   * A bitmap of status conditions, described in getStatus().
   *
   * @var int
   *
   * @see SearchQuery::getStatus()
   */
  protected $status = 0;

  /**
   * The word score expressions.
   *
   * @var array
   *
   * @see SearchQuery::addScore()
   */
  protected $scores = array();

  /**
   * Arguments for the score expressions.
   *
   * @var array
   */
  protected $scoresArguments = array();

  /**
   * The number of 'i.relevance' occurrences in score expressions.
   *
   * @var int
   */
  protected $relevance_count = 0;

  /**
   * Multipliers for score expressions.
   *
   * @var array
   */
  protected $multiply = array();

  /**
   * Sets the search query expression.
   *
   * Besides storing the expression and type, this tags the query with
   * 'search_$type' and resets the exact-match conditions and the status
   * bitmap.
   *
   * @param string $expression
   *   A search string, which can contain keywords and options.
   * @param string $type
   *   The search type. This maps to {search_index}.type in the database.
   *
   * @return $this
   */
  public function searchExpression($expression, $type) {
    $this->searchExpression = $expression;
    $this->type = $type;

    // Add query tag.
    $this->addTag('search_' . $type);

    // Initialize conditions and status.
    $this->conditions = db_and();
    $this->status = 0;

    return $this;
  }

  /**
   * Parses the search query into SQL conditions.
   *
   * Sets up the following variables:
   * - $this->keys
   * - $this->words (through parseWord())
   * - $this->conditions
   * - $this->simple
   * - $this->matches
   *
   * May also set the EXPRESSIONS_IGNORED and LOWER_CASE_OR flags in
   * $this->status.
   */
  protected function parseSearchExpression() {
    // Matches words optionally prefixed by a - sign. A word in this case is
    // something between two spaces, optionally quoted.
    preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression, $keywords, PREG_SET_ORDER);

    if (count($keywords) == 0) {
      return;
    }

    // Classify tokens.
    $or = FALSE;
    $limit_combinations = \Drupal::config('search.settings')->get('and_or_limit');
    // The first search expression does not count as AND.
    $and_count = -1;
    $or_count = 0;
    foreach ($keywords as $match) {
      if ($or_count && $and_count + $or_count >= $limit_combinations) {
        // Ignore all further search expressions to prevent Denial-of-Service
        // attacks using a high number of AND/OR combinations.
        $this->status |= self::EXPRESSIONS_IGNORED;
        break;
      }

      $phrase = FALSE;
      // Strip off phrase quotes. The regular expression guarantees that
      // $match[2] is never empty, so reading its first character is safe.
      if ($match[2][0] == '"') {
        $match[2] = substr($match[2], 1, -1);
        $phrase = TRUE;
        $this->simple = FALSE;
      }
      // Simplify keyword according to indexing rules and external
      // preprocessors. Use same process as during search indexing, so it
      // will match search index.
      $words = search_simplify($match[2]);
      // Re-explode in case simplification added more words, except when
      // matching a phrase.
      $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
      // Negative matches.
      if ($match[1] == '-') {
        $this->keys['negative'] = array_merge($this->keys['negative'], $words);
      }
      // OR operator: instead of a single keyword, we store an array of all
      // OR'd keywords.
      elseif ($match[2] == 'OR' && count($this->keys['positive'])) {
        $last = array_pop($this->keys['positive']);
        // Starting a new OR?
        if (!is_array($last)) {
          $last = array($last);
        }
        $this->keys['positive'][] = $last;
        $or = TRUE;
        $or_count++;
        continue;
      }
      // AND operator: implied, so just ignore it.
      elseif ($match[2] == 'AND' || $match[2] == 'and') {
        continue;
      }

      // Plain keyword.
      else {
        if ($match[2] == 'or') {
          // Lower-case "or" instead of "OR" is a warning condition.
          $this->status |= self::LOWER_CASE_OR;
        }
        if ($or) {
          // Add to last element (which is an array).
          $last_index = count($this->keys['positive']) - 1;
          $this->keys['positive'][$last_index] = array_merge($this->keys['positive'][$last_index], $words);
        }
        else {
          $this->keys['positive'] = array_merge($this->keys['positive'], $words);
          $and_count++;
        }
      }
      $or = FALSE;
    }

    // Convert keywords into SQL statements.
    $simple_and = FALSE;
    $simple_or = FALSE;
    // Positive matches.
    foreach ($this->keys['positive'] as $key) {
      // Group of ORed terms.
      if (is_array($key) && count($key)) {
        $simple_or = TRUE;
        $any = FALSE;
        $queryor = db_or();
        foreach ($key as $or_key) {
          list($num_new_scores) = $this->parseWord($or_key);
          $any |= $num_new_scores;
          $queryor->condition('d.data', "% $or_key %", 'LIKE');
        }
        if (count($queryor)) {
          $this->conditions->condition($queryor);
          // A group of OR keywords only needs to match once.
          $this->matches += ($any > 0);
        }
      }
      // Single ANDed term.
      else {
        $simple_and = TRUE;
        list($num_new_scores, $num_valid_words) = $this->parseWord($key);
        $this->conditions->condition('d.data', "% $key %", 'LIKE');
        if (!$num_valid_words) {
          $this->simple = FALSE;
        }
        // Each AND keyword needs to match at least once.
        $this->matches += $num_new_scores;
      }
    }
    if ($simple_and && $simple_or) {
      $this->simple = FALSE;
    }
    // Negative matches.
    foreach ($this->keys['negative'] as $key) {
      $this->conditions->condition('d.data', "% $key %", 'NOT LIKE');
      $this->simple = FALSE;
    }
  }

  /**
   * Parses a word or phrase for parseSearchExpression().
   *
   * Splits a phrase into words on single spaces. A word is valid if it is
   * numeric or at least
   * \Drupal::config('search.settings')->get('index.minimum_word_size')
   * characters long. Each valid word is added to $this->words if it is not
   * already there.
   *
   * @param string $word
   *   The word or phrase to parse.
   *
   * @return array
   *   A list containing the number of words newly added to $this->words, and
   *   the total number of valid words in the phrase.
   */
  protected function parseWord($word) {
    $num_new_scores = 0;
    $num_valid_words = 0;

    // The minimum word size is the same for every word of the phrase, so look
    // it up once instead of on each loop iteration.
    $minimum_word_size = \Drupal::config('search.settings')->get('index.minimum_word_size');

    // Determine the scorewords of this word/phrase.
    foreach (explode(' ', $word) as $s) {
      if (is_numeric($s) || drupal_strlen($s) >= $minimum_word_size) {
        if (!isset($this->words[$s])) {
          $this->words[$s] = $s;
          $num_new_scores++;
        }
        $num_valid_words++;
      }
    }

    // Return the number of newly added words and the number of valid words.
    return array($num_new_scores, $num_valid_words);
  }

  /**
   * Prepares the query and calculates the normalization factor.
   *
   * After the query is normalized the keywords are weighted to give the results
   * a relevancy score. The query is ready for execution after this.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return bool
   *   TRUE if at least one keyword matched the search index; FALSE if not.
   */
  public function prepareAndNormalize() {
    $this->parseSearchExpression();
    $this->executedPrepare = TRUE;

    if (count($this->words) == 0) {
      // Although the query could proceed, there is no point in joining
      // with other tables and attempting to normalize if there are no
      // keywords present.
      $this->status |= self::NO_POSITIVE_KEYWORDS;
      return FALSE;
    }

    // Build the basic search query: match the entered keywords.
    $or = db_or();
    foreach ($this->words as $word) {
      $or->condition('i.word', $word);
    }
    $this->condition($or);

    // Add keyword normalization information to the query.
    $this->join('search_total', 't', 'i.word = t.word');
    $this
      ->condition('i.type', $this->type)
      ->groupBy('i.type')
      ->groupBy('i.sid')
      ->having('COUNT(*) >= :matches', array(':matches' => $this->matches));

    // Clone the query object to calculate normalization.
    $normalize_query = clone $this->query;

    // For complex search queries, add the LIKE conditions; if the query is
    // simple, we do not need them for normalization.
    if (!$this->simple) {
      $normalize_query->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
      if (count($this->conditions)) {
        $normalize_query->condition($this->conditions);
      }
    }

    // Calculate normalization, which is the max of all the search scores for
    // positive keywords in the query. And note that the query could have other
    // fields added to it by the user of this extension.
    $normalize_query->addExpression('SUM(i.score * t.count)', 'calculated_score');
    $result = $normalize_query
      ->range(0, 1)
      ->orderBy('calculated_score', 'DESC')
      ->execute()
      ->fetchObject();
    if (isset($result->calculated_score)) {
      $this->normalize = (float) $result->calculated_score;
    }

    if ($this->normalize) {
      return TRUE;
    }

    // If the normalization value was zero, that indicates there were no
    // matches to the supplied positive keywords.
    $this->status |= self::NO_KEYWORD_MATCHES;

    return FALSE;
  }

  /**
   * Adds a custom score expression to the search query.
   *
   * Score expressions are used to order search results. If no calls to
   * addScore() have taken place, a default keyword relevance score will be
   * used. However, if at least one call to addScore() has taken place, the
   * keyword relevance score is not automatically added.
   *
   * Also note that if you call orderBy() directly on the query, search scores
   * will not automatically be used to order search results. Your orderBy()
   * expression can reference 'calculated_score', which will be the total
   * calculated score value.
   *
   * @param string $score
   *   The score expression, which should evaluate to a number between 0 and 1.
   *   The string 'i.relevance' in a score expression will be replaced by a
   *   measure of keyword relevance between 0 and 1.
   * @param array $arguments
   *   Query arguments needed to provide values to the score expression.
   * @param float $multiply
   *   If set, the score is multiplied with this value. However, all scores
   *   with multipliers are then divided by the total of all multipliers, so
   *   that overall, the normalization is maintained.
   *
   * @return $this
   */
  public function addScore($score, $arguments = array(), $multiply = FALSE) {
    if ($multiply) {
      $i = count($this->multiply);
      // Modify the score expression so it is multiplied by the multiplier,
      // with a divisor to renormalize.
      $score = "(CAST (:multiply_$i AS DECIMAL(10,4))) * COALESCE(($score), 0) / (CAST (:total_$i AS DECIMAL(10,4)))";
      // Add an argument for the multiplier. The :total_$i argument is taken
      // care of in the execute() method, which is when the total divisor is
      // calculated.
      $arguments[':multiply_' . $i] = $multiply;
      $this->multiply[] = $multiply;
    }

    // Search scoring needs a way to include a keyword relevance in the score.
    // For historical reasons, this is done by putting 'i.relevance' into the
    // search expression. So, use string replacement to change this to a
    // calculated query expression, counting the number of occurrences so
    // in the execute() method we can add arguments. Occurrences are replaced
    // one at a time because each one needs its own numbered placeholder.
    while (strpos($score, 'i.relevance') !== FALSE) {
      $pieces = explode('i.relevance', $score, 2);
      $score = implode('((CAST (:normalization_' . $this->relevance_count . ' AS DECIMAL(10,4))) * i.score * t.count)', $pieces);
      $this->relevance_count++;
    }

    $this->scores[] = $score;
    $this->scoresArguments += $arguments;

    return $this;
  }

  /**
   * Executes the search.
   *
   * If not already done, this calls prepareAndNormalize() first. Then the
   * complex conditions are applied to the query including score expressions
   * and ordering.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return \Drupal\Core\Database\StatementEmpty|mixed
   *   A query result set containing the results of the query: an empty
   *   statement if there were no keyword matches, otherwise the return value
   *   of the wrapped query's execute() method.
   */
  public function execute() {
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    if (!$this->normalize) {
      // There were no keyword matches, so return an empty result set.
      return new StatementEmpty();
    }

    // Add conditions to the query.
    $this->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $this->condition($this->conditions);
    }

    // Add default score (keyword relevance) if there are not any defined.
    if (empty($this->scores)) {
      $this->addScore('i.relevance');
    }

    $multiplier_count = count($this->multiply);
    if ($multiplier_count) {
      // Re-normalize scores with multipliers by dividing by the total of all
      // multipliers. The expressions were altered in addScore(), so here just
      // add the arguments for the total.
      $sum = array_sum($this->multiply);
      for ($i = 0; $i < $multiplier_count; $i++) {
        $this->scoresArguments[':total_' . $i] = $sum;
      }
    }

    // Add arguments for the keyword relevance normalization number.
    $normalization = 1.0 / $this->normalize;
    for ($i = 0; $i < $this->relevance_count; $i++) {
      $this->scoresArguments[':normalization_' . $i] = $normalization;
    }

    // Add all scores together to form a query field.
    $this->addExpression('SUM(' . implode(' + ', $this->scores) . ')', 'calculated_score', $this->scoresArguments);

    // If an order has not yet been set for this query, add a default order
    // that sorts by the calculated sum of scores.
    if (count($this->getOrderBy()) == 0) {
      $this->orderBy('calculated_score', 'DESC');
    }

    // Add query metadata.
    $this
      ->addMetaData('normalize', $this->normalize)
      ->fields('i', array('type', 'sid'));
    return $this->query->execute();
  }

  /**
   * Builds the default count query for SearchQuery.
   *
   * Since SearchQuery always uses GROUP BY, we can default to a subquery. We
   * also add the same conditions as execute() because countQuery() is called
   * first.
   *
   * @return object
   *   The query returned by db_select(), which selects COUNT(*) from a
   *   subquery made of a clone of the wrapped query.
   */
  public function countQuery() {
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    // Clone the inner query.
    $inner = clone $this->query;

    // Add conditions to query. The join must match the one in execute(),
    // including the language code; otherwise index rows are paired with
    // dataset rows of other languages and the count can differ from the
    // number of results that execute() returns.
    $inner->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $inner->condition($this->conditions);
    }

    // Remove existing fields and expressions, they are not needed for a count
    // query.
    $fields =& $inner->getFields();
    $fields = array();
    $expressions =& $inner->getExpressions();
    $expressions = array();

    // Add sid as the only field and count them as a subquery.
    $count = db_select($inner->fields('i', array('sid')), NULL, array('target' => 'slave'));

    // Add the COUNT() expression.
    $count->addExpression('COUNT(*)');

    return $count;
  }

  /**
   * Returns the query status bitmap.
   *
   * @return int
   *   A bitmap indicating query status. Zero indicates there were no problems.
   *   A non-zero value is a combination of one or more of the following flags:
   *   - SearchQuery::NO_POSITIVE_KEYWORDS
   *   - SearchQuery::EXPRESSIONS_IGNORED
   *   - SearchQuery::LOWER_CASE_OR
   *   - SearchQuery::NO_KEYWORD_MATCHES
   */
  public function getStatus() {
    return $this->status;
  }

}