search.module 36.4 KB
Newer Older
Dries's avatar
 
Dries committed
1 2
<?php

Dries's avatar
 
Dries committed
3 4 5 6 7
/**
 * @file
 * Enables site-wide keyword searching.
 */

8
use Drupal\Component\Utility\SafeMarkup;
9
use Drupal\Component\Utility\String;
10
use Drupal\Component\Utility\Unicode;
11
use Drupal\Core\Form\FormStateInterface;
12
use Drupal\Core\Routing\RouteMatchInterface;
13

14
/**
 * Character-class body matching all 'N' Unicode character classes (numbers).
 *
 * The value is a list of \x{...} code points and ranges meant to be placed
 * between square brackets in a regular expression that uses the /u modifier.
 */
define('PREG_CLASS_NUMBERS', implode('', array(
  '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}',
  '\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}',
  '\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}',
  '\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-',
  '\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}',
  '\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}',
  '\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}',
  '\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-',
  '\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}',
)));
27 28

/**
 * Character-class body matching all 'P' Unicode character classes (punctuation).
 *
 * The value is a list of \x{...} code points and ranges meant to be placed
 * between square brackets in a regular expression that uses the /u modifier.
 */
define('PREG_CLASS_PUNCTUATION', implode('', array(
  '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}',
  '\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}',
  '\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}',
  '\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}',
  '\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}',
  '\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}',
  '\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}',
  '\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}',
  '\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-',
  '\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}',
  '\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}',
  '\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}',
  '\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}',
  '\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-',
  '\x{ff65}',
)));
47 48

/**
 * Character-class body matching CJK (Chinese, Japanese, Korean) letter-like
 * characters.
 *
 * This list is derived from the "East Asian Scripts" section of
 * http://www.unicode.org/charts/index.html, as well as a comment on
 * http://unicode.org/reports/tr11/tr11-11.html listing some character
 * ranges that are reserved for additional CJK ideographs.
 *
 * Numbers, punctuation, and symbols are deliberately left out of these
 * ranges because search handles them separately. Radicals and strokes count
 * as symbols. (See
 * http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
 *
 * @see search_expand_cjk()
 */
define('PREG_CLASS_CJK', implode('', array(
  '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}',
  '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}',
  '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}',
  '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}',
  '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}',
)));
68

Dries's avatar
 
Dries committed
69
/**
 * Implements hook_help().
 *
 * Returns the full help page markup for the 'help.page.search' route and a
 * short introductory paragraph for the 'search.settings' route. Any other
 * route produces no return value.
 */
function search_help($route_name, RouteMatchInterface $route_match) {
  switch ($route_name) {
    case 'help.page.search':
      // Gather the markup fragments in display order and join them at the end.
      $fragments = array();
      $fragments[] = '<h3>' . t('About') . '</h3>';
      $fragments[] = '<p>' . t('The Search module provides the ability to set up search pages based on plugins provided by other modules. In Drupal core, there are two page-type plugins: the Content page type provides keyword searching for content managed by the Node module, and the Users page type provides keyword searching for registered users. Contributed modules may provide other page-type plugins. For more information, see <a href="!search-module">the online documentation for the Search module</a>.', array('!search-module' => 'https://drupal.org/documentation/modules/search')) . '</p>';
      $fragments[] = '<h3>' . t('Uses') . '</h3>';
      $fragments[] = '<dl>';

      $fragments[] = '<dt>' . t('Configuring search pages') . '</dt>';
      $fragments[] = '<dd>' . t('To configure search pages, visit the <a href="!search-settings">Search pages page</a>. In the Search pages section, you can add a new search page, edit the configuration of existing search pages, enable and disable search pages, and choose the default search page. Each enabled search page has a URL path starting with <em>search</em>, and each will appear as a tab or local task link on the <a href="!search-url">search page</a>; you can configure the text that is shown in the tab. In addition, some search page plugins have additional settings that you can configure for each search page.', array('!search-settings' => \Drupal::url('search.settings'), '!search-url' => \Drupal::url('search.view'))) . '</dd>';

      $fragments[] = '<dt>' . t('Managing the search index') . '</dt>';
      $fragments[] = '<dd>' . t('Some search page plugins, such as the core Content search page, index searchable text using the Drupal core search index, and will not work unless content is indexed. Indexing is done during <em>cron</em> runs, so it requires a <a href="!cron">cron maintenance task</a> to be set up. There are also several settings affecting indexing that can be configured on the <a href="!search-settings">Search pages page</a>: the number of items to index per cron run, the minimum word length to index, and how to handle Chinese, Japanese, and Korean characters.', array('!cron' => \Drupal::url('system.cron_settings'), '!search-settings' => \Drupal::url('search.settings'))) . '</dd>';
      $fragments[] = '<dd>' . t('Modules providing search page plugins generally ensure that content-related actions on your site (creating, editing, or deleting content and comments) automatically cause affected content items to be marked for indexing or reindexing at the next cron run. When content is marked for reindexing, the previous content remains in the index until cron runs, at which time it is replaced by the new content. However, there are some actions related to the structure of your site that do not cause affected content to be marked for reindexing. Examples of structure-related actions that affect content include deleting or editing taxonomy terms, enabling or disabling modules that add text to content (such as Taxonomy, Comment, and field-providing modules), and modifying the fields or display parameters of your content types. If you take one of these actions and you want to ensure that the search index is updated to reflect your changed site structure, you can mark all content for reindexing by clicking the "Re-index site" button on the <a href="!search-settings">Search pages page</a>. If you have a lot of content on your site, it may take several cron runs for the content to be reindexed.', array('!search-settings' => \Drupal::url('search.settings'))) . '</dd>';

      $fragments[] = '<dt>' . t('Displaying the Search block') . '</dt>';
      // The block administration link falls back to '#' when the Block module
      // is not enabled.
      $fragments[] = '<dd>' . t('The Search module includes a block, which can be enabled and configured on the <a href="!blocks">Block layout page</a>, if you have the Block module enabled; the default block title is Search, and it is the Search form block in the Forms category, if you wish to add another instance. The block is available to users with the <a href="!search_permission">Use search</a> permission, and it performs a search using the configured default search page.', array('!blocks' => (\Drupal::moduleHandler()->moduleExists('block')) ? \Drupal::url('block.admin_display') : '#', '!search_permission' => \Drupal::url('user.admin_permissions', array(), array('fragment' => 'module-search')))) . '</dd>';

      $fragments[] = '<dt>' . t('Searching your site') . '</dt>';
      $fragments[] = '<dd>' . t('Users with <a href="!search_permission">Use search</a> permission can use the Search block and <a href="!search">Search page</a>. Users with the <a href="!node_permission">View published content</a> permission can use configured search pages of type <em>Content</em> to search for content containing exact keywords; in addition, users with <a href="!search_permission">Use advanced search</a> permission can use more complex search filtering. Users with the <a href="!user_permission">View user information</a> permission can use configured search pages of type <em>Users</em> to search for active users containing the keyword anywhere in the username, and users with the <a href="!user_permission">Administer users</a> permission can search for active and blocked users, by email address or username keyword.', array('!search' => \Drupal::url('search.view'), '!search_permission' => \Drupal::url('user.admin_permissions', array(), array('fragment' => 'module-search')), '!node_permission' => \Drupal::url('user.admin_permissions', array(), array('fragment' => 'module-node')), '!user_permission' => \Drupal::url('user.admin_permissions', array(), array('fragment' => 'module-user')))) . '</dd>';

      $fragments[] = '<dt>' . t('Extending the Search module') . '</dt>';
      $fragments[] = '<dd>' . t('By default, the Search module only supports exact keyword matching in content searches. You can modify this behavior by installing a language-specific stemming module for your language (such as <a href="!porterstemmer_url">Porter Stemmer</a> for American English), which allows words such as walk, walking, and walked to be matched in the Search module. Another approach is to use a third-party search technology with stemming or partial word matching features built in, such as <a href="!solr_url">Apache Solr</a> or <a href="!sphinx_url">Sphinx</a>. There are also contributed modules that provide additional search pages. These and other <a href="!contrib-search">search-related contributed modules</a> can be downloaded by visiting Drupal.org.', array('!contrib-search' => 'https://drupal.org/project/project_module?f[2]=im_vid_3%3A105', '!porterstemmer_url' => 'https://drupal.org/project/porterstemmer', '!solr_url' => 'https://drupal.org/project/apachesolr', '!sphinx_url' => 'https://drupal.org/project/sphinx')) . '</dd>';

      $fragments[] = '</dl>';
      return implode('', $fragments);

    case 'search.settings':
      $intro = t('The search engine maintains an index of words found in your site\'s content. To build and maintain this index, a correctly configured <a href="!cron">cron maintenance task</a> is required. Indexing behavior can be adjusted using the settings below.', array('!cron' => \Drupal::url('system.status')));
      return '<p>' . $intro . '</p>';
  }
}
Kjartan's avatar
Kjartan committed
98

99
/**
 * Implements hook_theme().
 *
 * Registers the 'search_result' theme hook, whose implementation lives in
 * search.pages.inc.
 */
function search_theme() {
  $hooks = array();
  $hooks['search_result'] = array(
    'variables' => array('result' => NULL, 'plugin_id' => NULL),
    'file' => 'search.pages.inc',
  );
  return $hooks;
}
Dries's avatar
 
Dries committed
110

111
/**
 * Implements hook_preprocess_HOOK() for block templates.
 *
 * Gives the search form block the "search" role and the 'container-inline'
 * class; all other blocks are left untouched.
 */
function search_preprocess_block(&$variables) {
  if ($variables['plugin_id'] != 'search_form_block') {
    return;
  }
  $attributes = &$variables['attributes'];
  $attributes['role'] = 'search';
  $attributes['class'][] = 'container-inline';
}

Dries's avatar
Dries committed
121
/**
 * Clears either a part of, or the entire search index.
 *
 * This function is meant for use by search page plugins, or for building a
 * user interface that lets users clear all or parts of the search index.
 *
 * Rows are removed from both the search_index and search_dataset tables.
 *
 * @param $type
 *   (optional) The plugin ID or other machine-readable type for the items to
 *   remove from the search index. If omitted, $sid and $langcode are ignored
 *   and the entire search index is cleared.
 * @param $sid
 *   (optional) The ID of the items to remove from the search index. If
 *   omitted, all items matching $type are cleared, and $langcode is ignored.
 * @param $langcode
 *   (optional) Language code of the item to remove from the search index. If
 *   omitted, all items matching $sid and $type are cleared.
 */
function search_index_clear($type = NULL, $sid = NULL, $langcode = NULL) {
  // The filters are hierarchical: stop at the first one that was not given,
  // so that a later argument is ignored whenever an earlier one is missing.
  $filters = array();
  foreach (array('type' => $type, 'sid' => $sid, 'langcode' => $langcode) as $field => $value) {
    if (!$value) {
      break;
    }
    $filters[$field] = $value;
  }

  // Apply the same filters to both tables.
  foreach (array('search_index', 'search_dataset') as $table) {
    $delete = db_delete($table);
    foreach ($filters as $field => $value) {
      $delete->condition($field, $value);
    }
    $delete->execute();
  }
}

158
/**
 * Marks a word as "dirty" (changed), or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words that are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. If omitted, nothing is marked and
 *   the current list is returned instead.
 *
 * @return
 *   When called without a word, an array whose keys are the dirty words.
 *   When a word is given, nothing is returned.
 */
function search_dirty($word = NULL) {
  $dirty_words = &drupal_static(__FUNCTION__, array());
  if ($word === NULL) {
    return $dirty_words;
  }
  $dirty_words[$word] = TRUE;
}

Kjartan's avatar
Kjartan committed
174
/**
 * Implements hook_cron().
 *
 * Fires updateIndex() in the plugins for all indexable active search pages,
 * and cleans up dirty words.
 *
 * @see search_dirty()
 */
function search_cron() {
  // Recount the word totals from a shutdown function, so that search_total is
  // brought up to date no matter how the indexing below ends.
  drupal_register_shutdown_function('search_update_totals');

  /** @var $repository \Drupal\search\SearchPageRepositoryInterface */
  $repository = \Drupal::service('search.search_page_repository');
  $indexable_pages = $repository->getIndexableSearchPages();
  foreach ($indexable_pages as $search_page) {
    $plugin = $search_page->getPlugin();
    $plugin->updateIndex();
  }
}

/**
 * Updates the {search_total} database table.
 *
 * This function is called on shutdown to ensure that {search_total} is always
 * up to date (even if cron times out or otherwise fails).
 *
 * It recomputes the stored count for every word reported by search_dirty(),
 * then deletes {search_total} rows whose word no longer appears in
 * {search_index}.
 */
function search_update_totals() {
  // Both read queries below use the same connection options.
  $read_options = array('target' => 'replica');

  // Refresh the IDF (Inverse Document Frequency) value of each dirty word.
  foreach (array_keys(search_dirty()) as $dirty_word) {
    // Sum the scores of this word across the whole index.
    $score_sum = db_query("SELECT SUM(score) FROM {search_index} WHERE word = :word", array(':word' => $dirty_word), $read_options)->fetchField();
    // Apply Zipf's law to equalize the probability distribution.
    $weight = log10(1 + 1/(max(1, $score_sum)));
    db_merge('search_total')
      ->key('word', $dirty_word)
      ->fields(array('count' => $weight))
      ->execute();
  }

  // The LEFT JOIN keeps only search_total words that have no matching row in
  // search_index, i.e. words that were removed from the index.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL", array(), $read_options);
  $stale_words = db_or();
  foreach ($orphans as $orphan) {
    $stale_words->condition('word', $orphan->realword);
  }
  // Skip the delete entirely when no orphaned word was found.
  if (count($stale_words) > 0) {
    db_delete('search_total')
      ->condition($stale_words)
      ->execute();
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * @param $text
 *   Text to simplify.
 * @param $langcode
 *   (optional) Language code that is handed to the preprocessing hook
 *   together with the text. Defaults to NULL.
 *
 * @return
 *   Simplified text.
 *
 * @see hook_search_preprocess()
 */
function search_simplify($text, $langcode = NULL) {
  // Turn HTML entities into UTF-8 characters, then fold to lower case.
  $text = String::decodeEntities($text);
  $text = Unicode::strtolower($text);

  // Let other modules preprocess the text (it is passed by reference).
  search_invoke_preprocess($text, $langcode);

  // Optional simple CJK handling: expand every run of CJK characters into
  // overlapping tokens.
  $overlap_cjk = \Drupal::config('search.settings')->get('index.overlap_cjk');
  if ($overlap_cjk) {
    $text = preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
  }

  // Numerical data such as dates, IP addresses or version numbers: a group of
  // numerical characters separated only by punctuation is treated as a single
  // piece, so that searching for e.g. '20/03/1984' also finds '20-03-1984'.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);

  // Runs of two or more dots/dashes are word boundaries and become a space.
  // The unicode modifier is not needed here: 0-127 ASCII characters can't
  // match higher UTF-8 characters, as the leftmost bit of those is 1.
  $text = preg_replace('/[.-]{2,}/', ' ', $text);

  // Remaining dots, underscores and dashes are simply dropped, which gives
  // meaningful search behavior for acronyms and URLs. The unicode note above
  // applies here as well.
  $text = preg_replace('/[._-]+/', '', $text);

  // Everything else that is punctuation, a mark, a spacer, etc. counts as a
  // word boundary.
  $text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);

  // Truncate every word to 50 characters.
  $pieces = explode(' ', $text);
  foreach ($pieces as &$piece) {
    _search_index_truncate($piece);
  }
  unset($piece);

  return implode(' ', $pieces);
}

/**
 * Splits CJK (Chinese, Japanese, Korean) text into tokens.
 *
 * The Search module matches exact words, where a word is defined to be a
 * sequence of characters delimited by spaces or punctuation. CJK languages are
 * written in long strings of characters, though, not split up into words. So
 * in order to allow search matching, we split up CJK text into tokens
 * consisting of consecutive, overlapping sequences of characters whose length
 * is equal to the 'minimum_word_size' variable. This tokenizing is only done
 * if the 'overlap_cjk' variable is TRUE.
 *
 * @param $matches
 *   This function is a callback for preg_replace_callback(), which is called
 *   from search_simplify(). So, $matches is an array of regular expression
 *   matches, which means that $matches[0] contains the matched text -- a
 *   string of CJK characters to tokenize.
 *
 * @return
 *   Tokenized text, starting and ending with a space character.
 */
function search_expand_cjk($matches) {
  $min = \Drupal::config('search.settings')->get('index.minimum_word_size');
  $remaining = $matches[0];
  $length = Unicode::strlen($remaining);

  // Text that does not exceed the minimum word size is kept as one token.
  if ($length <= $min) {
    return ' ' . $remaining . ' ';
  }

  $tokens = ' ';
  // Sliding window over the characters, used as a FIFO queue.
  $window = array();
  $position = 0;
  while ($position < $length) {
    // Take the first character off the remaining text. The character is
    // extracted Unicode-aware, then removed by its byte length.
    $char = Unicode::substr($remaining, 0, 1);
    $remaining = substr($remaining, strlen($char));
    array_push($window, $char);

    if ($position >= $min - 1) {
      // Emit the current window as a token, then drop its oldest character.
      $tokens .= implode('', $window) . ' ';
      array_shift($window);
    }
    $position++;
  }
  return $tokens;
}

/**
 * Simplifies and splits a string into tokens for indexing.
 *
 * The result of the most recent call is cached, so that calling the function
 * again with the same text and language code does not repeat the work.
 *
 * @param $text
 *   The text to simplify and split.
 * @param $langcode
 *   (optional) Language code passed on to search_simplify(). Defaults to
 *   NULL.
 *
 * @return
 *   Array of words, obtained by splitting the simplified text on spaces.
 *
 * @see search_simplify()
 */
function search_index_split($text, $langcode = NULL) {
  $last = &drupal_static(__FUNCTION__);
  $lastsplit = &drupal_static(__FUNCTION__ . ':lastsplit');
  $lastlangcode = &drupal_static(__FUNCTION__ . ':lastlangcode');

  // Only reuse the cached split for the very same input. The comparison is
  // strict on purpose: a loose one treats NULL and '' (or '1e1' and '10') as
  // equal, and would hand back a missing or unrelated result.
  if (isset($lastsplit) && $last === $text && $lastlangcode === $langcode) {
    return $lastsplit;
  }

  // Process words.
  $words = explode(' ', search_simplify($text, $langcode));

  // Remember the original (unsimplified) input, since that is what the next
  // call will be compared against.
  $last = $text;
  $lastlangcode = $langcode;
  $lastsplit = $words;

  return $words;
}

349
/**
 * Normalizes and truncates a single word; array_walk() callback.
 *
 * Used by search_simplify() on every word of the simplified text. Numeric
 * words lose their leading zeros (so a word made only of zeros becomes an
 * empty string), and every word is cut to at most 50 characters.
 *
 * @param $text
 *   The word to process, modified in place.
 */
function _search_index_truncate(&$text) {
  $word = is_numeric($text) ? ltrim($text, '0') : $text;
  $text = Unicode::truncate($word, 50);
}

359 360 361
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * Each implementing module receives the text as returned by the previous one,
 * and its return value replaces the text.
 *
 * @param $text
 *   The text to preprocess, modified in place.
 * @param $langcode
 *   (optional) Language code handed to every hook implementation.
 */
function search_invoke_preprocess(&$text, $langcode = NULL) {
  $module_handler = \Drupal::moduleHandler();
  $implementations = $module_handler->getImplementations('search_preprocess');
  foreach ($implementations as $module) {
    $text = $module_handler->invoke($module, 'search_preprocess', array($text, $langcode));
  }
}

/**
 * Updates the full-text search index for a particular item.
 *
 * The text is split into HTML tags and plain text. Words found inside tags
 * that have a configured weight score higher, and the score contribution of
 * words decays as more unique words are seen. The item's previous index
 * entries are cleared before the new data is written.
 *
 * @param $type
 *   The plugin ID or other machine-readable type of this item,
 *   which should be less than 64 bytes.
 * @param $sid
 *   An ID number identifying this particular item (e.g., node ID).
 * @param $langcode
 *   Language code for text being indexed.
 * @param $text
 *   The content of this item. Must be a piece of HTML or plain text.
 *
 * @ingroup search
 */
function search_index($type, $sid, $langcode, $text) {
  $minimum_word_size = \Drupal::config('search.settings')->get('index.minimum_word_size');

  // Multipliers for scores of words inside certain HTML tags. The weights are
  // stored in config so that modules can overwrite the default weights.
  // Note: 'a' must be included for link ranking to work.
  $tags = \Drupal::config('search.settings')->get('index.tag_weights');

  // Strip off all ignored tags to speed up processing, but insert space before
  // and after them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<' . implode('><', array_keys($tags)) . '>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = FALSE; // Odd/even counter. Tag or no tag.
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  $scored_words = array(); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = Unicode::strtolower($tagname);
      // Closing or opening tag? The tag name is empty when the captured tag
      // text starts with a space, so test the first character with substr()
      // rather than a string offset, which is invalid on an empty string.
      if (substr($tagname, 0, 1) === '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. A tag without a
          // configured weight counts as zero.
          $closed = array_shift($tagstack);
          $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
        }
      }
      else {
        if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score. A tag name that has no
          // configured weight (for example one written as "a/") adds nothing.
          array_unshift($tagstack, $tagname);
          $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        $words = search_index_split($value, $langcode);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word . ' ';
          // Check wordlength
          if (is_numeric($word) || Unicode::strlen($word) >= $minimum_word_size) {
            if (!isset($scored_words[$word])) {
              $scored_words[$word] = 0;
            }
            $scored_words[$word] += $score * $focus;
            // Focus is a decaying value in terms of the amount of unique words up to this point.
            // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
            $focus = min(1, .01 + 3.5 / (2 + count($scored_words) * .015));
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_index_clear($type, $sid, $langcode);

  // Insert cleaned up data into dataset
  db_insert('search_dataset')
    ->fields(array(
      'sid' => $sid,
      'langcode' => $langcode,
      'type' => $type,
      'data' => $accum,
      'reindex' => 0,
    ))
    ->execute();

  // Insert results into search index
  foreach ($scored_words as $word => $word_score) {
    // If a word already exists in the database, its score gets increased
    // appropriately. If not, we create a new record with the appropriate
    // starting score.
    db_merge('search_index')
      ->keys(array(
        'word' => $word,
        'sid' => $sid,
        'langcode' => $langcode,
        'type' => $type,
      ))
      ->fields(array('score' => $word_score))
      ->expression('score', 'score + :score', array(':score' => $word_score))
      ->execute();
    // The total count for this word is now outdated.
    search_dirty($word);
  }
}

/**
 * Changes the timestamp on indexed items to 'now' to force reindexing.
 *
 * This function is meant for use by search page plugins, or for building a
 * user interface that lets users mark all or parts of the search index for
 * reindexing.
 *
 * @param string $type
 *   (optional) The plugin ID or other machine-readable type of this item. If
 *   omitted, the entire search index is marked for reindexing, and $sid and
 *   $langcode are ignored.
 * @param int $sid
 *   (optional) An ID number identifying this particular item (e.g., node ID).
 *   If omitted, everything matching $type is marked, and $langcode is ignored.
 * @param string $langcode
 *   (optional) The language code to clear. If omitted, everything matching
 *   $type and $sid is marked.
 */
function search_mark_for_reindex($type = NULL, $sid = NULL, $langcode = NULL) {
  // Items that already carry a reindex timestamp are skipped, so that they
  // keep the priority given by the time they were first marked.
  $update = db_update('search_dataset')
    ->fields(array('reindex' => REQUEST_TIME))
    ->condition('reindex', 0);

  // The filters are hierarchical: stop at the first one that was not given.
  foreach (array('type' => $type, 'sid' => $sid, 'langcode' => $langcode) as $field => $value) {
    if (!$value) {
      break;
    }
    $update->condition($field, $value);
  }

  $update->execute();
}

543 544 545 546 547 548
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by the Search module, so this must be
 * enabled for all of the search features to work.
 *
 * There are two ways to interact with the search system:
 * - Specifically for searching nodes, you can implement
 *   hook_node_update_index() and hook_node_search_result(). However, note that
 *   the search system already indexes all visible output of a node; i.e.,
 *   everything displayed normally during node viewing. This is
 *   usually sufficient. You should only use this mechanism if you want
 *   additional, non-visible data to be indexed.
 * - Define a plugin implementing \Drupal\search\Plugin\SearchInterface and
 *   annotated as \Drupal\search\Annotation\SearchPlugin. This will create a
 *   search page type that users can use to set up one or more search pages.
 *   Each of these corresponds to a tab on the /search page, which can be
 *   used to perform searches. You will also need to implement the execute()
 *   method from the interface to perform the search. A base class is provided
 *   in \Drupal\search\Plugin\SearchPluginBase. For more information about
 *   plugins, see the @link plugin_api Plugin API topic. @endlink
 *
 * If your module needs to provide a more complicated search form, then you
 * need to implement it yourself. In that case, you may wish to define it as a
 * local task (tab) under the /search page (e.g. /search/mymodule) so that users
 * can easily find it.
 *
 * @see plugin_api
 * @see annotation
 */

/**
 * Builds an excerpt of a text with the search keywords highlighted.
 *
 * Used for formatting search results. HTML tags are stripped from the text and
 * entities are decoded before matching; the returned excerpt is re-escaped and
 * each matched keyword is wrapped in a <strong> element.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 * @param string|null $langcode
 *   (optional) Language code, passed through to
 *   _search_find_match_with_simplify() when locating keywords in the text.
 *   Defaults to NULL.
 *
 * @return string
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text, $langcode = NULL) {
  // Keywords are highlighted where they touch non-indexable or CJK characters:
  // the boundary matches when such a character sits on either side.
  $boundary_class = '[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']';
  $boundary = '(?:(?<=' . $boundary_class . ')|(?=' . $boundary_class . '))';

  // Pull the positive keywords and quoted phrases out of the query string.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $parsed);
  $candidates = array_merge($parsed[2], $parsed[3]);

  // Pad tags with spaces so adjacent words stay separated, then strip the
  // tags and decode HTML entities.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = String::decodeEntities(strip_tags($text));
  $text_length = strlen($text);

  // Keep only keywords that actually occur in the text, either literally or
  // as an equivalent found through search_simplify(), quoted for use inside a
  // regular expression.
  $quoted = array();
  foreach ($candidates as $candidate) {
    $match = _search_find_match_with_simplify($candidate, $text, $boundary, $langcode);
    if ($match !== NULL) {
      $quoted[] = preg_quote($match, '/');
    }
  }
  // Different keywords may have resolved to the same text; de-duplicate.
  $keys = array_unique($quoted);

  // Collect fragments of about 60 characters around keywords, bounded by
  // spaces. Aim for 256 characters in total, revisiting later occurrences of
  // the keywords if the first pass does not yield enough text.
  $ranges = array();
  $length = 0;
  $look_start = array();
  $remaining_keys = $keys;

  while ($length < 256 && !empty($remaining_keys)) {
    $found_keys = array();
    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }

      // Resume searching after the previous hit for this key, if any.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // The match must sit on word boundaries, so the text is padded with a
      // space at both ends.
      $hit = array();
      if (!preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $hit, PREG_OFFSET_CAPTURE, $look_start[$key])) {
        continue;
      }
      $found_position = $hit[0][1];
      $look_start[$key] = $found_position + 1;
      // Remember the key so another pass can look for further occurrences.
      $found_keys[] = $key;

      // Find a space roughly 60 characters before the match.
      $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
      if ($before === FALSE || $before > $found_position) {
        continue;
      }

      // Find a space roughly 60 characters after the match, or run to the end
      // of the text when it is closer than that.
      $after = ($text_length > $found_position + 60)
        ? strrpos(substr($text, 0, $found_position + 60), ' ', $found_position)
        : $text_length;
      if ($after === FALSE || $after <= $found_position) {
        continue;
      }

      // Compensate for the leading space that was added for matching.
      $before = max($before - 1, 0);
      if ($before < $after) {
        $ranges[$before] = $after;
        $length += $after - $before;
      }
    }
    // On the next pass, only retry the keys that matched this time.
    $remaining_keys = $found_keys;
  }

  if (empty($ranges)) {
    // No keyword matched: fall back to the start of the text, re-encoding the
    // special characters that were entity-decoded above.
    return String::checkPlain(Unicode::truncate($text, 256, TRUE, TRUE));
  }

  // Order the ranges by start offset so overlaps can be merged in one pass.
  ksort($ranges);

  $new_ranges = array();
  $max_end = 0;
  $working_from = NULL;
  $working_to = NULL;
  foreach ($ranges as $this_from => $this_to) {
    $max_end = max($max_end, $this_to);
    if ($working_from === NULL) {
      // First range: it becomes the working range.
      $working_from = $this_from;
      $working_to = $this_to;
    }
    elseif ($this_from <= $working_to) {
      // Overlap with the working range: extend it.
      $working_to = max($working_to, $this_to);
    }
    else {
      // Gap: store the working range and start a new one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }
  // Store the range that was still being worked on.
  $new_ranges[$working_from] = $working_to;

  // Cut the merged ranges out of the text.
  $out = array();
  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // Join the chunks with "…" separators. The separator text is translated as
  // a single string so translators see leading, middle and trailing parts
  // together.
  $ellipses = explode('!excerpt', t('… !excerpt … !excerpt …'));
  $prefix = isset($new_ranges[0]) ? '' : $ellipses[0];
  $suffix = ($max_end < $text_length - 1) ? $ellipses[2] : '';
  $text = String::checkPlain($prefix . implode($ellipses[1], $out) . $suffix);

  // Highlight all keywords in a single replacement so that a keyword such as
  // 'strong' cannot match inside the inserted <strong> markup.
  $highlight = '/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu';
  $text = trim(preg_replace($highlight, '<strong>\0</strong>', ' ' . $text . ' '));
  return SafeMarkup::set($text);
}

730 731 732 733
/**
 * @} End of "defgroup search".
 */

734
/**
 * Finds an appropriate keyword in text.
 *
 * @param $key
 *   The keyword to find.
 * @param $text
 *   The text to search for the keyword.
 * @param $boundary
 *   Regular expression for boundary characters between words.
 * @param $langcode
 *   Language code.
 *
 * @return
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly (in which case the whitespace-trimmed key is
 *   returned), or matches $key when both this text segment and $key are
 *   processed by search_simplify(). If a matching text segment is not located,
 *   NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp === '') {
    return NULL;
  }
  if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) {
    // Return the trimmed key, which is what was actually matched; returning
    // the untrimmed key would leak surrounding whitespace to the caller.
    return $temp;
  }

  // Run both text and key through search_simplify.
  $simplified_key = trim(search_simplify($key, $langcode));
  $simplified_text = trim(search_simplify($text, $langcode));
  if ($simplified_key === '' || $simplified_text === '' || strpos($simplified_text, $simplified_key) === FALSE) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  // A limit of -1 means "no limit"; NULL is not a valid integer limit.
  $words = preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  if ($words === FALSE) {
    // The split failed (for example on malformed UTF-8), so there are no
    // words to examine.
    return NULL;
  }
  $text_length = strlen($text);
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = array('', $text_length);
  $num_words = count($words);

  // Find the smallest segment of complete words in $text that we can simplify
  // to match $simplified_key.
  $start_position = 0;
  $word_end = 0;
  for ($word_index = 0; $word_index < $num_words; $word_index++) {
    // See if we can move the starting position out from our previously-saved
    // best position to here and still have a match.
    $trial_position = $words[$word_index][1];
    if ($trial_position < $text_length) {
      $candidate = substr($text, $trial_position);
      $test_text = trim(search_simplify($candidate, $langcode));
      if (strpos($test_text, $simplified_key) !== FALSE) {
        $start_position = $trial_position;
        $word_end = $trial_position + strlen($words[$word_index][0]);
        continue;
      }
    }

    // See if we can end at our currently-saved word-ending position and still
    // match, in which case this is the minimal matching string.
    if ($word_end > $start_position) {
      $candidate = substr($text, $start_position, $word_end - $start_position);
      $test_text = trim(search_simplify($candidate, $langcode));
      if (strpos($test_text, $simplified_key) !== FALSE) {
        return $candidate;
      }
    }

    // Save the end position of this word for the next time through this loop.
    $word_end = $trial_position + strlen($words[$word_index][0]);
  }

  // If we get here, we couldn't find a match.
  return NULL;
}
813

814 815 816 817 818 819 820 821 822 823
/**
 * Implements hook_form_FORM_ID_alter() for the search_block_form form.
 *
 * The exposed search form is submitted via GET, so the form tokens should not
 * be sent along with it. They cannot be suppressed in the form builder
 * function, because they are only added to the form after the builder has
 * run; hiding them therefore has to happen in a form alter.
 *
 * @see \Drupal\search\Form\SearchBlockForm
 */
function search_form_search_block_form_alter(&$form, FormStateInterface $form_state) {
  // Deny access to each of the internal form elements.
  foreach (array('form_build_id', 'form_token', 'form_id') as $element) {
    $form[$element]['#access'] = FALSE;
  }
}