search.module 39.1 KB
Newer Older
Dries's avatar
   
Dries committed
1
<?php
2
// $Id$
Dries's avatar
   
Dries committed
3

Dries's avatar
   
Dries committed
4
5
6
7
8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
10
 * Matches Unicode character classes to exclude from the search index.
Steven Wittens's avatar
Steven Wittens committed
11
 *
12
13
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
14
 * The index only contains the following character classes:
15
16
17
18
19
20
21
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 */
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// Body of a PCRE character class (without the enclosing brackets) listing the
// code points that are kept out of the search index. search_simplify() wraps
// it in [...]+ with the /u modifier and replaces each matching run by a single
// space, so everything listed here acts as a word boundary.
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
50
51

/**
Steven Wittens's avatar
Steven Wittens committed
52
 * Matches all 'N' Unicode character classes (numbers)
53
 */
54
55
56
57
58
59
60
61
62
63
// Body of a PCRE character class (without the enclosing brackets) listing
// numeric code points. search_simplify() uses it, with the /u modifier, to
// find runs of numbers that are separated only by punctuation.
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
64
65

/**
Steven Wittens's avatar
Steven Wittens committed
66
 * Matches all 'P' Unicode character classes (punctuation)
67
 */
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// Body of a PCRE character class (without the enclosing brackets) listing
// punctuation code points. search_simplify() uses it, with the /u modifier,
// to drop punctuation that sits between two numeric characters.
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');

/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 *
 * Contains kana and BMP ideographs. The value is the body of a PCRE character
 * class without the enclosing brackets; search_simplify() wraps it in [...]+
 * with the /u modifier and passes every matching run to search_expand_cjk().
 */
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
92

Dries's avatar
   
Dries committed
93
94
95
96
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 *
 * @return
 *   The translated help text for the sections this module documents; nothing
 *   is returned for any other section.
 */
function search_help($section = 'admin/help#search') {
  // One-line description of the module.
  if ($section == 'admin/modules#description') {
    return t('Enables site-wide keyword searching.');
  }

  // Introductory text for the search settings page.
  if ($section == 'admin/settings/search') {
    return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
  }

  // Hints for a search that produced no results.
  if ($section == 'search#noresults') {
    return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
</ul></p>');
  }
}
Kjartan's avatar
Kjartan committed
112
113

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by the search module.
 */
function search_perm() {
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'administer search';
  return $permissions;
}

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   The block operation: 'list' returns the block definitions, 'view' returns
 *   the rendered block.
 * @param $delta
 *   The block identifier. Unused: this module provides a single block.
 *
 * @return
 *   For 'list', an array describing the search form block. For 'view', an
 *   array with 'content' and 'subject', but only when the user may search
 *   content and the current page is not already under 'search'. Nothing in
 *   all other cases.
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks = array();
    $blocks[0]['info'] = t('Search form');
    return $blocks;
  }

  // Do not show the block on the search page itself, which has its own form.
  if ($op == 'view' && user_access('search content') && arg(0) != 'search') {
    $block = array();
    $block['content'] = search_form('', '', null, '');
    $block['subject'] = t('Search');
    return $block;
  }
}

Dries's avatar
   
Dries committed
136
137
138
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   TRUE when cacheable menu items are requested, FALSE for dynamic items.
 *
 * @return
 *   An array of menu items: the 'search' page itself when $may_cache is set,
 *   otherwise (on search pages only) one local task per module that
 *   implements hook_search() and reports a name.
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    $items[] = array(
      'path' => 'search',
      'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM,
    );
    return $items;
  }

  if (arg(0) != 'search') {
    return $items;
  }

  // To remember the user's search keywords when switching across tabs,
  // the keywords are appended to the path of every search tab.
  $keys = search_get_keys();
  $suffix = strlen($keys) ? '/'. $keys : '';

  foreach (module_list() as $name) {
    if (!module_hook($name, 'search')) {
      continue;
    }
    // Modules that do not report a name get no tab.
    $title = module_invoke($name, 'search', 'name');
    if (!$title) {
      continue;
    }
    $items[] = array(
      'path' => 'search/'. $name . $suffix,
      'title' => $title,
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_LOCAL_TASK,
    );
  }

  return $items;
}

/**
 * Implementation of hook_validate().
 *
 * Wipes the search index when the submitted minimum word size differs from
 * the stored one, and tells the user that the index will be rebuilt.
 */
function search_settings_form_validate($form_id, &$form) {
  $stored_size = variable_get('minimum_word_size', 4);
  if ($stored_size == $form['minimum_word_size']) {
    // Word length setting is unchanged: the index stays as it is.
    return;
  }

  drupal_set_message(t('The index will be rebuilt.'));
  search_wipe();
}
Dries's avatar
   
Dries committed
176

177
178
179
180
/**
 * Menu callback; displays the search module settings page.
 *
 * @return
 *   A form array with the indexing status, the indexing throttle, the word
 *   length settings and whatever modules add through hook_search('admin').
 */
function search_settings() {
  // Add up the indexing progress reported by every module with hook_search().
  $remaining = 0;
  $total = 0;
  foreach (module_list() as $module) {
    if (!module_hook($module, 'search')) {
      continue;
    }
    $module_status = module_invoke($module, 'search', 'status');
    $remaining += $module_status['remaining'];
    $total += $module_status['total'];
  }

  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max(1, $total) avoids a division by zero when nothing is indexable.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
  $summary = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';

  $form = array();

  // Indexing status:
  $form['search_admin'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing status'),
  );
  $form['search_admin']['status'] = array(
    '#type' => 'markup',
    '#value' => $summary,
  );

  $limits = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));

  // Indexing throttle:
  $form['indexing_throttle'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing throttle'),
  );
  $form['indexing_throttle']['search_cron_limit'] = array(
    '#type' => 'select',
    '#title' => t('Items to index per cron run'),
    '#default_value' => variable_get('search_cron_limit', 100),
    '#options' => $limits,
    '#description' => t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'),
  );

  // Indexing settings:
  $form['indexing_settings'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing settings'),
  );
  $form['indexing_settings']['info'] = array(
    '#type' => 'markup',
    '#value' => '<em>'. t('<p>Changing the setting below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>',
  );
  $form['indexing_settings']['minimum_word_size'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum word length to index'),
    '#default_value' => variable_get('minimum_word_size', 4),
    '#size' => 5,
    '#maxlength' => 3,
    '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
  );
  $form['indexing_settings']['remove_short'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum word length to search for'),
    '#default_value' => variable_get('remove_short', 3),
    '#size' => 5,
    '#maxlength' => 3,
    '#description' => t('The number of characters a word has to be to be searched for, including wildcard characters.'),
  );

  // Per module settings
  $module_settings = module_invoke_all('search', 'admin');
  return array_merge($form, $module_settings);
}

Dries's avatar
Dries committed
213
/**
 * Wipes a part of or the entire search index.
 *
 * Without arguments, every module is asked to reset its index through
 * hook_search('reset'). With arguments, the dataset and index rows of the
 * given item are deleted, together with the index rows that originate from
 * that item (fromsid / fromtype).
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  $wipe_everything = ($type == NULL && $sid == NULL);
  if ($wipe_everything) {
    module_invoke_all('search', 'reset');
    return;
  }

  db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
  db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
  db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
}

233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/**
 * Marks a word as dirty, or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words which are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. Leave out to read the list instead.
 *
 * @return
 *   When called without a word: an array keyed by every word marked so far in
 *   this request. When called with a word: nothing.
 */
function search_dirty($word = null) {
  static $dirty = array();

  // Getter mode.
  if ($word === null) {
    return $dirty;
  }

  // Setter mode; the words are the keys, so duplicates collapse.
  $dirty[$word] = true;
}

Kjartan's avatar
Kjartan committed
248
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty).
 */
function search_cron() {
  // Let every module update the word index.
  foreach (module_list() as $module) {
    module_invoke($module, 'update_index');
  }

  // Refresh the IDF (Inverse Document Frequency) count of each word that was
  // added or changed during this run.
  foreach (array_keys(search_dirty()) as $word) {
    $score_sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    // Apply Zipf's law to equalize the probability distribution.
    $weight = log10(1 + 1 / max(1, $score_sum));
    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $weight, $word);
    // No row was updated: the word is new, so insert it.
    if (!db_affected_rows()) {
      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $weight);
    }
  }

  // Remove words that were deleted from search_index but still linger in
  // search_total: the LEFT JOIN keeps only the rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($orphan = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $orphan->realword);
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * The steps run in a fixed order: entity decoding, lowercasing, module
 * preprocessing, CJK splitting, joining of punctuated numbers, removal of
 * dots/underscores/dashes and finally replacement of all excluded characters
 * by spaces.
 *
 * @param $text
 *   The text to simplify.
 *
 * @return
 *   The simplified text, with words separated by spaces.
 */
function search_simplify($text) {
  // Decode entities to UTF-8, then lowercase.
  $text = drupal_strtolower(decode_entities($text));

  // Let modules preprocess the text (modified by reference).
  search_preprocess($text);

  // Baseline CJK handling: split every CJK run into overlapping pairs.
  $cjk_run = '/['. PREG_CLASS_CJK .']+/u';
  $text = preg_replace_callback($cjk_run, 'search_expand_cjk', $text);

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, a group of numerical characters separated only by
  // punctuation characters is treated as one piece. As a consequence, a
  // search for e.g. '20/03/1984' also finds '20-03-1984'.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $punctuated_number = '/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u';
  $text = preg_replace($punctuated_number, '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // Everything else that is excluded from the index (punctuation, marks,
  // spacers, ...) counts as a word boundary.
  $excluded_run = '/['. PREG_CLASS_SEARCH_EXCLUDE .']+/u';
  $text = preg_replace($excluded_run, ' ', $text);

  return $text;
}

/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * pairs of characters.
 *
 * Used as a preg_replace_callback() callback by search_simplify().
 *
 * @param $matches
 *   The regular expression match array; $matches[0] is the run of characters
 *   to split.
 *
 * @return
 *   A string that starts with a space and contains every pair of adjacent
 *   characters, each followed by a space.
 *   NOTE(review): a run of a single character produces only ' ', i.e. the
 *   character is dropped. Confirm this is intended.
 */
function search_expand_cjk($matches) {
  $tokens = ' ';
  // Split off first character
  $last = drupal_substr($matches[0], 0, 1);
  $str = substr($matches[0], strlen($last));
  // Begin loop
  $l = drupal_strlen($str);
  for ($i = 0; $i < $l; ++$i) {
    // Grab next character
    $current = drupal_substr($str, 0, 1);
    // Advance by the byte length of the character just read. Using the
    // length of the previous character would cut the string in the wrong
    // place whenever two neighbouring characters differ in byte width.
    $str = substr($str, strlen($current));
    $tokens .= $last . $current .' ';
    $last = $current;
  }
  return $tokens;
}

/**
 * Splits a string into tokens for indexing.
 *
 * The result of the most recent call is cached, so that calling the function
 * twice in a row with the same text only does the work once.
 *
 * @param $text
 *   The text to split.
 *
 * @return
 *   An array of simplified words, each truncated by _search_index_truncate().
 */
function search_index_split($text) {
  static $last = NULL;
  static $lastsplit = NULL;

  // Only reuse the cache when something has been cached, and compare
  // strictly: a loose comparison treats NULL as equal to '' and numeric
  // strings such as '1e3' and '1000' as equal to each other.
  if ($lastsplit !== NULL && $last === $text) {
    return $lastsplit;
  }

  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result, keyed by the text as it was passed in so the
  // lookup above compares like with like.
  $last = $text;
  $lastsplit = $words;

  return $words;
}

357
/**
 * Helper function for array_walk in search_index_split.
 *
 * Truncates the given word in place to at most 50 characters.
 */
function _search_index_truncate(&$text) {
  $truncated = truncate_utf8($text, 50);
  $text = $truncated;
}

364
365
366
367
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess. It is modified in place: every implementing
 *   module receives the output of the previous one.
 */
function search_preprocess(&$text) {
  $modules = module_implements('search_preprocess');
  foreach ($modules as $module) {
    $processed = module_invoke($module, 'search_preprocess', $text);
    $text = $processed;
  }
}

/**
 * Update the full-text search index for a particular item.
 *
 * The HTML is split into tags and text. Words get a score that depends on the
 * tags they are nested in, and words inside a link to a node on this site are
 * additionally credited to that node. The previous index entries of the item
 * are wiped before the new ones are written.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 4);

  // Link matching
  global $base_url;
  $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/)?(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 25,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 3,
                'b' => 3,
                'i' => 3,
                'strong' => 3,
                'em' => 3,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to (valid while $link is set)
  $linktitle = ''; // Title of that node (valid while $link is set)
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  $results = array(0 => array()); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag? substr() is used rather than a string offset
      // because $tagname can be empty (e.g. for "< b>").
      if (substr($tagname, 0, 1) == '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score
          $closed = array_shift($tagstack);
          $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
        }
        if ($tagname == 'a') {
          $link = false;
        }
      }
      else {
        // An empty stack compares as NULL, which keeps the outcome of the
        // comparison unchanged without reading an undefined offset.
        $top = count($tagstack) ? $tagstack[0] : NULL;
        if ($top == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score
          array_unshift($tagstack, $tagname);
          $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
        }
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $nid = $match[1];
              if ($nid > 0) {
                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $nid));
                // Links to nodes that do not exist are not treated as links.
                if ($node && filter_format_allowcache($node->format)) {
                  // The three link variables are always set together, so a
                  // rejected link cannot redirect the words of an open one.
                  $link = true;
                  $linknid = $nid;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        if ($link) {
          // Check to see if the node link text is its URL. If so, we use the target node title instead.
          if (preg_match('!^https?://!i', $value)) {
            $value = $linktitle;
          }
        }
        $words = search_index_split($value);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word .' ';
          $num = is_numeric($word);
          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {
            // Normalize numbers
            if ($num) {
              $word = (int)ltrim($word, '-0');
            }

            // Words inside a link count for the linked node, all other words
            // for the item itself (key 0).
            $target = $link ? $linknid : 0;
            if (!isset($results[$target][$word])) {
              $results[$target][$word] = 0;
            }
            $results[$target][$word] += $score * $focus;

            if (!$link) {
              // Focus is a decaying value in terms of the amount of unique words up to this point.
              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
            }
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);

  // Insert results into search index
  // NOTE(review): the score is a product with the fractional $focus but is
  // written through a %d placeholder; confirm the truncation is intended.
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %d)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query.
 * @param $option
 *   The name of the option to look for. It is matched literally and without
 *   regard to case.
 *
 * @return
 *   The value that follows 'option:' (possibly an empty string), or NULL when
 *   the query does not contain the option.
 */
function search_query_extract($keys, $option) {
  // Quote the option name so characters like '.' or '/' cannot change the
  // meaning of the pattern.
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
  return NULL;
}

/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Existing occurrences of the option are removed first, so the option ends up
 * in the query at most once.
 *
 * @param $keys
 *   The search query.
 * @param $option
 *   The name of the option. It is matched literally and without regard to
 *   case.
 * @param $value
 *   (optional) The value to set. When empty, the option is only removed.
 *
 * @return
 *   The modified query.
 */
function search_query_insert($keys, $option, $value = '') {
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  // Test for the presence of the option itself, not for the truthiness of its
  // value: values such as '0' or '' must be replaced as well.
  if (preg_match($pattern, $keys)) {
    $keys = trim(preg_replace($pattern, '', $keys));
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}

/**
 * Parse a search query into SQL conditions.
 *
 * We build a query that matches the dataset bodies
 *
 * @param $text
 *   The search query as entered by the user. Keywords are separated by
 *   spaces, phrases are enclosed in double quotes, a leading '-' negates a
 *   keyword and an upper case OR between two keywords makes them
 *   alternatives.
 *
 * @return
 *   NULL when the query contains no keywords. Otherwise an array with:
 *   - 0: the conditions on the dataset, joined with AND,
 *   - 1: the arguments for those conditions,
 *   - 2: the conditions on the word index, joined with OR,
 *   - 3: the arguments for those conditions,
 *   - 4: the number of positive keys (a group of ORed keywords counts once).
 */
function search_parse_query($text) {
  $keys = array('positive' => array(), 'negative' => array());

  // Tokenize query string
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);

  if (count($matches) < 1) {
    return NULL;
  }

  // Classify tokens
  $or = false;
  foreach ($matches as $match) {
    $token = $match[2];
    // Strip off quotes
    if ($token[0] == '"') {
      $token = substr($token, 1, -1);
    }
    // Simplify keyword according to indexing rules
    $keyword = search_simplify($token);
    // Negative matches
    if ($match[1] == '-') {
      $keys['negative'][] = $keyword;
    }
    // OR operator: instead of a single keyword, we store an array of all
    // OR'd keywords. The operator is recognised on the token as typed,
    // because simplification lowercases the text.
    elseif ($token === 'OR' && count($keys['positive'])) {
      $previous = array_pop($keys['positive']);
      // Start a new group, or keep extending the group that is already open
      // (as in 'a OR b OR c').
      if (!is_array($previous)) {
        $previous = array($previous);
      }
      $keys['positive'][] = $previous;
      $or = true;
      continue;
    }
    // Plain keyword
    else {
      if ($or) {
        $keys['positive'][count($keys['positive']) - 1][] = $keyword;
      }
      else {
        $keys['positive'][] = $keyword;
      }
    }
    $or = false;
  }

  // Convert keywords into SQL statements.
  $scorewords = array();
  $query = array();
  $query2 = array();
  $arguments = array();
  $arguments2 = array();
  $matches = 0; // Counts the minimal number of words per item must match in the index.
  // Positive matches
  foreach ($keys['positive'] as $key) {
    // Group of ORed terms
    if (is_array($key) && count($key)) {
      $queryor = array();
      foreach ($key as $term) {
        $q = _search_parse_query($term, $scorewords);
        if ($q) {
          $queryor[] = $q;
          $arguments[] = $term;
        }
      }
      if (count($queryor)) {
        $query[] = '('. implode(' OR ', $queryor) .')';
      }
    }
    // Single ANDed term
    else {
      $q = _search_parse_query($key, $scorewords);
      if ($q) {
        $query[] = $q;
        $arguments[] = $key;
      }
    }
    $matches++;
  }
  // Negative matches
  foreach ($keys['negative'] as $key) {
    $q = _search_parse_query($key, $scorewords, true);
    if ($q) {
      $query[] = $q;
      $arguments[] = $key;
    }
  }
  // We separate word-index conditions because they are not needed in the
  // counting query.
  foreach ($scorewords as $word) {
    $query2[] = "i.word = '%s'";
    $arguments2[] = $word;
  }
  $query = implode(' AND ', $query);
  $query2 = implode(' OR ', $query2);
  return array($query, $arguments, $query2, $arguments2, $matches);
}

/**
 * Helper function for search_parse_query();
 *
 * @param $word
 *   A keyword or a phrase of space-separated keywords.
 * @param $scores
 *   The list of words to look up in the word index. For a positive match,
 *   every numeric word and every word that reaches the minimum word size is
 *   appended; numeric words are normalised to integers.
 * @param $not
 *   (optional) TRUE for a negative match. Negative matches add no words to
 *   $scores.
 *
 * @return
 *   The SQL snippet that matches (or excludes) the word or phrase in the
 *   dataset.
 */
function _search_parse_query(&$word, &$scores, $not = false) {
  // Determine the scorewords of this word/phrase
  if (!$not) {
    $split = explode(' ', $word);
    foreach ($split as $s) {
      $num = is_numeric($s);
      if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 4)) {
        // Normalise the current token, not the whole phrase: with the phrase,
        // 'report 2005' would score the number as 0.
        $scores[] = $num ? ((int)ltrim($s, '-0')) : $s;
      }
    }
  }
  // Return matching snippet
  return "d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'";
}

Kjartan's avatar
Kjartan committed
689
/**
Steven Wittens's avatar
Steven Wittens committed
690
 * Do a query on the full-text search index for a word or words.
691
 *
Steven Wittens's avatar
Steven Wittens committed
692
693
 * This function is normally only called by each module that supports the
 * indexed search (and thus, implements hook_update_index()).
694
 *
695
696
697
698
 * Two queries are performed which can be extended by the caller.
 *
 * The first query selects a set of possible matches based on the search index
 * and any extra given restrictions. This is the classic "OR" search.
699
 *
700
 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
701
 * FROM {search_index} i
702
703
704
 * INNER JOIN {search_total} t ON i.word = t.word
 * $join1
 * WHERE $where1 AND (...)
705
 * GROUP BY i.type, i.sid
706
707
708
709
710
711
712
713
714
715
716
 *
 * The second query further refines this set by verifying advanced text
 * conditions (such as AND, negative or phrase matches), and orders the results
 * on the column or expression 'score':
 *
 * SELECT i.type, i.sid, $select2
 * FROM temp_search_sids i
 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
 * $join2
 * WHERE (...)
 * ORDER BY score DESC
717
 *
718
 * @param $keywords
719
720
721
722
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
Kjartan's avatar
Kjartan committed
723
 *
724
725
 * @param $join1
 *   (optional) Inserted into the JOIN part of the first SQL query.
726
727
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
728
729
730
731
732
733
734
735
736
737
738
 * @param $where1
 *   (optional) Inserted into the WHERE part of the first SQL query.
 *   For example "(n.status > %d)".
 *
 * @param $arguments1
 *   (optional) Extra SQL arguments belonging to the first query.
 *
 * @param $select2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
 *   a column selected as 'score'.
 *   defaults to 'i.relevance AS score'
739
 *
740
741
742
743
744
745
 * @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
 *   For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
 *
 * @param $arguments2
 *   (optional) Extra SQL arguments belonging to the second query parameter.
746
 *
747
748
 * @return
 *   An array of SIDs for the search results.
749
750
 *
 * @ingroup search
Kjartan's avatar
Kjartan committed
751
 */
752
753
function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array()) {
  // Layout of the parsed query as it is consumed below:
  //   [0] conditions for the second pass, [1] the arguments for [0],
  //   [2] conditions for the first pass,  [3] the arguments for [2],
  //   [4] the value substituted into "HAVING matches >= %d".
  $parsed = search_parse_query($keywords);
  if ($parsed === NULL) {
    return array();
  }
  if ($parsed[0] == '' || $parsed[2] == '') {
    return array();
  }

  // First pass: collect every sid that could match, using a plain index-based
  // OR over the keywords. The 'matches' count rejects items that cannot
  // possibly satisfy the query.
  $conditions = "$where1 AND ({$parsed[2]}) AND i.type = '%s'";
  $tail = array($type, $parsed[4]);
  $arguments = array_merge($arguments1, $parsed[3], $tail);
  db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING matches >= %d", $arguments, 'temp_search_sids');

  // The highest relevance found is used to normalize all relevance values.
  $max_relevance = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
  if (!$max_relevance) {
    return array();
  }
  $scale = 1.0 / $max_relevance;
  $select2 = str_replace('i.relevance', '('. $scale .' * i.relevance)', $select2);

  // Second pass: keep only the items that also satisfy the complicated keyword
  // conditions (phrase search, negative keywords, ...).
  $conditions = '('. $parsed[0] .')';
  $arguments = array_merge($arguments2, $parsed[1]);
  db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions ORDER BY score DESC", $arguments, 'temp_search_results');
  $total = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'));
  if ($total == 0) {
    return array();
  }

  // Page through the stored results; the total is already known, so the count
  // query handed to the pager simply selects that number.
  $page = pager_query("SELECT * FROM temp_search_results", 10, 0, "SELECT $total", $arguments);
  $sids = array();
  while ($row = db_fetch_object($page)) {
    $sids[] = $row->sid;
  }
  return $sids;
}

790
791
792
793
794
795
796
797
798
799
/**
 * Helper function for grabbing search keys.
 *
 * The keys are the third and following components of the path in $_GET['q']
 * (e.g. "search/node/foo bar" yields "foo bar"). When the path has fewer than
 * three components, the old GET format is supported by falling back to
 * $_REQUEST['keys'].
 *
 * @return
 *   The search keys as a string, or NULL when the request carries none.
 */
function search_get_keys() {
  // Extract keys as remainder of path
  // Note: support old GET format of searches for existing links.
  $path = explode('/', isset($_GET['q']) ? $_GET['q'] : '', 3);
  if (count($path) == 3) {
    return $path[2];
  }

  // A request for e.g. "search/node" has no 'keys' parameter at all; reading
  // it unguarded raises an undefined index notice, so check first.
  return isset($_REQUEST['keys']) ? $_REQUEST['keys'] : NULL;
}

Dries's avatar
   
Dries committed
800
801
802
803
/**
 * Menu callback; presents the search form and/or search results.
 *
 * A POSTed form is redirected to the clean URL search/<type>/<keys>. On the
 * resulting GET request the keys are read back with search_get_keys(), the
 * search is logged and executed, and the form is rendered above the results.
 *
 * @return
 *   The HTML of the search form followed by any results. Nothing is returned
 *   when the user lacks the 'search content' permission; in that case
 *   drupal_access_denied() is called instead.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (isset($_POST['op'])) {
    if ($type == '') {
      $type = 'node';
    }
    // Give the module a chance to rewrite the submitted keys; fall back to the
    // raw submission when it returns NULL.
    $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']);
    drupal_goto('search/'. drupal_urlencode($type) .'/'. drupal_urlencode(is_null($keys) ? $_POST['edit']['keys'] : $keys));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Start with an empty result section so that appending it below is well
    // defined even when no search is performed.
    $results = '';

    // Only perform search if there is non-whitespace search term:
    if (trim($keys)) {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. drupal_urlencode($type) .'/'. drupal_urlencode($keys))
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    return $output;
  }
  else {
    drupal_access_denied();
  }
}

861
862
863
864
865
866
867
868
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
Dries's avatar
Dries committed
884
 * to implement it yourself without hook_search(). In that case, you should
885
886
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
887
888
889
890
891
892
893
894
895
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. When empty, the URL for 'search/' followed by $type is used.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for; the name of a module which
 *   implements hook_search(). That module is asked (through the 'form'
 *   operation of hook_search()) for additional form elements. Optional; when
 *   omitted the default action is built from 'search/' alone.
 * @param $prompt
 *   A piece of text to put before the form. Defaults to the translated
 *   string "Enter your keywords".
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {
  if (!$action) {
    $action = url('search/'. $type);
  }
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $form = array();
  $form['#action'] = $action;
  $form['#attributes'] = array('class' => 'search-form');
  $form['basic'] = array('#type' => 'item', '#title' => $prompt);
  $form['basic']['inline'] = array('#type' => 'markup', '#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
  $form['basic']['inline']['keys'] = array('#type' => 'textfield', '#title' => '', '#default_value' => $keys, '#size' => $prompt ? 40 : 30, '#maxlength' => 255);
  $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));

  // Let the module add its own elements. Only merge an actual array: passing
  // a non-array (e.g. NULL from a module without a 'form' operation) to
  // array_merge() discards the whole form on PHP 5 and later.
  $extra = module_invoke($type, 'search', 'form', $keys);
  if (is_array($extra)) {
    $form = array_merge($form, $extra);
  }

  return drupal_get_form('search_form', $form);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keys. Nothing is searched when this is not set.
 * @param $type
 *   The module whose hook_search() implementation performs the search.
 * @return
 *   The themed result list followed by a pager, or an empty string when the
 *   module has no search hook or the search returned no results.
 */
function search_data($keys = NULL, $type = 'node') {
  if (!isset($keys) || !module_hook($type, 'search')) {
    return '';
  }

  $found = module_invoke($type, 'search', 'search', $keys);
  if (!is_array($found) || !count($found)) {
    return '';
  }

  // Theme every entry first, then wrap the list and append the pager.
  $items = '';
  foreach ($found as $entry) {
    $items .= theme('search_item', $entry, $type);
  }

  return '<dl class="search-results">'. $items .'</dl>'. theme('pager', NULL, 15, 0);
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * Fragments of roughly 60 bytes before and 80 bytes after each keyword
 * occurrence are collected until about 256 bytes have been gathered.
 * Overlapping fragments are merged, the fragments are joined with ' ... ' and
 * every keyword is wrapped in <strong> tags.
 *
 * @param $keys
 *   A string containing a search query.
 *
 * @param $text
 *   The text to extract fragments from. HTML tags are stripped.
 *
 * @return
 *   A string containing HTML for the excerpt. When none of the keywords occur
 *   in the text, the beginning of the text is returned instead.
 */
function search_excerpt($keys, $text) {
  // Extract keywords and phrases: group 2 holds quoted phrases, group 3 bare
  // words. Only the operator OR itself is skipped; a word that merely starts
  // with "OR" (e.g. "ORANGE") is a regular keyword.
  preg_match_all('/ ("([^"]+)"|(?!OR(?:[ "]|$))([^" ]+))/', ' '. $keys, $matches);

  // Every match leaves an empty string in the group that did not take part.
  // Drop those right away: an empty alternative in the highlight pattern below
  // would match at every word boundary. The remaining keys are escaped for use
  // inside a '/'-delimited regular expression.
  $keys = array();
  foreach (array_merge($matches[2], $matches[3]) as $key) {
    if (strlen($key) > 0) {
      $keys[] = preg_quote($key, '/');
    }
  }

  // Prepare text. Tags are replaced by whitespace so that words do not run
  // together. The working copy is padded with a space on both sides so that a
  // keyword at the very start or end of the text still has a space in front of
  // and behind it to delimit its fragment.
  $plain = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
  $text = ' '. $plain .' ';
  $workkeys = $keys;

  // Extract fragments around the keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
  // If the sum of all fragments is too short, we look for second occurrences.
  $ranges = array();
  $included = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      if ($length >= 256) {
        break;
      }
      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
      $found = FALSE;
      if (preg_match('/\b'. $key .'\b/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) {
          $end = substr($text, $p, 80);
          if (($s = strrpos($end, ' ')) !== FALSE) {
            // Several keywords can share the same starting space; keep the
            // longest fragment rather than letting a later one shorten it.
            $ranges[$q] = isset($ranges[$q]) ? max($ranges[$q], $p + $s) : $p + $s;
            $length += $p + $s - $q;
            $included[$key] = $p + 1;
            $found = TRUE;
          }
        }
      }
      // No (further) usable occurrence: stop looking for this key.
      if (!$found) {
        unset($workkeys[$k]);
      }
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($plain, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  $from1 = NULL;
  $to1 = NULL;
  foreach ($ranges as $from2 => $to2) {
    if ($from1 === NULL) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }
  // A leading ellipsis is only added when the excerpt does not start at the
  // beginning of the text.
  $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  $text = preg_replace('/\b('. implode('|', $keys) .')\b/iu', '<strong>\0</strong>', $text);
  return $text;
}

1059
1060
1061
1062
/**
 * @} End of "defgroup search".
 */

1063
1064
1065
/**
 * Helper function for array_walk() in search_excerpt().
 *
 * Escapes a keyword in place so that it can be embedded literally in a
 * regular expression that uses '/' as its delimiter.
 *
 * @param $text
 *   The keyword to escape; modified by reference.
 */
function _search_excerpt_replace(&$text) {
  $escaped = preg_quote($text, '/');
  $text = $escaped;
}

/**
 * Format a single result entry of a search query.
 *
 * Modules may implement hook_search_item() in order to override this default
 * function to display search results.
 *
 * @param $item
 *   A single search result as returned by hook_search(). The result should be
 *   an array with keys "link", "title", "type", "user", "date", and "snippet".
 *   Optionally, "extra" can be an array of extra info to show along with the
 *   result. Empty or missing "type", "user", "date", "snippet" and "extra"
 *   entries are left out of the output. Only the link and title are escaped
 *   here; the other values are inserted into the markup as given.
 * @param $type
 *   The type of item found, such as "user" or "node".
 *
 * @return
 *   The HTML for the entry: the output of the module's hook_search_item() when
 *   it implements one, otherwise a <dt>/<dd> pair.
 *
 * @ingroup themeable
 */
function theme_search_item($item, $type) {
  if (module_hook($type, 'search_item')) {
    $output = module_invoke($type, 'search_item', $item);
  }
  else {
    $output = ' <dt class="title"><a href="'. check_url($item['link']) .'">'. check_plain($item['title']) .'</a></dt>';

    // Gather the info line. Each part is optional, so test with empty() /
    // isset() to avoid undefined index notices for results that omit it.
    $info = array();
    if (!empty($item['type'])) {
      $info[] = $item['type'];
    }
    if (!empty($item['user'])) {
      $info[] = $item['user'];
    }
    if (!empty($item['date'])) {
      $info[] = format_date($item['date'], 'small');
    }
    if (isset($item['extra']) && is_array($item['extra'])) {
      $info = array_merge($info, $item['extra']);
    }

    $snippet = !empty($item['snippet']) ? '<p>'. $item['snippet'] . '</p>' : '';
    $output .= ' <dd>'. $snippet . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';
  }

  return $output;
}