search.module 39.1 KB
Newer Older
Dries's avatar
   
Dries committed
1
<?php
2
// $Id$
Dries's avatar
   
Dries committed
3

Dries's avatar
   
Dries committed
4
5
6
7
8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 */
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// The value is the body of a PCRE character class (no surrounding brackets).
// search_simplify() wraps it in [...] with the /u modifier and replaces every
// run of matching characters with a single space.
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
50
51

/**
 * Matches all 'N' Unicode character classes (numbers).
 */
54
55
56
57
58
59
60
61
62
63
// The value is the body of a PCRE character class (no surrounding brackets).
// search_simplify() uses it, with the /u modifier, on both sides of the
// pattern that joins number groups separated only by punctuation.
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
64
65

/**
 * Matches all 'P' Unicode character classes (punctuation).
 */
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// The value is the body of a PCRE character class (no surrounding brackets).
// search_simplify() uses it, with the /u modifier, to match the punctuation
// that sits between two groups of number characters.
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');

/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 * Contains kana and BMP ideographs.
 *
 * The value is the body of a PCRE character class (no surrounding brackets).
 * search_simplify() uses it, with the /u modifier, to find the runs that are
 * handed to search_expand_cjk() when the 'overlap_cjk' variable is enabled.
 */
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
92

Dries's avatar
   
Dries committed
93
94
95
/**
 * Implementation of hook_help().
 *
 * @param $path
 *   The help path being requested. Three paths are handled here:
 *   'admin/help#search', 'admin/settings/search' and 'search#noresults'.
 * @param $arg
 *   Not used by this implementation.
 *
 * @return
 *   The translated help markup for a handled path; nothing for any other
 *   path.
 */
function search_help($path, $arg) {
  if ($path == 'admin/help#search') {
    // Each entry becomes its own <p>...</p> paragraph, in this order.
    $paragraphs = array(
      t('The search module adds the ability to search for content by keywords. Search is often the only practical way to find content on a large site. Search is useful for finding users and posts by searching on keywords.'),
      t('The search engine works by maintaining an index of the words in your site\'s content. It indexes the posts and users. You can adjust the settings to tweak the indexing behavior. Note that the search requires cron to be set up correctly. The index percentage sets the maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'),
      t('For more information please read the configuration and customization handbook <a href="@search">Search page</a>.', array('@search' => 'http://drupal.org/handbook/modules/search/')),
    );
    return '<p>'. implode('</p><p>', $paragraphs) .'</p>';
  }

  if ($path == 'admin/settings/search') {
    return '<p>'. t('The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behavior. Note that the search requires cron to be set up correctly.') .'</p>';
  }

  if ($path == 'search#noresults') {
    // The literal spans several lines on purpose: it is the translation
    // source string and must stay exactly as is.
    return t('<ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
</ul>');
  }
}
Kjartan's avatar
Kjartan committed
113

114
115
116
117
118
119
120
121
122
123
124
125
126
/**
 * Implementation of hook_theme().
 *
 * @return
 *   An array keyed by theme hook name. The two form hooks take a single
 *   'form' argument; 'search_item' and 'search_page' additionally name
 *   search.pages.inc as the file that holds them.
 */
function search_theme() {
  // Both form hooks share the same definition.
  $form_hook = array(
    'arguments' => array('form' => NULL),
  );
  $pages_file = 'search.pages.inc';

  $hooks = array();
  $hooks['search_theme_form'] = $form_hook;
  $hooks['search_block_form'] = $form_hook;
  $hooks['search_item'] = array(
    'arguments' => array('item' => NULL, 'type' => NULL),
    'file' => $pages_file,
  );
  $hooks['search_page'] = array(
    'arguments' => array('results' => NULL, 'type' => NULL),
    'file' => $pages_file,
  );

  return $hooks;
}

Kjartan's avatar
Kjartan committed
136
/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by this module.
 */
function search_perm() {
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'use advanced search';
  $permissions[] = 'administer search';
  return $permissions;
}

143
144
145
146
147
148
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   The operation: 'list' returns the block definition, 'view' returns the
 *   block itself. Any other value yields nothing.
 * @param $delta
 *   Not used; this module defines a single block (index 0).
 *
 * @return
 *   For 'list', the block definitions; for 'view', an array with 'content'
 *   and 'subject' when the current user has the 'search content' permission.
 *   Nothing otherwise.
 */
function search_block($op = 'list', $delta = 0) {
  switch ($op) {
    case 'list':
      return array(
        0 => array(
          'info' => t('Search form'),
          // Not worth caching.
          'cache' => BLOCK_NO_CACHE,
        ),
      );

    case 'view':
      if (user_access('search content')) {
        return array(
          'content' => drupal_get_form('search_block_form'),
          'subject' => t('Search'),
        );
      }
      break;
  }
}

Dries's avatar
   
Dries committed
160
161
162
/**
 * Implementation of hook_menu().
 *
 * Registers the search page, the settings form and its "clear index"
 * confirmation form, the "Top search phrases" report (served by dblog_top()
 * from the dblog module), and one local task under 'search' for every module
 * that implements hook_search().
 *
 * @return
 *   The menu item definitions, keyed by path.
 */
function search_menu() {
  $admin_access = array('administer search');

  $items = array(
    'search' => array(
      'title' => 'Search',
      'page callback' => 'search_view',
      'access arguments' => array('search content'),
      'type' => MENU_SUGGESTED_ITEM,
      'file' => 'search.pages.inc',
    ),
    'admin/settings/search' => array(
      'title' => 'Search settings',
      'description' => 'Configure relevance settings for search and other indexing options',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('search_admin_settings'),
      'access arguments' => $admin_access,
      'type' => MENU_NORMAL_ITEM,
      'file' => 'search.admin.inc',
    ),
    'admin/settings/search/wipe' => array(
      'title' => 'Clear index',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('search_wipe_confirm'),
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
      'file' => 'search.admin.inc',
    ),
    'admin/reports/search' => array(
      'title' => 'Top search phrases',
      'description' => 'View most popular search phrases.',
      'page callback' => 'dblog_top',
      'page arguments' => array('search'),
      // The callback lives in another module, hence the explicit file path.
      'file' => 'dblog.admin.inc',
      'file path' => drupal_get_path('module', 'dblog'),
    ),
  );

  // One tab per module implementing hook_search(); the tab title is obtained
  // by invoking that hook with the 'name' operation.
  foreach (module_implements('search') as $module) {
    $items['search/'. $module .'/%menu_tail'] = array(
      'title callback' => 'module_invoke',
      'title arguments' => array($module, 'search', 'name', TRUE),
      'page callback' => 'search_view',
      'page arguments' => array($module),
      'access callback' => '_search_menu',
      'access arguments' => array($module),
      'type' => MENU_LOCAL_TASK,
      'parent' => 'search',
      'file' => 'search.pages.inc',
    );
  }

  return $items;
}

213
214
/**
 * Menu access callback for the per-module search tabs.
 *
 * @param $name
 *   The name of a module implementing hook_search().
 *
 * @return
 *   TRUE when the current user has the 'search content' permission and the
 *   module's hook_search('name') returns a non-empty value; FALSE otherwise.
 */
function _search_menu($name) {
  if (!user_access('search content')) {
    return FALSE;
  }
  return (bool)module_invoke($name, 'search', 'name');
}

Dries's avatar
Dries committed
217
/**
 * Wipes a part of or the entire search index.
 *
 * When neither $sid nor $type is given, every module's hook_search('reset')
 * is invoked and nothing is deleted here.
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 * @param $reindex
 *  (optional) When TRUE, only the item's index rows with fromsid = 0 are
 *  deleted, so that link references pointing at the item are kept. Defaults
 *  to FALSE, which deletes all index rows of the item.
 */
function search_wipe($sid = NULL, $type = NULL, $reindex = FALSE) {
  if ($sid == NULL && $type == NULL) {
    module_invoke_all('search', 'reset');
    return;
  }

  // Remove the item's cleaned-up text.
  db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
  // Remove the link references this item contributed to other items.
  db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
  // When re-indexing, keep link references
  $keep_links = $reindex ? " AND fromsid = 0" : '';
  db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'". $keep_links, $sid, $type);
}

238
239
240
241
242
/**
 * Marks a word as dirty (or retrieves the list of dirty words). This is used
 * during indexing (cron). Words which are dirty have outdated total counts in
 * the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. Omit it (or pass NULL) to retrieve
 *   the list instead.
 *
 * @return
 *   When called without a word: an array whose keys are the dirty words
 *   collected so far during this request. Nothing when a word is passed.
 */
function search_dirty($word = NULL) {
  // Collected per request; the words are the array keys so repeats collapse.
  static $dirty = array();

  if ($word === NULL) {
    return $dirty;
  }
  $dirty[$word] = TRUE;
}

Kjartan's avatar
Kjartan committed
253
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty).
 */
function search_cron() {
  // The clean-up is registered as a shutdown function, before any indexing
  // starts, to ensure that search_total is always up to date.
  register_shutdown_function('search_update_totals');

  // Update word index
  $modules = module_list();
  foreach ($modules as $name) {
    module_invoke($name, 'update_index');
  }
}

/**
 * This function is called on shutdown to ensure that search_total is always
 * up to date (even if cron times out or otherwise fails).
 *
 * For every word flagged through search_dirty() the stored count is
 * recomputed from the word's summed scores in search_index; afterwards words
 * that no longer occur in search_index are removed from search_total.
 */
function search_update_totals() {
  // Update word IDF (Inverse Document Frequency) counts for new/changed words
  $dirty_words = array_keys(search_dirty());
  foreach ($dirty_words as $dirty_word) {
    // Get total count
    $score_sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $dirty_word));
    // Apply Zipf's law to equalize the probability distribution
    $weight = log10(1 + 1/(max(1, $score_sum)));

    // Update the existing row; insert one only when no row was affected.
    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $weight, $dirty_word);
    if (!db_affected_rows()) {
      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $dirty_word, $weight);
    }
  }

  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($orphan = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $orphan->realword);
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * The steps run in a fixed order: entity decoding, lowercasing, module
 * preprocessing, optional CJK expansion, joining of number groups, removal of
 * dots/underscores/dashes, and finally replacement of all excluded characters
 * with spaces.
 *
 * @param $text
 *   The text to simplify.
 *
 * @return
 *   The simplified text, with words separated by spaces.
 */
function search_simplify($text) {
  // Decode entities to UTF-8, then lowercase.
  $text = drupal_strtolower(decode_entities($text));

  // Call an external processor for word handling. The text is passed by
  // reference and modified in place.
  search_invoke_preprocess($text);

  // Simple CJK handling
  if (variable_get('overlap_cjk', TRUE)) {
    $cjk_runs = '/['. PREG_CLASS_CJK .']+/u';
    $text = preg_replace_callback($cjk_runs, 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $number = '['. PREG_CLASS_NUMBERS .']';
  $punctuation = '['. PREG_CLASS_PUNCTUATION .']';
  $text = preg_replace('/('. $number .'+)'. $punctuation .'+(?='. $number .')/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behavior with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $excluded = '/['. PREG_CLASS_SEARCH_EXCLUDE .']+/u';
  return preg_replace($excluded, ' ', $text);
}

/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * sequences of characters ('minimum_word_size' long).
 *
 * Used as a preg_replace_callback() callback by search_simplify().
 *
 * @param $matches
 *   The match array; only element 0 (the matched run) is used.
 *
 * @return
 *   The run itself padded with one space on each side when it is no longer
 *   than the minimum word size; otherwise a space followed by every
 *   overlapping sequence, each followed by a space.
 */
function search_expand_cjk($matches) {
  $size = variable_get('minimum_word_size', 3);
  $remaining = $matches[0];
  $length = drupal_strlen($remaining);

  // Passthrough short words
  if ($length <= $size) {
    return ' '. $remaining .' ';
  }

  $tokens = ' ';
  // FIFO queue of characters
  $window = array();
  $position = 0;
  while ($position < $length) {
    // Grab next character, then drop exactly its bytes from the front.
    $char = drupal_substr($remaining, 0, 1);
    $remaining = substr($remaining, strlen($char));
    $window[] = $char;

    // Once the queue is full, emit it and slide it forward by one character.
    if ($position >= $size - 1) {
      $tokens .= implode('', $window) .' ';
      array_shift($window);
    }
    ++$position;
  }
  return $tokens;
}

/**
 * Splits a string into tokens for indexing.
 *
 * The result of the most recent call is remembered, so that calling the
 * function again with exactly the same input does not repeat the work.
 *
 * @param $text
 *   The text to split.
 *
 * @return
 *   An array of words: the simplified text (see search_simplify()) split on
 *   single spaces, with every word passed through _search_index_truncate().
 *   Empty strings may be present where the simplified text has adjacent
 *   spaces.
 */
function search_index_split($text) {
  static $last = NULL;
  static $lastsplit = NULL;

  // Only a strict match on the raw input may reuse the previous result. A
  // loose comparison treats numeric strings such as '1e3' and '1000' as
  // equal, and lets '' match the initial NULL before anything was stored.
  if ($lastsplit !== NULL && $last === $text) {
    return $lastsplit;
  }

  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result, keyed on the input as it was passed in.
  $last = $text;
  $lastsplit = $words;

  return $words;
}

383
/**
 * Helper function for array_walk in search_index_split.
 *
 * @param $text
 *   A single word, passed by reference; it is replaced in place by the
 *   result of truncate_utf8($text, 50).
 */
function _search_index_truncate(&$text) {
  $truncated = truncate_utf8($text, 50);
  $text = $truncated;
}

390
391
392
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, passed by reference. Each implementing module
 *   receives the output of the previous one, and the final result replaces
 *   the caller's variable.
 */
function search_invoke_preprocess(&$text) {
  $implementations = module_implements('search_preprocess');
  foreach ($implementations as $implementation) {
    $text = module_invoke($implementation, 'search_preprocess', $text);
  }
}

/**
 * Update the full-text search index for a particular item.
 *
 * The text is reduced to the HTML tags that carry a score multiplier, split
 * into words and scored. The item's previous entries are then wiped with
 * search_wipe($sid, $type, TRUE) and the new data is written to the
 * search_dataset and search_index tables. Every indexed word is flagged with
 * search_dirty(). Words inside links to nodes on this site are additionally
 * recorded against the linked node.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  // Link matching
  global $base_url;
  $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/|'. preg_quote(base_path(), '@') .')(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 25,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 3,
                'b' => 3,
                'i' => 3,
                'strong' => 3,
                'em' => 3,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = FALSE; // Odd/even counter. Tag or no tag.
  $link = FALSE; // State variable for link analyser
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  // Accumulator for words for index. Key 0 holds the item's own words; any
  // other key is the nid of a linked node.
  $results = array(0 => array());

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag?
      if ($tagname[0] == '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score
          $score = max(1, $score - $tags[array_shift($tagstack)]);
        }
        if ($tagname == 'a') {
          $link = FALSE;
        }
      }
      else {
        if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score
          array_unshift($tagstack, $tagname);
          $score += $tags[$tagname];
        }
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
                // The lookup yields no object when the link points at a nid
                // that does not exist; such links are ignored as well.
                if ($node && filter_format_allowcache($node->format)) {
                  $link = TRUE;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        if ($link) {
          // Check to see if the node link text is its URL. If so, we use the target node title instead.
          if (preg_match('!^https?://!i', $value)) {
            $value = $linktitle;
          }
        }
        $words = search_index_split($value);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word .' ';
          $num = is_numeric($word);
          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {
            // Normalize numbers
            if ($num) {
              $word = (int)ltrim($word, '-0');
            }

            if ($link) {
              // Start every (node, word) pair at zero before adding to it;
              // this also creates the per-node array on first use.
              if (!isset($results[$linknid][$word])) {
                $results[$linknid][$word] = 0;
              }
              $results[$linknid][$word] += $score * $focus;
            }
            else {
              if (!isset($results[0][$word])) {
                $results[0][$word] = 0;
              }
              $results[0][$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique words up to this point.
              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
            }
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  // Drop the item's old entries; TRUE keeps link references to this item.
  search_wipe($sid, $type, TRUE);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The name of the option to look for (the part before the colon). It is
 *   matched literally and case-insensitively.
 *
 * @return
 *   The value of the first occurrence of the option (the text between the
 *   colon and the next space, possibly an empty string), or nothing (NULL)
 *   when the option is not present.
 */
function search_query_extract($keys, $option) {
  // Escape the option name so that characters such as '.' or '+' in it are
  // not interpreted as regular expression syntax.
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
}

/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Every existing occurrence of the option is removed first, whatever its
 * current value is (including an empty value or '0').
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The name of the option (the part before the colon). It is matched
 *   literally and case-insensitively.
 * @param $value
 *   (optional) The new value. When empty, the option is only removed.
 *
 * @return
 *   The modified query string.
 */
function search_query_insert($keys, $option, $value = '') {
  // Test for the option itself rather than for a non-empty extracted value,
  // otherwise 'type:0' or 'type:' would be left in place and duplicated.
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  if (preg_match($pattern, $keys)) {
    $keys = trim(preg_replace($pattern, '', $keys));
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}

/**
 * Parse a search query into SQL conditions.
 *
 * We build a query that matches the dataset bodies.
 *
 * @param $text
 *   The search query. Tokens are separated by spaces; a token may be a
 *   double-quoted phrase, may be prefixed with '-' to negate it, and the
 *   token OR combines the keywords around it into one group.
 *
 * @return
 *   NULL when the query contains no tokens. Otherwise an array with, in
 *   order:
 *   - The conditions on d.data, joined with AND (OR groups in parentheses).
 *   - The keywords/phrases for the placeholders in those conditions.
 *   - "i.word = '%s'" conditions joined with OR, one per collected word.
 *   - The words collected by _search_parse_query(), keyed by word.
 *   - A match count: the number of newly collected words of every ANDed
 *     term, plus one for every OR group that contributed at least one word.
 */
function search_parse_query($text) {
  $keys = array('positive' => array(), 'negative' => array());

  // Tokenize query string
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);

  if (count($matches) < 1) {
    return NULL;
  }

  // Classify tokens
  $or = FALSE;
  foreach ($matches as $match) {
    $phrase = FALSE;
    // Strip off phrase quotes. Square-bracket offsets are used because the
    // curly-brace form is a fatal error as of PHP 8.
    if ($match[2][0] == '"') {
      $match[2] = substr($match[2], 1, -1);
      $phrase = TRUE;
    }
    // Simplify keyword according to indexing rules and external preprocessors
    $words = search_simplify($match[2]);
    // Re-explode in case simplification added more words, except when matching a phrase
    $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
    // Negative matches
    if ($match[1] == '-') {
      $keys['negative'] = array_merge($keys['negative'], $words);
    }
    // OR operator: instead of a single keyword, we store an array of all
    // OR'd keywords.
    elseif ($match[2] == 'OR' && count($keys['positive'])) {
      $last = array_pop($keys['positive']);
      // Starting a new OR?
      if (!is_array($last)) {
        $last = array($last);
      }
      $keys['positive'][] = $last;
      $or = TRUE;
      // Skip the reset below so the next keyword joins this group.
      continue;
    }
    // Plain keyword
    else {
      if ($or) {
        // Add to last element (which is an array)
        $group = count($keys['positive']) - 1;
        $keys['positive'][$group] = array_merge($keys['positive'][$group], $words);
      }
      else {
        $keys['positive'] = array_merge($keys['positive'], $words);
      }
    }
    $or = FALSE;
  }

  // Convert keywords into SQL statements.
  $query = array();
  $arguments = array();
  $arguments2 = array();
  $matches = 0;
  // Positive matches
  foreach ($keys['positive'] as $key) {
    // Group of ORed terms
    if (is_array($key) && count($key)) {
      $queryor = array();
      $any = FALSE;
      foreach ($key as $or) {
        list($q, $count) = _search_parse_query($or, $arguments2);
        $any |= $count;
        if ($q) {
          $queryor[] = $q;
          $arguments[] = $or;
        }
      }
      if (count($queryor)) {
        $query[] = '('. implode(' OR ', $queryor) .')';
        // A group of OR keywords only needs to match once
        $matches += ($any > 0);
      }
    }
    // Single ANDed term
    else {
      list($q, $count) = _search_parse_query($key, $arguments2);
      if ($q) {
        $query[] = $q;
        $arguments[] = $key;
        // Each AND keyword needs to match at least once
        $matches += $count;
      }
    }
  }
  // Negative matches
  foreach ($keys['negative'] as $key) {
    list($q) = _search_parse_query($key, $arguments2, TRUE);
    if ($q) {
      $query[] = $q;
      $arguments[] = $key;
    }
  }
  $query = implode(' AND ', $query);

  // Build word-index conditions for the first pass. The trailing ' OR ' of
  // the repeated fragment is cut off.
  $query2 = substr(str_repeat("i.word = '%s' OR ", count($arguments2)), 0, -4);

  return array($query, $arguments, $query2, $arguments2, $matches);
}

/**
 * Helper function for search_parse_query();
 *
 * @param $word
 *   A keyword or phrase. Declared by reference, but not modified here.
 * @param $scores
 *   Array of words collected so far, passed by reference. For a non-negated
 *   term, every part of $word that is numeric or at least
 *   'minimum_word_size' characters long is added to it (numbers in
 *   normalized integer form) unless already present.
 * @param $not
 *   (optional) TRUE for a negated term. Negated terms add nothing to
 *   $scores.
 *
 * @return
 *   An array of two elements: the SQL condition on d.data for this term
 *   (containing a %s placeholder), and the number of words newly added to
 *   $scores.
 */
function _search_parse_query(&$word, &$scores, $not = FALSE) {
  $added = 0;

  // Determine the scorewords of this word/phrase
  if (!$not) {
    $minimum_word_size = variable_get('minimum_word_size', 3);
    foreach (explode(' ', $word) as $piece) {
      $numeric = is_numeric($piece);
      if (!$numeric && !(drupal_strlen($piece) >= $minimum_word_size)) {
        continue;
      }
      // Numbers are normalized the same way search_index() stores them.
      if ($numeric) {
        $piece = (int)ltrim($piece, '-0');
      }
      if (!isset($scores[$piece])) {
        $scores[$piece] = $piece;
        $added++;
      }
    }
  }

  // Return matching snippet and number of added words
  $negation = $not ? 'NOT ' : '';
  return array("d.data ". $negation ."LIKE '%% %s %%'", $added);
}

Kjartan's avatar
Kjartan committed
735
/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that supports the
 * indexed search (and thus, implements hook_update_index()).
 *
 * Two queries are performed which can be extended by the caller.
 *
 * The first query selects a set of possible matches based on the search index
 * and any extra given restrictions. This is the classic "OR" search.
 *
 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
 * FROM {search_index} i
 * INNER JOIN {search_total} t ON i.word = t.word
 * $join1
 * WHERE $where1 AND (...)
 * GROUP BY i.type, i.sid
 *
 * The second query further refines this set by verifying advanced text
 * conditions (such as AND, negative or phrase matches), and orders the results
 * on the column or expression 'score':
 *
 * SELECT i.type, i.sid, $select2
 * FROM temp_search_sids i
 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
 * $join2
 * WHERE (...)
 * ORDER BY score DESC
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *
 * @param $join1
 *   (optional) Inserted into the JOIN part of the first SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where1
 *   (optional) Inserted into the WHERE part of the first SQL query.
 *   For example "(n.status > %d)".
 *
 * @param $arguments1
 *   (optional) Extra SQL arguments belonging to the first query.
 *
 * @param $select2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
 *   a column selected as 'score'. Any occurrence of 'i.relevance' is replaced
 *   by the relevance normalized against the maximum relevance found.
 *   Defaults to 'i.relevance AS score'.
 *
 * @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
 *   For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
 *
 * @param $arguments2
 *   (optional) Extra SQL arguments belonging to the second query.
 *
 * @param $sort_parameters
 *   (optional) SQL clause appended to the second query to sort the final
 *   results. Defaults to 'ORDER BY score DESC'.
 *
 * @return
 *   An array of result row objects read from the second-pass result set
 *   through pager_query() (called with a limit argument of 10), or an empty
 *   array if the keywords are unusable or nothing matches.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array(), $sort_parameters = 'ORDER BY score DESC') {
  // Parsed query layout: 0 = text conditions, 1 = their arguments,
  // 2 = word-index conditions, 3 = their arguments, 4 = minimum match count.
  $query = search_parse_query($keywords);

  // Check the parse result is an array before indexing into it; a missing or
  // empty word-index condition means there is no usable positive keyword.
  $has_index_terms = is_array($query) && $query[2] != '';
  if (!$has_index_terms) {
    form_set_error('keys', t('You must include at least one positive keyword with @count characters or more.', array('@count' => variable_get('minimum_word_size', 3))));
  }
  if (!$has_index_terms || $query[0] == '') {
    return array();
  }

  // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
  // 'matches' is used to reject those items that cannot possibly match the query.
  $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
  // Argument order follows placeholder order: caller's $where1 arguments, one
  // word per index condition, the type, then the HAVING threshold.
  $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
  db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d", $arguments, 'temp_search_sids');

  // Calculate maximum relevance, to normalize it
  $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
  if (!$normalize) {
    // No rows (or zero relevance) in the first pass: nothing can match.
    return array();
  }
  $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);

  // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
  $conditions = '('. $query[0] .')';
  $arguments = array_merge($arguments2, $query[1]);
  db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions $sort_parameters", $arguments, 'temp_search_results');
  if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
    return array();
  }
  // The total is already known, so hand the pager a constant count query.
  $count_query = "SELECT $count";

  // Do actual search query
  $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query);
  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item;
  }
  return $results;
}

843
844
845
846
/**
 * Helper function for grabbing search keys.
 *
 * The keys are taken from the third and later components of the 'q' path
 * (everything after the second slash). If the path has fewer than three
 * components, the 'keys' request parameter is used instead. The result is
 * computed once per request and cached in a static variable.
 *
 * @return
 *   The search keys as a string; empty if none were found.
 */
function search_get_keys() {
  static $return;
  if (!isset($return)) {
    // Extract keys as remainder of path
    // Note: support old GET format of searches for existing links.
    // Treat a missing 'q' as an empty path instead of reading an undefined
    // index.
    $q = isset($_GET['q']) ? $_GET['q'] : '';
    $path = explode('/', $q, 3);
    $keys = empty($_REQUEST['keys']) ? '' : $_REQUEST['keys'];
    $return = count($path) == 3 ? $path[2] : $keys;
  }
  return $return;
}

858
859
860
861
862
863
864
865
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $form_state
 *   The form state array, passed by reference. It is not read or changed by
 *   this builder.
 * @param $action
 *   Form action. If empty, defaults to the URL of 'search/' followed by $type.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for. Must be the name of a module
 *   which implements hook_search(). Defaults to NULL, in which case the
 *   default action is the URL of 'search/'.
 * @param $prompt
 *   A piece of text to put before the form (e.g. "Enter your keywords").
 *   If NULL, the translated string 'Enter your keywords' is used.
 * @return
 *   A form array describing the search form.
 */
function search_form(&$form_state, $action = '', $keys = '', $type = NULL, $prompt = NULL) {

  // Add CSS
  drupal_add_css(drupal_get_path('module', 'search') .'/search.css', 'module', 'all', FALSE);

  if (!$action) {
    $action = url('search/'. $type);
  }
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $form = array(
    '#action' => $action,
    '#attributes' => array('class' => 'search-form'),
  );
  $form['module'] = array('#type' => 'value', '#value' => $type);
  $form['basic'] = array('#type' => 'item', '#title' => $prompt);
  $form['basic']['inline'] = array('#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
  $form['basic']['inline']['keys'] = array(
    '#type' => 'textfield',
    '#title' => '',
    '#default_value' => $keys,
    // An explicitly empty prompt gets the narrower text field.
    '#size' => $prompt ? 40 : 20,
    '#maxlength' => 255,
  );
  // processed_keys is used to coordinate keyword passing between other forms
  // that hook into the basic search form.
  $form['basic']['inline']['processed_keys'] = array('#type' => 'value', '#value' => array());
  $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));

  return $form;
}

/**
 * Form builder; Output a search form for the search block and the theme's search box.
 *
 * @param $form_state
 *   The form state array, passed by reference. It is not read or changed by
 *   this builder.
 * @param $form_id
 *   The form ID, used as a prefix for the name of the keywords text field.
 *
 * @return
 *   A form array with the keywords text field, a submit button and the
 *   submit and validate handlers registered.
 *
 * @ingroup forms
 * @see search_box_form_submit().
 * @see theme_search_box_form().
 */
function search_box(&$form_state, $form_id) {
  $form = array();
  // Use search_keys instead of keys to avoid ID conflicts with the search block.
  $form[$form_id .'_keys'] = array(
    '#title' => t('Search this site'),
    '#type' => 'textfield',
    '#size' => 15,
    '#default_value' => '',
    '#attributes' => array('title' => t('Enter the terms you wish to search for.')),
  );
  $form['submit'] = array('#type' => 'submit', '#value' => t('Search'));
  $form['#submit'][] = 'search_box_form_submit';
  $form['#validate'][] = 'search_box_form_validate';

  return $form;
}

/**
 * Process a block search form submission.
 *
 * Sets the redirect target to 'search/node/' followed by the trimmed keywords
 * that were submitted.
 *
 * @param $form
 *   The form array. Its form ID is read from $form['form_id']['#value'].
 * @param $form_state
 *   The form state array, passed by reference. The keywords are read from
 *   $form_state['values'] under the key "<form ID>_keys", and the result is
 *   written to $form_state['redirect'].
 */
function search_box_form_submit($form, &$form_state) {
  $form_id = $form['form_id']['#value'];
  $form_state['redirect'] = 'search/node/'. trim($form_state['values'][$form_id .'_keys']);
}

/**
 * Theme the theme search form.
 *
 * @param $form
 *   The form array to render.
 *
 * @return
 *   The rendered form wrapped in a div with ID "search" and class
 *   "container-inline".
 *
 * @ingroup themeable
 */
function theme_search_theme_form($form) {
  return '<div id="search" class="container-inline">'. drupal_render($form) .'</div>';
}

/**
 * Theme the block search form.
 *
 * @param $form
 *   The form array to render.
 *
 * @return
 *   The rendered form wrapped in a div with class "container-inline".
 *
 * @ingroup themeable
 */
function theme_search_block_form($form) {
  return '<div class="container-inline">'. drupal_render($form) .'</div>';
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   (optional) The search keys. If not set, no search is performed.
 * @param $type
 *   (optional) The name of the module whose 'search' hook is invoked.
 *   Defaults to 'node'.
 *
 * @return
 *   The output of the module's 'search_page' hook if it implements one,
 *   otherwise the output of theme('search_page'). NULL if $keys is not set,
 *   the module has no 'search' hook, or the search gave no results.
 */
function search_data($keys = NULL, $type = 'node') {
  if (!isset($keys) || !module_hook($type, 'search')) {
    return;
  }

  $results = module_invoke($type, 'search', 'search', $keys);
  if (!isset($results) || !is_array($results) || !count($results)) {
    return;
  }

  // Let the module format its own results page when it provides a hook for it.
  if (module_hook($type, 'search_page')) {
    return module_invoke($type, 'search_page', $results);
  }
  return theme('search_page', $results, $type);
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
1009
 *   A string containing a search query.
1010
1011
1012
1013
1014
1015
1016
1017
 *
 * @param $text
 *   The text to extract fragments from.
 *
 * @return
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text) {
Steven Wittens's avatar
Steven Wittens committed
1018
1019
1020
  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))';

1021
1022
1023
1024
1025
  // Extract positive keywords and phrases
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' '. $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text
Steven Wittens's avatar
Steven Wittens committed
1026
  $text = ' '. strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)) .' ';
1027
1028
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;
1029
1030
1031
1032

  // Extract a fragment per keyword for at most 4 keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
1033
  // If the sum of all fragments is too short, we look for second occurrences.
1034
1035
1036
  $ranges = array();
  $included = array();
  $length = 0;
1037
1038
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
1039
      if (strlen($key) == 0) {
1040
        unset($workkeys[$k]);
1041
        unset($keys[$k]);
1042
1043
1044
1045
1046
        continue;
      }
      if ($length >= 256) {
        break;
      }
1047
      // Remember occurrence of key so we can skip over it if more occurrences
1048
1049
1050
1051
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
1052
1053
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
Steven Wittens's avatar
Steven Wittens committed
1054
      if (preg_match('/'. $boundary . $key . $boundary .'/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
1055
        $p = $match[0][1];
1056
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) {
1057
          $end = substr($text, $p, 80);
1058
          if (($s = strrpos($end, ' ')) !== FALSE) {
1059
1060
1061
1062
1063
            $ranges[$q]