<?php
// $Id$

/**
 * @file
 * Enables site-wide keyword searching.
 */

/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 */
// The value is a list of PCRE \x{...} code point escapes and ranges. It is
// meant to be embedded between [ and ] in a pattern that uses the /u modifier,
// which is how search_simplify() turns excluded characters into word boundaries.
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
50
51

/**
 * Matches all 'N' Unicode character classes (numbers).
 */
// The value is a list of PCRE \x{...} code point escapes and ranges, meant to
// be embedded between [ and ] in a pattern that uses the /u modifier.
// search_simplify() uses it to find runs of digits joined by punctuation.
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
64
65

/**
 * Matches all 'P' Unicode character classes (punctuation).
 */
// The value is a list of PCRE \x{...} code point escapes and ranges, meant to
// be embedded between [ and ] in a pattern that uses the /u modifier.
// search_simplify() uses it to strip punctuation found between digits.
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');

/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 *
 * Contains kana and BMP ideographs. The value is a list of PCRE \x{...} code
 * point ranges, meant to be embedded between [ and ] in a pattern that uses
 * the /u modifier; search_simplify() feeds each matched run to
 * search_expand_cjk().
 */
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
92

Dries's avatar
   
Dries committed
93
94
95
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested, e.g. 'admin/help#search'.
 *
 * @return
 *   The translated help text for the four sections handled below. Nothing is
 *   returned for any other section.
 */
function search_help($section) {
  switch ($section) {
    case 'admin/help#search':
      $output = '<p>'. t('The search module adds the ability to search for content by keywords.  Search is often the only practical way to find content on a large site.  Search is useful for finding users and posts by searching on keywords.') .'</p>';
      $output .= '<p>'. t('The search engine works by maintaining an index of the words in your site\'s content. It indexes the posts and users.  You can adjust the settings to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.  The index percentage sets the maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.') .'</p>';
      // Every list item is explicitly closed so the emitted list is
      // well-formed HTML.
      $output .= t('<p>You can</p>
<ul>
<li>read about how your site uses cron in the <a href="%admin-help-system">administer &gt;&gt; help &gt;&gt; system</a>.</li>
<li>run your <a href="%file-cron">cron.php</a>.</li>
<li>read about <a href="%external-http-drupal-org-node-23714">configuring cron jobs</a>.</li>
<li><a href="%admin-settings-search">administer &gt;&gt; settings &gt;&gt; search</a>.</li>
</ul>
', array('%admin-help-system' => url('admin/help/system'), '%file-cron' => 'cron.php', '%external-http-drupal-org-node-23714' => 'http://drupal.org/node/23714', '%admin-settings-search' => url('admin/settings/search')));
      $output .= '<p>'. t('For more information please read the configuration and customization handbook <a href="%search">Search page</a>.', array('%search' => 'http://www.drupal.org/handbook/modules/search/')) .'</p>';
      return $output;

    case 'admin/modules#description':
      return t('Enables site-wide keyword searching.');

    case 'admin/settings/search':
      return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');

    case 'search#noresults':
      return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
</ul></p>');
  }
}
Kjartan's avatar
Kjartan committed
124
125

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The names of the permissions defined by the search module.
 */
function search_perm() {
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'administer search';

  return $permissions;
}

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   The operation: 'list' returns the block definitions, 'view' returns the
 *   block's subject and content. Any other value yields no return value.
 * @param $delta
 *   Which block is requested. Not used: this module defines a single block.
 *
 * @return
 *   For 'list', an array describing the search form block. For 'view', the
 *   block array, but only when the user may search content and the current
 *   path does not start with 'search' (the search page shows its own form).
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks = array();
    $blocks[0]['info'] = t('Search form');
    return $blocks;
  }

  if ($op == 'view' && user_access('search content') && arg(0) != 'search') {
    $block = array();
    $block['content'] = search_form('', '', null, '');
    $block['subject'] = t('Search');
    return $block;
  }
}

Dries's avatar
   
Dries committed
148
149
150
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   When true, the static items (search page and index-wipe confirmation) are
 *   returned. Otherwise, and only while the current path starts with
 *   'search', one local task per module that implements hook_search() and
 *   reports a name is returned.
 *
 * @return
 *   An array of menu item definitions (possibly empty).
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    $items[] = array(
      'path' => 'search',
      'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM,
    );
    $items[] = array(
      'path' => 'admin/settings/search/wipe',
      'title' => t('Clear index'),
      'callback' => 'search_wipe_confirm',
      'access' => user_access('administer search'),
      'type' => MENU_CALLBACK,
    );
  }
  else if (arg(0) == 'search') {
    // The current keywords are appended to every tab's path so that they are
    // remembered when the user switches between search tabs.
    $keys = search_get_keys();
    $suffix = strlen($keys) ? '/'. $keys : '';

    foreach (module_list() as $name) {
      if (!module_hook($name, 'search')) {
        continue;
      }
      $title = module_invoke($name, 'search', 'name');
      if (!$title) {
        continue;
      }
      $items[] = array(
        'path' => 'search/'. $name . $suffix,
        'title' => $title,
        'callback' => 'search_view',
        'access' => user_access('search content'),
        'type' => MENU_LOCAL_TASK,
      );
    }
  }

  return $items;
}

/**
 * Validation handler for the search settings form.
 *
 * If the form was submitted through the 'Re-index site' button, the user is
 * sent to the wipe confirmation page. Otherwise, if one of the settings that
 * influence how text is indexed differs from its stored value, the index is
 * marked for rebuilding.
 *
 * @param $form_id
 *   The id of the form being validated (not used).
 * @param $form
 *   The submitted values; 'minimum_word_size' and 'overlap_cjk' are read.
 */
function search_settings_form_validate($form_id, &$form) {
  // 'op' is not guaranteed to be part of the request, so check before reading.
  if (isset($_POST['op']) && $_POST['op'] == t('Re-index site')) {
    drupal_goto('admin/settings/search/wipe');
  }

  // If these settings change, the index needs to be rebuilt.
  $word_size_changed = variable_get('minimum_word_size', 3) != $form['minimum_word_size'];
  $cjk_changed = variable_get('overlap_cjk', true) != $form['overlap_cjk'];
  if ($word_size_changed || $cjk_changed) {
    drupal_set_message(t('The index will be rebuilt.'));
    search_wipe();
  }
}
Dries's avatar
   
Dries committed
196

197
198
199
200
/**
 * Menu callback; displays the search module settings page.
 *
 * @return
 *   A form array holding the indexing status, the cron throttle, the indexing
 *   settings and whatever the modules return for hook_search('admin').
 */
function search_settings() {
  $form = array();

  // Add up the indexing status reported by every module with hook_search().
  $left = 0;
  $all = 0;
  foreach (module_list() as $module) {
    if (!module_hook($module, 'search')) {
      continue;
    }
    $module_status = module_invoke($module, 'search', 'status');
    $left += $module_status['remaining'];
    $all += $module_status['total'];
  }

  $count = format_plural($left, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max() guards against division by zero when nothing is indexable; min()
  // caps the displayed value at 100.
  $ratio = 100 * ($all - $left) / max(1, $all);
  $percentage = ((int)min(100, $ratio)) . '%';
  $summary = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';

  $form['status'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing status'),
  );
  $form['status']['status'] = array(
    '#type' => 'markup',
    '#value' => $summary,
  );
  $form['status']['wipe'] = array(
    '#type' => 'submit',
    '#value' => t('Re-index site'),
  );

  $limits = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));

  // Indexing throttle:
  $form['indexing_throttle'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing throttle'),
  );
  $form['indexing_throttle']['search_cron_limit'] = array(
    '#type' => 'select',
    '#title' => t('Items to index per cron run'),
    '#default_value' => variable_get('search_cron_limit', 100),
    '#options' => $limits,
    '#description' => t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'),
  );

  // Indexing settings:
  $form['indexing_settings'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing settings'),
  );
  $form['indexing_settings']['info'] = array(
    '#type' => 'markup',
    '#value' => '<em>'. t('<p>Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>',
  );
  $form['indexing_settings']['minimum_word_size'] = array(
    '#type' => 'textfield',
    '#title' => t('Minimum word length to index'),
    '#default_value' => variable_get('minimum_word_size', 3),
    '#size' => 5,
    '#maxlength' => 3,
    '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'),
  );
  $form['indexing_settings']['overlap_cjk'] = array(
    '#type' => 'checkbox',
    '#title' => t('Simple CJK handling'),
    '#default_value' => variable_get('overlap_cjk', true),
    '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.'),
  );

  // Per module settings
  $module_settings = module_invoke_all('search', 'admin');
  return array_merge($form, $module_settings);
}

234
235
236
237
238
/**
 * Menu callback: confirm wiping of the index.
 *
 * @return
 *   The confirmation form built by confirm_form(). Cancelling leads back to
 *   the search settings page, which is also where
 *   search_wipe_confirm_submit() redirects after a confirmed wipe.
 */
function search_wipe_confirm() {
  // No extra form elements are needed besides the confirmation itself.
  $form = array();

  return confirm_form(
    'search_wipe_confirm',
    $form,
    t('Are you sure you want to re-index the site?'),
    'admin/settings/search',
    t(' The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed. This action cannot be undone.'),
    t('Re-index site'),
    t('Cancel')
  );
}

/**
 * Submit handler for the wipe confirmation form.
 *
 * @param $form_id
 *   The id of the submitted form (not used).
 * @param $form
 *   The submitted values; nothing happens unless 'confirm' is set to a
 *   non-empty value.
 */
function search_wipe_confirm_submit($form_id, &$form) {
  if (!$form['confirm']) {
    return;
  }

  search_wipe();
  drupal_set_message(t('The index will be rebuilt.'));
  drupal_goto('admin/settings/search');
}

Dries's avatar
Dries committed
253
/**
 * Wipes a part of or the entire search index.
 *
 * Without arguments, every module is asked to reset its index through
 * hook_search('reset'). Otherwise the dataset row and the index rows that
 * belong to, or originate from, the given item are deleted.
 *
 * Note that the arguments are compared loosely against NULL.
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  $wipe_everything = ($type == NULL && $sid == NULL);
  if ($wipe_everything) {
    module_invoke_all('search', 'reset');
    return;
  }

  $queries = array(
    "DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'",
    "DELETE FROM {search_index} WHERE sid = %d AND type = '%s'",
    "DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'",
  );
  foreach ($queries as $query) {
    db_query($query, $sid, $type);
  }
}

273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
/**
 * Marks a word as dirty, or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words which are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark. When omitted (or null), nothing is marked
 *   and the collected list is returned instead.
 *
 * @return
 *   When called without a word: an array whose keys are the words marked so
 *   far during this request. Nothing when a word is passed.
 */
function search_dirty($word = null) {
  static $dirty = array();

  if ($word === null) {
    return $dirty;
  }

  $dirty[$word] = true;
}

Kjartan's avatar
Kjartan committed
288
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty).
 */
function search_cron() {
  // Let every module update its part of the word index.
  foreach (module_list() as $module) {
    module_invoke($module, 'update_index');
  }

  // Recompute the IDF (Inverse Document Frequency) count of every word that
  // was added or changed during this run.
  foreach (array_keys(search_dirty()) as $word) {
    // Total score of the word across the index.
    $sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    // Apply Zipf's law to equalize the probability distribution
    $weight = log10(1 + 1/(max(1, $sum)));

    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $weight, $word);
    if (db_affected_rows()) {
      continue;
    }
    // The update touched no row, so the word is inserted instead.
    db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $weight);
  }

  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($row = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $row->realword);
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * The steps are applied in a fixed order: entity decoding, lowercasing,
 * external preprocessing, optional CJK expansion, joining of numbers that are
 * separated by punctuation, removal of dots/underscores/dashes and finally
 * replacing every run of excluded characters with a single space.
 *
 * @param $text
 *   The text to simplify.
 *
 * @return
 *   The simplified text, with words separated by spaces.
 */
function search_simplify($text) {
  // Patterns built from the character class constants.
  $cjk_run = '/['. PREG_CLASS_CJK .']+/u';
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $numeric_joiner = '/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u';
  $boundary = '/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u';

  // Decode entities to UTF-8, then lowercase.
  $text = drupal_strtolower(decode_entities($text));

  // Call an external processor for word handling ($text is passed by
  // reference).
  search_preprocess($text);

  // Simple CJK handling
  if (variable_get('overlap_cjk', true)) {
    $text = preg_replace_callback($cjk_run, 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, a group of numerical characters separated only by
  // punctuation characters is considered to be one piece. This also means
  // that searching for e.g. '20/03/1984' also returns results with
  // '20-03-1984' in them.
  $text = preg_replace($numeric_joiner, '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, all punctuation, marks, spacers,
  // etc. are considered to be a word boundary.
  return preg_replace($boundary, ' ', $text);
}

/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * sequences of characters ('minimum_word_size' long).
 *
 * Used as the preg_replace_callback() callback in search_simplify().
 *
 * @param $matches
 *   The match array; only the full match in $matches[0] is used.
 *
 * @return
 *   The generated tokens, separated and surrounded by spaces. Strings that
 *   are not longer than 'minimum_word_size' are returned unsplit, padded with
 *   a space on each side.
 */
function search_expand_cjk($matches) {
  $size = variable_get('minimum_word_size', 3);
  $remaining = $matches[0];
  $length = drupal_strlen($remaining);

  // Passthrough short words
  if ($length <= $size) {
    return ' '. $remaining .' ';
  }

  $output = ' ';
  // FIFO queue holding the characters of the token being built.
  $window = array();
  $position = 0;
  while ($position < $length) {
    // Take the first character, then drop exactly as many bytes as strlen()
    // reports for it from the front of the string.
    $char = drupal_substr($remaining, 0, 1);
    $remaining = substr($remaining, strlen($char));
    $window[] = $char;

    // Once the queue is full, emit it and slide it forward by one character.
    if ($position >= $size - 1) {
      $output .= implode('', $window) .' ';
      array_shift($window);
    }
    $position++;
  }

  return $output;
}

/**
 * Splits a string into tokens for indexing.
 *
 * The result of the most recent call is remembered, so calling this function
 * again with the identical input returns the stored result without
 * simplifying the text a second time.
 *
 * @param $text
 *   The text to split.
 *
 * @return
 *   An array of words: the simplified text split on spaces, each word
 *   truncated by _search_index_truncate().
 */
function search_index_split($text) {
  static $last = null;
  static $lastsplit = null;

  // The cache is keyed on the raw input and compared strictly, so that an
  // empty first input or numeric look-alikes (e.g. '1e2' and '100') cannot
  // produce a false hit.
  if ($last !== null && $last === $text) {
    return $lastsplit;
  }

  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result, keyed on the unmodified input.
  $last = $text;
  $lastsplit = $words;

  return $words;
}

407
/**
 * Helper function for array_walk in search_index_split.
 *
 * @param $text
 *   The word to shorten, passed by reference and overwritten with the result
 *   of truncate_utf8($text, 50).
 */
function _search_index_truncate(&$text) {
  $shortened = truncate_utf8($text, 50);
  $text = $shortened;
}

414
415
416
417
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, passed by reference. It is replaced in turn by
 *   the return value of every implementing module, each module receiving the
 *   output of the previous one.
 */
function search_preprocess(&$text) {
  $implementations = module_implements('search_preprocess');
  foreach ($implementations as $module) {
    $processed = module_invoke($module, 'search_preprocess', $text);
    $text = $processed;
  }
}

/**
 * Update the full-text search index for a particular item.
 *
 * The HTML is reduced to the tags listed in $tags and split into alternating
 * text and tag parts. Every word is scored according to the tags that enclose
 * it. Words inside a link to a node of this site are credited to the linked
 * node instead of the item itself. Finally the item's previous index entries
 * are wiped and the new ones are written.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  global $base_url;

  $minimum_word_size = variable_get('minimum_word_size', 3);

  // Link matching
  $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/)?(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 25,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 3,
                'b' => 3,
                'i' => 3,
                'strong' => 3,
                'em' => 3,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to
  $linktitle = ''; // Title of the node the current link points to
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  $results = array(0 => array()); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag? substr() is used rather than a string offset
      // so that an empty tag name does not trigger an offset error.
      if (substr($tagname, 0, 1) === '/') {
        $tagname = (string)substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. Tag names without a
          // multiplier count as zero.
          $closed = array_shift($tagstack);
          $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
        }
        if ($tagname == 'a') {
          $link = false;
        }
      }
      else {
        // An empty stack is represented by an empty string, so only an empty
        // tag name compares equal to it.
        $top = count($tagstack) ? $tagstack[0] : '';
        if ($top == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score. Tag names without a
          // multiplier count as zero.
          array_unshift($tagstack, $tagname);
          $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
        }
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
                // The link may point to a node that does not exist, in which
                // case no row is returned and the link is not followed.
                if ($node && filter_format_allowcache($node->format)) {
                  $link = true;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        // Check to see if the node link text is its URL. If so, we use the target node title instead.
        if ($link && preg_match('!^https?://!i', $value)) {
          $value = $linktitle;
        }
        $words = search_index_split($value);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word .' ';
          $num = is_numeric($word);
          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {
            // Normalize numbers
            if ($num) {
              $word = (int)ltrim($word, '-0');
            }

            if ($link) {
              // Words inside a link are credited to the linked node.
              if (!isset($results[$linknid])) {
                $results[$linknid] = array();
              }
              if (!isset($results[$linknid][$word])) {
                $results[$linknid][$word] = 0;
              }
              $results[$linknid][$word] += $score * $focus;
            }
            else {
              if (!isset($results[0][$word])) {
                $results[0][$word] = 0;
              }
              $results[0][$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique words up to this point.
              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
            }
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  // Remove the item's previous index entries before writing the new ones.
  search_wipe($sid, $type);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The name of the option (the part before the colon). It is matched
 *   literally and case-insensitively.
 *
 * @return
 *   The value of the first occurrence of the option (possibly an empty
 *   string), or NULL when the query does not contain the option.
 */
function search_query_extract($keys, $option) {
  // The option name is quoted so that characters such as '.' or '+' in it are
  // not interpreted as regular expression syntax.
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
  return NULL;
}

/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Existing occurrences of the option are removed first, whatever their value
 * (including an empty value or '0').
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The name of the option (the part before the colon), matched literally and
 *   case-insensitively.
 * @param $value
 *   (optional) The new value. When empty, the option is only removed.
 *
 * @return
 *   The modified query string.
 */
function search_query_insert($keys, $option, $value = '') {
  // Test for the presence of the option itself rather than for a truthy
  // value, so that occurrences such as 'type:0' or 'type:' are replaced too.
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  if (preg_match($pattern, $keys)) {
    $keys = trim(preg_replace($pattern, '', $keys));
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}

/**
 * Parse a search query into SQL conditions.
 *
 * We build a query that matches the dataset bodies.
 *
 * @param $text
 *   The search query string.
 *
 * @return
 *   NULL if the query contains no tokens. Otherwise an array with:
 *   - 0: the "d.data [NOT] LIKE ..." conditions, joined with AND (groups of
 *        OR'd keywords are parenthesized).
 *   - 1: the arguments (keywords/phrases) for the placeholders in element 0.
 *   - 2: "i.word = '%s'" conditions joined with OR, one per entry of
 *        element 3.
 *   - 3: the words collected by _search_parse_query() for the index lookup.
 *   - 4: the number of index words an item must match at least.
 */
function search_parse_query($text) {
  $keys = array('positive' => array(), 'negative' => array());

  // Tokenize query string
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);

  if (count($matches) < 1) {
    return NULL;
  }

  // Classify tokens
  $or = false;
  foreach ($matches as $match) {
    $phrase = false;
    // Strip off phrase quotes. The tokenizer guarantees a non-empty token, so
    // offset 0 always exists.
    if ($match[2][0] == '"') {
      $match[2] = substr($match[2], 1, -1);
      $phrase = true;
    }
    // Simplify keyword according to indexing rules and external preprocessors
    $words = search_simplify($match[2]);
    // Re-explode in case simplification added more words, except when matching a phrase
    $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
    // Negative matches
    if ($match[1] == '-') {
      $keys['negative'] = array_merge($keys['negative'], $words);
    }
    // OR operator: instead of a single keyword, we store an array of all
    // OR'd keywords.
    elseif ($match[2] == 'OR' && count($keys['positive'])) {
      $last = array_pop($keys['positive']);
      // Starting a new OR?
      if (!is_array($last)) {
        $last = array($last);
      }
      $keys['positive'][] = $last;
      $or = true;
      // Skip the flag reset below so the next token joins this OR group.
      continue;
    }
    // Plain keyword
    else {
      if ($or) {
        // Add to last element (which is an array)
        $last_index = count($keys['positive']) - 1;
        $keys['positive'][$last_index] = array_merge($keys['positive'][$last_index], $words);
      }
      else {
        $keys['positive'] = array_merge($keys['positive'], $words);
      }
    }
    $or = false;
  }

  // Convert keywords into SQL statements.
  $query = array();
  $arguments = array();
  $arguments2 = array();
  $matches = 0;
  // Positive matches
  foreach ($keys['positive'] as $key) {
    // Group of ORed terms
    if (is_array($key) && count($key)) {
      $queryor = array();
      $any = false;
      foreach ($key as $term) {
        list($q, $count) = _search_parse_query($term, $arguments2);
        $any = $any || ($count > 0);
        if ($q) {
          $queryor[] = $q;
          $arguments[] = $term;
        }
      }
      if (count($queryor)) {
        $query[] = '('. implode(' OR ', $queryor) .')';
        // A group of OR keywords only needs to match once
        $matches += $any ? 1 : 0;
      }
    }
    // Single ANDed term
    else {
      list($q, $count) = _search_parse_query($key, $arguments2);
      if ($q) {
        $query[] = $q;
        $arguments[] = $key;
        // Each AND keyword needs to match at least once
        $matches += $count;
      }
    }
  }
  // Negative matches
  foreach ($keys['negative'] as $key) {
    list($q) = _search_parse_query($key, $arguments2, true);
    if ($q) {
      $query[] = $q;
      $arguments[] = $key;
    }
  }
  $query = implode(' AND ', $query);

  // Build word-index conditions for the first pass. The trailing ' OR ' of the
  // repeated snippet is cut off.
  $query2 = substr(str_repeat("i.word = '%s' OR ", count($arguments2)), 0, -4);

  return array($query, $arguments, $query2, $arguments2, $matches);
}

/**
 * Helper function for search_parse_query();
 *
 * @param $word
 *   A keyword or phrase (words separated by single spaces). Taken by
 *   reference but not modified here.
 * @param $scores
 *   Array of index words collected so far, keyed by word. For positive terms,
 *   every new word that is numeric or at least 'minimum_word_size' characters
 *   long is added to it.
 * @param $not
 *   TRUE for a negative term; no index words are collected in that case.
 *
 * @return
 *   An array of the "d.data [NOT] LIKE" SQL snippet for this term and the
 *   number of words newly added to $scores.
 */
function _search_parse_query(&$word, &$scores, $not = false) {
  $count = 0;
  // Determine the scorewords of this word/phrase
  if (!$not) {
    $split = explode(' ', $word);
    foreach ($split as $s) {
      $num = is_numeric($s);
      if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 3)) {
        // Numbers lose their sign and leading zeros and are stored as integers.
        $s = $num ? ((int)ltrim($s, '-0')) : $s;
        if (!isset($scores[$s])) {
          $scores[$s] = $s;
          $count++;
        }
      }
    }
  }
  // Return matching snippet and number of added words
  return array("d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'", $count);
}

Kjartan's avatar
Kjartan committed
756
/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that supports the
 * indexed search (and thus, implements hook_update_index()).
 *
 * Two queries are performed which can be extended by the caller.
 *
 * The first query selects a set of possible matches based on the search index
 * and any extra given restrictions. This is the classic "OR" search.
 *
 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
 * FROM {search_index} i
 * INNER JOIN {search_total} t ON i.word = t.word
 * $join1
 * WHERE $where1 AND (...)
 * GROUP BY i.type, i.sid
 *
 * The second query further refines this set by verifying advanced text
 * conditions (such as AND, negative or phrase matches), and orders the results
 * on the column or expression 'score':
 *
 * SELECT i.type, i.sid, $select2
 * FROM temp_search_sids i
 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
 * $join2
 * WHERE (...)
 * ORDER BY score DESC
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *
 * @param $join1
 *   (optional) Inserted into the JOIN part of the first SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where1
 *   (optional) Inserted into the WHERE part of the first SQL query.
 *   For example "(n.status > %d)".
 *
 * @param $arguments1
 *   (optional) Extra SQL arguments belonging to the first query.
 *
 * @param $select2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
 *   a column selected as 'score'.
 *   defaults to 'i.relevance AS score'
 *
 * @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
 *   For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
 *
 * @param $arguments2
 *   (optional) Extra SQL arguments belonging to the second query parameter.
 *
 * @return
 *   An array of SIDs for the search results.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array()) {
  $query = search_parse_query($keywords);

  // search_parse_query() returns NULL for a query without tokens; test for
  // that before reading any offset of the result.
  $no_index_words = ($query === NULL || $query[2] == '');
  if ($no_index_words) {
    form_set_error('keys', t('You must include at least one positive keyword with %count characters or more.', array('%count' => variable_get('minimum_word_size', 3))));
  }
  if ($no_index_words || $query[0] == '') {
    return array();
  }

  // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
  // 'matches' is used to reject those items that cannot possibly match the query.
  $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
  // The last argument, $query[4], fills the %d of the HAVING clause.
  $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
  db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d", $arguments, 'temp_search_sids');

  // Calculate maximum relevance, to normalize it
  $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
  if (!$normalize) {
    return array();
  }
  $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);

  // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
  $conditions = '('. $query[0] .')';
  $arguments = array_merge($arguments2, $query[1]);
  db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions ORDER BY score DESC", $arguments, 'temp_search_results');
  if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
    return array();
  }
  // The total is already known, so the pager's count query just selects it.
  $count_query = "SELECT $count";

  // Do actual search query
  $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query, $arguments);
  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item->sid;
  }
  return $results;
}

860
861
862
863
864
865
866
867
868
869
/**
 * Helper function for grabbing search keys.
 *
 * @return
 *   The search keys: everything after the second slash of $_GET['q'] if there
 *   is one, otherwise $_REQUEST['keys'], or NULL if that is not set either.
 */
function search_get_keys() {
  // Extract keys as remainder of path
  // Note: support old GET format of searches for existing links.
  $path = explode('/', $_GET['q'], 3);
  if (count($path) == 3) {
    return $path[2];
  }
  // Check with isset() so a request without 'keys' does not raise an
  // undefined index notice.
  return isset($_REQUEST['keys']) ? $_REQUEST['keys'] : NULL;
}

Dries's avatar
   
Dries committed
870
871
872
873
/**
 * Menu callback; presents the search form and/or search results.
 *
 * @return
 *   The search form followed by the rendered results (if a search was run)
 *   when the user has the 'search content' permission. Otherwise
 *   drupal_access_denied() is called and nothing is returned.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (isset($_POST['op'])) {
    if ($type == '') {
      $type = 'node';
    }
    // Let the module rewrite the submitted keys; fall back to the raw keys
    // when it returns nothing.
    $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']);
    drupal_goto('search/'. $type .'/'. (is_null($keys) ? $_POST['edit']['keys'] : $keys));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Stays empty when no search is run, so it can always be appended below.
    $results = '';

    // Only perform search if there is non-whitespace search term. Compare
    // against '' so that the keyword "0" is not mistaken for an empty query.
    if (trim($keys) != '') {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. $type .'/'. $keys)
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    return $output;
  }
  else {
    drupal_access_denied();
  }
}

931
932
933
934
935
936
937
938
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. When empty, url('search/'. $type) is used.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for. Must be the name of a module
 *   which implements hook_search().
 *   NOTE(review): the default is NULL, not 'node'. With NULL the default
 *   action is url('search/') and the module's 'form' operation is invoked
 *   with a NULL module name -- confirm this is intended.
 * @param $prompt
 *   A piece of text to put before the form (e.g. "Enter your keywords").
 *   When NULL, t('Enter your keywords') is used. An empty prompt shrinks the
 *   keyword field from 40 to 20 characters.
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {

  if (!$action) {
    $action = url('search/'. $type);
  }
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $form = array();
  $form['#action'] = $action;
  $form['#attributes'] = array('class' => 'search-form');
  $form['basic'] = array('#type' => 'item', '#title' => $prompt);
  $form['basic']['inline'] = array('#type' => 'markup', '#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
  $form['basic']['inline']['keys'] = array('#type' => 'textfield', '#title' => '', '#default_value' => $keys, '#size' => $prompt ? 40 : 20, '#maxlength' => 255);
  $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));

  // Let the module add its own elements to the form.
  $form_module = module_invoke($type, 'search', 'form', $keys);
  if (isset($form_module) && is_array($form_module)) {
    $form = array_merge($form, $form_module);
  }

  return drupal_get_form('search_form', $form);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keys. When NULL, no search is performed.
 * @param $type
 *   The name of the module whose hook_search() implementation performs the
 *   search. Defaults to 'node'.
 *
 * @return
 *   The themed result list followed by a pager, or an empty string when the
 *   module does not implement hook_search() or returns no results.
 */
function search_data($keys = NULL, $type = 'node') {
  $output = '';

  if (isset($keys)) {
    if (module_hook($type, 'search')) {
      $results = module_invoke($type, 'search', 'search', $keys);
      if (isset($results) && is_array($results) && count($results)) {
        $output .= '<dl class="search-results">';
        foreach ($results as $entry) {
          $output .= theme('search_item', $entry, $type);
        }
        $output .= '</dl>';
        $output .= theme('pager', NULL, 15, 0);
      }
    }
  }

  return $output;
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
1027
 *   A string containing a search query.
1028
1029
1030
1031
1032
1033
1034
1035
 *
 * @param $text
 *   The text to extract fragments from.
 *
 * @return
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text) {
Steven Wittens's avatar
Steven Wittens committed
1036
1037
1038
  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))';

1039
1040
1041
1042
1043
  // Extract positive keywords and phrases
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' '. $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text
Steven Wittens's avatar
Steven Wittens committed
1044
  $text = ' '. strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)) .' ';
1045
1046
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;
1047
1048
1049
1050

  // Extract a fragment per keyword for at most 4 keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
1051
  // If the sum of all fragments is too short, we look for second occurrences.
1052
1053
1054
  $ranges = array();
  $included = array();
  $length = 0;
1055
1056
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
1057
      if (strlen($key) == 0) {
1058
        unset($workkeys[$k]);
1059
        unset($keys[$k]);
1060
1061
1062
1063
1064
        continue;
      }
      if ($length >= 256) {
        break;
      }
1065
      // Remember occurrence of key so we can skip over it if more occurrences
1066
1067
1068
1069
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
1070
1071
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
Steven Wittens's avatar
Steven Wittens committed
1072
      if (preg_match('/'. $boundary . $key . $boundary .'/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
1073
        $p = $match[0][1];
1074
1075
1076
1077
1078
1079
1080
1081
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== false) {
          $end = substr($text, $p, 80);
          if (($s = strrpos($end, ' ')) !== false) {
            $ranges[$q] = $p + $s;
            $length += $p + $s - $q;
            $included[$key] = $p + 1;
          }
          else {
1082
            unset($workkeys[$k]);
1083
1084
1085
          }
        }
        else {
1086
          unset($workkeys[$k]);
1087
1088
1089
        }
      }
      else {
1090
        unset($workkeys[$k]);
1091
1092
      }
    }
1093
  }
1094

1095
1096
1097
  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($text, 256) . ' ...';
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to</