<?php
// $Id$

/**
 * @file
 * Enables site-wide keyword searching.
 */

/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 *
 * The value is the body of a regular expression character class, without the
 * surrounding brackets. It must be wrapped in [...] and used with the 'u'
 * (UTF-8) pattern modifier, as search_simplify() does.
 */
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
50
51

/**
 * Matches all 'N' Unicode character classes (numbers).
 *
 * The value is the body of a regular expression character class, without the
 * surrounding brackets. It must be wrapped in [...] and used with the 'u'
 * (UTF-8) pattern modifier, as search_simplify() does.
 */
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
64
65

/**
 * Matches all 'P' Unicode character classes (punctuation).
 *
 * The value is the body of a regular expression character class, without the
 * surrounding brackets. It must be wrapped in [...] and used with the 'u'
 * (UTF-8) pattern modifier, as search_simplify() does.
 */
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');

/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 * Contains kana and BMP ideographs.
 *
 * The value is the body of a regular expression character class, without the
 * surrounding brackets. search_simplify() wraps it in [...] and applies the
 * 'u' (UTF-8) pattern modifier.
 */
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
92

Dries's avatar
   
Dries committed
93
94
95
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 * @return
 *   Translated help markup for the sections handled below; nothing for any
 *   other section.
 */
function search_help($section) {
  switch ($section) {
    case 'admin/help#search':
      $output = '<p>'. t('The search module adds the ability to search for content by keywords.  Search is often the only practical way to find content on a large site.  Search is useful for finding users and posts by searching on keywords.') .'</p>';
      $output .= '<p>'. t('The search engine works by maintaining an index of the words in your site\'s content. It indexes the posts and users.  You can adjust the settings to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.  The index percentage sets the maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.') .'</p>';
      // Note: the final list item previously lacked its closing </li>.
      $output .= t('<p>You can</p>
<ul>
<li>read about how your site uses cron in the <a href="%admin-help-system">administer &gt;&gt; help &gt;&gt; system</a>.</li>
<li>run your <a href="%file-cron">cron.php</a>.</li>
<li>read about <a href="%external-http-drupal-org-node-23714">configuring cron jobs</a>.</li>
<li><a href="%admin-settings-search">administer &gt;&gt; settings &gt;&gt; search</a>.</li></ul>
', array('%admin-help-system' => url('admin/help/system'), '%file-cron' => 'cron.php', '%external-http-drupal-org-node-23714' => 'http://drupal.org/node/23714', '%admin-settings-search' => url('admin/settings/search')));
      $output .= '<p>'. t('For more information please read the configuration and customization handbook <a href="%search">Search page</a>.', array('%search' => 'http://drupal.org/handbook/modules/search/')) .'</p>';
      return $output;
    case 'admin/modules#description':
      return t('Enables site-wide keyword searching.');
    case 'admin/settings/search':
      return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
    case 'search#noresults':
      return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
</ul></p>');
  }
}
Kjartan's avatar
Kjartan committed
124
125

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by the search module.
 */
function search_perm() {
  return array('search content', 'administer search');
}

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/**
 * Implementation of hook_block().
 *
 * Exposes a single block (delta 0) holding the search form. The block is only
 * rendered for users who may search content, and never on the search pages
 * themselves.
 *
 * @param $op
 *   The block operation: 'list' or 'view'.
 * @param $delta
 *   The block delta; unused, as only one block is defined.
 * @return
 *   For 'list', the block definitions; for 'view', the block content and
 *   subject when the block should be shown. Nothing otherwise.
 */
function search_block($op = 'list', $delta = 0) {
  switch ($op) {
    case 'list':
      return array(0 => array('info' => t('Search form')));

    case 'view':
      if (user_access('search content') && arg(0) != 'search') {
        return array(
          'content' => search_form('', '', null, ''),
          'subject' => t('Search')
        );
      }
  }
}

Dries's avatar
   
Dries committed
147
148
149
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   TRUE when cacheable items are requested (the search page and the index
 *   wipe confirmation); FALSE when dynamic items are requested (the
 *   per-module search tabs, built only while on a search page).
 * @return
 *   An array of menu items.
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    $items[] = array('path' => 'search', 'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM);
    $items[] = array('path' => 'admin/settings/search/wipe', 'title' => t('Clear index'),
      'callback' => 'search_wipe_confirm',
      'access' => user_access('administer search'),
      'type' => MENU_CALLBACK);
  }
  else if (arg(0) == 'search') {
    // To remember the user's search keywords when switching across tabs,
    // we dynamically add the keywords to the search tabs' paths.
    $keys = search_get_keys();
    $keys = strlen($keys) ? '/'. $keys : '';
    foreach (module_list() as $name) {
      // Only modules that implement hook_search() and report a tab name get
      // a tab.
      if (module_hook($name, 'search') && $title = module_invoke($name, 'search', 'name')) {
        $items[] = array('path' => 'search/'. $name . $keys, 'title' => $title,
          'callback' => 'search_view',
          'access' => user_access('search content'),
          'type' => MENU_LOCAL_TASK);
      }
    }
  }

  return $items;
}

/**
 * Validation handler for the search settings form.
 *
 * Redirects to the wipe confirmation page when the 're-index' button was
 * pressed, and wipes the index when a setting that affects indexing changed.
 *
 * @param $form_id
 *   The ID of the form being validated.
 * @param $form
 *   The submitted values, keyed by element name.
 */
function search_settings_form_validate($form_id, &$form) {
  // 'op' is absent when the form was not submitted through a button.
  if (isset($_POST['op']) && $_POST['op'] == t('Re-index site')) {
    drupal_goto('admin/settings/search/wipe');
  }
  // If these settings change, the index needs to be rebuilt.
  if ((variable_get('minimum_word_size', 3) != $form['minimum_word_size']) ||
      (variable_get('overlap_cjk', true) != $form['overlap_cjk'])) {
    drupal_set_message(t('The index will be rebuilt.'));
    search_wipe();
  }
}
Dries's avatar
   
Dries committed
195

196
197
198
199
/**
 * Menu callback; displays the search module settings page.
 *
 * @return
 *   The settings form array: indexing status, indexing throttle, indexing
 *   settings, followed by whatever the modules return for the 'admin'
 *   operation of hook_search().
 */
function search_settings() {
  // Collect some stats
  $remaining = 0;
  $total = 0;
  foreach (module_list() as $module) {
    if (module_hook($module, 'search')) {
      $module_status = module_invoke($module, 'search', 'status');
      $remaining += $module_status['remaining'];
      $total += $module_status['total'];
    }
  }
  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max(1, $total) avoids a division by zero when nothing is indexable.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
  $status = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';
  $form['status'] = array('#type' => 'fieldset', '#title' => t('Indexing status'));
  $form['status']['status'] = array('#type' => 'markup', '#value' => $status);
  $form['status']['wipe'] = array('#type' => 'submit', '#value' => t('Re-index site'));

  $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));

  // Indexing throttle:
  $form['indexing_throttle'] = array('#type' => 'fieldset', '#title' => t('Indexing throttle'));
  $form['indexing_throttle']['search_cron_limit'] = array('#type' => 'select', '#title' => t('Items to index per cron run'), '#default_value' => variable_get('search_cron_limit', 100), '#options' => $items, '#description' => t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'));
  // Indexing settings:
  $form['indexing_settings'] = array('#type' => 'fieldset', '#title' => t('Indexing settings'));
  $form['indexing_settings']['info'] = array('#type' => 'markup', '#value' => '<em>'. t('<p>Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>');
  $form['indexing_settings']['minimum_word_size'] = array('#type' => 'textfield', '#title' => t('Minimum word length to index'), '#default_value' => variable_get('minimum_word_size', 3), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'));
  $form['indexing_settings']['overlap_cjk'] = array('#type' => 'checkbox', '#title' => t('Simple CJK handling'), '#default_value' => variable_get('overlap_cjk', true), '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.'));

  // Per module settings
  $form = array_merge($form, module_invoke_all('search', 'admin'));
  return $form;
}

233
234
235
236
237
/**
 * Menu callback: confirm wiping of the index.
 *
 * @return
 *   The result of confirm_form() for the re-index confirmation.
 */
function search_wipe_confirm() {
  // No extra form elements are needed; pass an explicit empty form instead of
  // an undefined variable.
  $form = array();
  return confirm_form('search_wipe_confirm', $form, t('Are you sure you want to re-index the site?'),
                  'admin/settings/search', t(' The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed. This action cannot be undone.'), t('Re-index site'), t('Cancel'));
}

/**
 * Handler for wipe confirmation.
 *
 * @param $form_id
 *   The ID of the submitted form.
 * @param $form
 *   The submitted values; the index is only wiped when 'confirm' is set.
 * @return
 *   The path to redirect to after a confirmed wipe; nothing otherwise.
 */
function search_wipe_confirm_submit($form_id, &$form) {
  if ($form['confirm']) {
    search_wipe();
    drupal_set_message(t('The index will be rebuilt.'));
    return 'admin/settings/search';
  }
}

Dries's avatar
Dries committed
252
/**
 * Wipes a part of or the entire search index.
 *
 * Without arguments, every module is asked to reset its index through the
 * 'reset' operation of hook_search(). With arguments, the dataset and index
 * rows of that one item are deleted, including index rows that originate
 * from links inside the item.
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  if ($type == NULL && $sid == NULL) {
    module_invoke_all('search', 'reset');
  }
  else {
    db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
  }
}

272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/**
 * Records a word as dirty, or returns the words recorded so far.
 *
 * Used while indexing (cron). A dirty word has an outdated total count in the
 * search_total table and needs to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. Leave out to fetch the list.
 * @return
 *   When called without a word, an array whose keys are the dirty words.
 *   Nothing when a word is passed.
 */
function search_dirty($word = null) {
  static $dirty = array();

  // Getter mode: no word given.
  if ($word === null) {
    return $dirty;
  }

  // Setter mode: the word is the key, so duplicates collapse automatically.
  $dirty[$word] = true;
}

Kjartan's avatar
Kjartan committed
287
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty).
 */
function search_cron() {
  // We register a shutdown function to ensure that search_total is always up
  // to date.
  register_shutdown_function('search_update_totals');

  // Update word index
  foreach (module_list() as $module) {
    module_invoke($module, 'update_index');
  }
}

/**
 * This function is called on shutdown to ensure that search_total is always
 * up to date (even if cron times out or otherwise fails).
 *
 * Recounts every word reported by search_dirty() and removes words from
 * search_total that no longer occur in search_index.
 */
function search_update_totals() {
  // Update word IDF (Inverse Document Frequency) counts for new/changed words
  foreach (search_dirty() as $word => $dummy) {
    // Get total count
    $total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    // Apply Zipf's law to equalize the probability distribution
    $total = log10(1 + 1/(max(1, $total)));
    // Try to update an existing row first; insert when no row was affected.
    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $total, $word);
    if (!db_affected_rows()) {
      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $total);
    }
  }
  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $result = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($word = db_fetch_object($result)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $word->realword);
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * @param $text
 *   The text to simplify.
 * @return
 *   The lowercased text with entities decoded, preprocessors applied, CJK
 *   runs optionally expanded, and all excluded characters turned into
 *   single-space word boundaries.
 */
function search_simplify($text) {
  // Decode entities to UTF-8
  $text = decode_entities($text);

  // Lowercase
  $text = drupal_strtolower($text);

  // Call an external processor for word handling.
  search_preprocess($text);

  // Simple CJK handling
  if (variable_get('overlap_cjk', true)) {
    $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);

  return $text;
}

/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * sequences of characters ('minimum_word_size' long).
 *
 * Used as a preg_replace_callback() callback by search_simplify().
 *
 * @param $matches
 *   The regular expression match array; only the full match ($matches[0]) is
 *   used.
 * @return
 *   The space-separated tokens, with a leading and trailing space.
 */
function search_expand_cjk($matches) {
  $min = variable_get('minimum_word_size', 3);
  $str = $matches[0];
  $l = drupal_strlen($str);
  // Passthrough short words
  if ($l <= $min) {
    return ' '. $str .' ';
  }
  $tokens = ' ';
  // FIFO queue of characters
  $chars = array();
  // Begin loop
  for ($i = 0; $i < $l; ++$i) {
    // Grab next character
    $current = drupal_substr($str, 0, 1);
    // Drop that character from the front of the string. This is done by byte
    // length on purpose, as $current may be a multi-byte character.
    $str = substr($str, strlen($current));
    $chars[] = $current;
    // Once the queue holds $min characters, emit it as a token and slide the
    // window forward by one character.
    if ($i >= $min - 1) {
      $tokens .= implode('', $chars) .' ';
      array_shift($chars);
    }
  }
  return $tokens;
}

/**
 * Splits a string into tokens for indexing.
 *
 * The result of the most recent call is remembered, so that splitting the
 * same text twice in a row does the work only once.
 *
 * @param $text
 *   The text to split.
 * @return
 *   An array of simplified words, each truncated by
 *   _search_index_truncate().
 */
function search_index_split($text) {
  static $last = null;
  static $lastsplit = null;

  // The cache key is the raw input of the previous call. The comparison is
  // strict: a loose one treats null and '' as equal (returning NULL on the
  // very first call) and numeric-looking strings such as '1e3' and '1000' as
  // equal (returning the wrong split).
  if ($last === $text) {
    return $lastsplit;
  }
  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result
  $last = $text;
  $lastsplit = $words;

  return $words;
}

417
/**
 * Helper function for array_walk in search_index_split.
 *
 * @param $text
 *   The word to truncate, modified in place through truncate_utf8() with a
 *   limit of 50.
 */
function _search_index_truncate(&$text) {
  $text = truncate_utf8($text, 50);
}

424
425
426
427
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, modified in place. Each implementing module
 *   receives the output of the previous one.
 */
function search_preprocess(&$text) {
  foreach (module_implements('search_preprocess') as $module) {
    $text = module_invoke($module, 'search_preprocess', $text);
  }
}

/**
 * Update the full-text search index for a particular item.
 *
 * The text is reduced to the HTML tags that carry a score multiplier, split
 * into tags and plain text, and every sufficiently long (or numeric) word is
 * scored. Words inside a link to a node on this site are credited to the
 * linked node instead of to the item itself. The item's previous index
 * entries are wiped before the new ones are inserted.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  // Link matching
  global $base_url;
  $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/)?(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 25,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 3,
                'b' => 3,
                'i' => 3,
                'strong' => 3,
                'em' => 3,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  $results = array(0 => array()); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag?
      if (strpos($tagname, '/') === 0) {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score. Tag names without a
          // multiplier (e.g. malformed ones) count as zero.
          $closed = array_shift($tagstack);
          $score = max(1, $score - (isset($tags[$closed]) ? $tags[$closed] : 0));
        }
        if ($tagname == 'a') {
          $link = false;
        }
      }
      else {
        // Peek at the innermost open tag without touching a missing offset.
        $open = isset($tagstack[0]) ? $tagstack[0] : NULL;
        if ($open == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score
          array_unshift($tagstack, $tagname);
          $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
        }
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
                // Links to nodes that do not exist are ignored as well.
                if ($node && filter_format_allowcache($node->format)) {
                  $link = true;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        if ($link) {
          // Check to see if the node link text is its URL. If so, we use the target node title instead.
          if (preg_match('!^https?://!i', $value)) {
            $value = $linktitle;
          }
        }
        $words = search_index_split($value);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word .' ';
          $num = is_numeric($word);
          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {
            // Normalize numbers
            if ($num) {
              $word = (int)ltrim($word, '-0');
            }

            if ($link) {
              if (!isset($results[$linknid])) {
                $results[$linknid] = array();
              }
              if (!isset($results[$linknid][$word])) {
                $results[$linknid][$word] = 0;
              }
              $results[$linknid][$word] += $score * $focus;
            }
            else {
              if (!isset($results[0][$word])) {
                $results[0][$word] = 0;
              }
              $results[0][$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique words up to this point.
              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
            }
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The option name to look for (matched case-insensitively and literally).
 * @return
 *   The value following 'option:' up to the next space, or NULL when the
 *   option is not present in the query.
 */
function search_query_extract($keys, $option) {
  // Quote the option name so that regular expression metacharacters in it
  // (including the '/' delimiter) are matched literally.
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
  return NULL;
}

/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Any existing occurrence of the option is removed first, whatever its
 * current value (including '0' or an empty value).
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The option name (matched case-insensitively and literally).
 * @param $value
 *   (optional) The new value. When empty, the option is only removed.
 * @return
 *   The modified query string.
 */
function search_query_insert($keys, $option, $value = '') {
  // Quote the option name so that regular expression metacharacters in it
  // are matched literally.
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  // Test for presence rather than for a truthy value, so that 'type:0' and
  // 'type:' are replaced instead of being left behind as duplicates.
  if (preg_match($pattern, $keys)) {
    $keys = trim(preg_replace($pattern, '', $keys));
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}

/**
 * Parse a search query into SQL conditions.
 *
 * We build a query that matches the dataset bodies.
 *
 * @param $text
 *   The search query string. Tokens are separated by spaces; a token may be a
 *   double-quoted phrase, may be prefixed with '-' to negate it, and the
 *   literal token OR joins the keyword before it with the keyword after it.
 *
 * @return
 *   NULL when $text contains no tokens. Otherwise an array of:
 *   - 0: the dataset conditions, joined with ' AND '.
 *   - 1: the keys/phrases that are the arguments for those conditions.
 *   - 2: the word-index conditions ("i.word = '%s'" joined with ' OR ').
 *   - 3: the distinct words that are the arguments for the index conditions.
 *   - 4: the number of index matches required: the sum of the words added by
 *        each ANDed term, plus one for each OR group that added any word.
 */
function search_parse_query($text) {
  $keys = array('positive' => array(), 'negative' => array());

  // Tokenize query string
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);

  if (count($matches) < 1) {
    return NULL;
  }

  // Classify tokens
  $or = false;
  foreach ($matches as $match) {
    $phrase = false;
    // Strip off phrase quotes. substr() is used instead of the curly-brace
    // string offset syntax, which newer PHP versions no longer accept.
    if (substr($match[2], 0, 1) == '"') {
      $match[2] = substr($match[2], 1, -1);
      $phrase = true;
    }
    // Simplify keyword according to indexing rules and external preprocessors
    $words = search_simplify($match[2]);
    // Re-explode in case simplification added more words, except when matching a phrase
    $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
    // Negative matches
    if ($match[1] == '-') {
      $keys['negative'] = array_merge($keys['negative'], $words);
    }
    // OR operator: instead of a single keyword, we store an array of all
    // OR'd keywords.
    elseif ($match[2] == 'OR' && count($keys['positive'])) {
      $last = array_pop($keys['positive']);
      // Starting a new OR?
      if (!is_array($last)) {
        $last = array($last);
      }
      $keys['positive'][] = $last;
      $or = true;
      // Skip the reset below so the next token joins this OR group.
      continue;
    }
    // Plain keyword
    else {
      if ($or) {
        // Add to last element (which is an array)
        $keys['positive'][count($keys['positive']) - 1] = array_merge($keys['positive'][count($keys['positive']) - 1], $words);
      }
      else {
        $keys['positive'] = array_merge($keys['positive'], $words);
      }
    }
    $or = false;
  }

  // Convert keywords into SQL statements.
  $query = array();
  $arguments = array();
  $arguments2 = array();
  $matches = 0;
  // Positive matches
  foreach ($keys['positive'] as $key) {
    // Group of ORed terms
    if (is_array($key) && count($key)) {
      $queryor = array();
      $any = false;
      foreach ($key as $or) {
        list($q, $count) = _search_parse_query($or, $arguments2);
        $any |= $count;
        if ($q) {
          $queryor[] = $q;
          $arguments[] = $or;
        }
      }
      if (count($queryor)) {
        $query[] = '('. implode(' OR ', $queryor) .')';
        // A group of OR keywords only needs to match once
        $matches += ($any > 0);
      }
    }
    // Single ANDed term
    else {
      list($q, $count) = _search_parse_query($key, $arguments2);
      if ($q) {
        $query[] = $q;
        $arguments[] = $key;
        // Each AND keyword needs to match at least once
        $matches += $count;
      }
    }
  }
  // Negative matches
  foreach ($keys['negative'] as $key) {
    list($q) = _search_parse_query($key, $arguments2, true);
    if ($q) {
      $query[] = $q;
      $arguments[] = $key;
    }
  }
  $query = implode(' AND ', $query);

  // Build word-index conditions for the first pass
  $query2 = substr(str_repeat("i.word = '%s' OR ", count($arguments2)), 0, -4);

  return array($query, $arguments, $query2, $arguments2, $matches);
}

/**
 * Helper function for search_parse_query();
 *
 * Registers the score words of one key or phrase and returns the dataset
 * condition used to match it.
 *
 * @param $word
 *   The key or phrase, with words separated by single spaces.
 * @param $scores
 *   Array of score words collected so far, keyed by word. Words not yet
 *   present are added to it. Numeric words are stored as integers.
 * @param $not
 *   (optional) TRUE for a negated key: no score words are registered and the
 *   returned condition uses NOT LIKE.
 *
 * @return
 *   An array of the SQL condition snippet and the number of words newly added
 *   to $scores.
 */
function _search_parse_query(&$word, &$scores, $not = false) {
  $added = 0;

  // Negated keys contribute no score words.
  if (!$not) {
    foreach (explode(' ', $word) as $piece) {
      if (is_numeric($piece)) {
        // Numbers are always kept, whatever their length, in integer form.
        $piece = (int)ltrim($piece, '-0');
      }
      elseif (drupal_strlen($piece) < variable_get('minimum_word_size', 3)) {
        // Too short to be a score word.
        continue;
      }
      if (isset($scores[$piece])) {
        continue;
      }
      $scores[$piece] = $piece;
      $added++;
    }
  }

  // Return matching snippet and number of added words
  $negation = $not ? 'NOT ' : '';
  return array("d.data ". $negation ."LIKE '%% %s %%'", $added);
}

Kjartan's avatar
Kjartan committed
766
/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that supports the
 * indexed search (and thus, implements hook_update_index()).
 *
 * Two queries are performed which can be extended by the caller.
 *
 * The first query selects a set of possible matches based on the search index
 * and any extra given restrictions. This is the classic "OR" search.
 *
 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
 * FROM {search_index} i
 * INNER JOIN {search_total} t ON i.word = t.word
 * $join1
 * WHERE $where1 AND (...)
 * GROUP BY i.type, i.sid
 *
 * The second query further refines this set by verifying advanced text
 * conditions (such as AND, negative or phrase matches), and orders the results
 * on the column or expression 'score':
 *
 * SELECT i.type, i.sid, $select2
 * FROM temp_search_sids i
 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
 * $join2
 * WHERE (...)
 * ORDER BY score DESC
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *
 * @param $join1
 *   (optional) Inserted into the JOIN part of the first SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where1
 *   (optional) Inserted into the WHERE part of the first SQL query.
 *   For example "(n.status > %d)".
 *
 * @param $arguments1
 *   (optional) Extra SQL arguments belonging to the first query.
 *
 * @param $select2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
 *   a column selected as 'score'.
 *   defaults to 'i.relevance AS score'
 *
 * @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
 *   For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
 *
 * @param $arguments2
 *   (optional) Extra SQL arguments belonging to the second query parameter.
 *
 * @return
 *   An array of SIDs for the search results. Empty when the query has no
 *   usable keywords or nothing matches.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array()) {
  $query = search_parse_query($keywords);

  // search_parse_query() returns NULL for a query without tokens, so check
  // the type before indexing into the result.
  $has_index_terms = is_array($query) && $query[2] != '';
  if (!$has_index_terms) {
    form_set_error('keys', t('You must include at least one positive keyword with %count characters or more.', array('%count' => variable_get('minimum_word_size', 3))));
  }
  if (!$has_index_terms || $query[0] == '') {
    return array();
  }

  // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
  // 'matches' is used to reject those items that cannot possibly match the query.
  $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
  // The trailing $query[4] fills the %d of the HAVING clause.
  $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
  $result = db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d", $arguments, 'temp_search_sids');

  // Calculate maximum relevance, to normalize it
  $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
  if (!$normalize) {
    return array();
  }
  $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);

  // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
  $conditions = '('. $query[0] .')';
  $arguments = array_merge($arguments2, $query[1]);
  $result = db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions ORDER BY score DESC", $arguments, 'temp_search_results');
  if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
    return array();
  }
  // The total is already known, so the pager's count query just returns it.
  $count_query = "SELECT $count";

  // Do actual search query
  $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query, $arguments);
  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item->sid;
  }
  return $results;
}

870
871
872
873
874
875
876
877
878
879
/**
 * Helper function for grabbing search keys.
 *
 * @return
 *   The search keys: everything after the second slash of $_GET['q'], or
 *   else $_REQUEST['keys'] (the old GET format of searches). NULL when
 *   neither is present.
 */
function search_get_keys() {
  // Extract keys as remainder of path
  // Note: support old GET format of searches for existing links.
  $path = explode('/', isset($_GET['q']) ? $_GET['q'] : '', 3);
  if (count($path) == 3) {
    return $path[2];
  }
  // Guard the lookup so a request without keys does not raise a notice.
  return isset($_REQUEST['keys']) ? $_REQUEST['keys'] : NULL;
}

Dries's avatar
   
Dries committed
880
881
882
883
/**
 * Menu callback; presents the search form and/or search results.
 *
 * A POSTed search form is redirected to search/<type>/<keys>, and a request
 * without a search type is redirected to search/node.
 *
 * @return
 *   The search form followed by the results box, if a search was performed.
 *   Nothing is returned when access is denied.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (isset($_POST['edit']['keys'])) {
    if ($type == '') {
      $type = 'node';
    }
    $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']);
    drupal_goto('search/'. $type .'/'. (is_null($keys) ? $_POST['edit']['keys'] : $keys));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Stays empty when no search is performed, so it can always be appended
    // to the form below.
    $results = '';

    // Only perform search if there is non-whitespace search term:
    if (trim($keys)) {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. $type .'/'. $keys)
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    return $output;
  }
  else {
    drupal_access_denied();
  }
}

941
942
943
944
945
946
947
948
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. When empty, the URL of 'search/' followed by $type is used.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for: the name of a module which
 *   implements hook_search(). That module may contribute extra form elements
 *   through its 'form' search operation.
 * @param $prompt
 *   A piece of text to put before the form. When NULL, the translated
 *   "Enter your keywords" is used.
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {
  // Fall back to the search page of the given type and to the stock prompt.
  $target = $action ? $action : url('search/'. $type);
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  // Keyword field and submit button sit side by side in an inline container
  // below the prompt.
  $form = array(
    '#action' => $target,
    '#attributes' => array('class' => 'search-form'),
    'basic' => array(
      '#type' => 'item',
      '#title' => $prompt,
      'inline' => array(
        '#type' => 'markup',
        '#prefix' => '<div class="container-inline">',
        '#suffix' => '</div>',
        'keys' => array(
          '#type' => 'textfield',
          '#title' => '',
          '#default_value' => $keys,
          '#size' => $prompt ? 40 : 20,
          '#maxlength' => 255,
        ),
        'submit' => array(
          '#type' => 'submit',
          '#value' => t('Search'),
        ),
      ),
    ),
  );

  // Let the module that owns this search type add its own elements.
  $extra = module_invoke($type, 'search', 'form', $keys);
  if (is_array($extra)) {
    $form = array_merge($form, $extra);
  }

  return drupal_get_form('search_form', $form);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keys. When NULL, no search is performed.
 * @param $type
 *   The name of the module whose hook_search() performs the search.
 *
 * @return
 *   The themed result items wrapped in a <dl class="search-results"> list and
 *   followed by a pager, or an empty string when no search was performed or
 *   the search returned no results.
 */
function search_data($keys = NULL, $type = 'node') {
  // Nothing to do without keys or without a module able to search.
  if (!isset($keys) || !module_hook($type, 'search')) {
    return '';
  }

  $found = module_invoke($type, 'search', 'search', $keys);
  if (!is_array($found) || !count($found)) {
    return '';
  }

  $items = '';
  foreach ($found as $entry) {
    $items .= theme('search_item', $entry, $type);
  }

  return '<dl class="search-results">'. $items .'</dl>'. theme('pager', NULL, 15, 0);
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
1037
 *   A string containing a search query.
1038
1039
1040
1041
1042
1043
1044
1045
 *
 * @param $text
 *   The text to extract fragments from.
 *
 * @return
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text) {
Steven Wittens's avatar
Steven Wittens committed
1046
1047
1048
  // We highlight around non-indexable or CJK characters.
  $boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))';

1049
1050
1051
1052
1053
  // Extract positive keywords and phrases
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' '. $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text
Steven Wittens's avatar
Steven Wittens committed
1054
  $text = ' '. strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)) .' ';
1055
1056
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;
1057
1058
1059
1060

  // Extract a fragment per keyword for at most 4 keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
1061
  // If the sum of all fragments is too short, we look for second occurrences.
1062
1063
1064
  $ranges = array();
  $included = array();
  $length = 0;
1065
1066
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
1067
      if (strlen($key) == 0) {
1068
        unset($workkeys[$k]);
1069
        unset($keys[$k]);
1070
1071
1072
1073
1074
        continue;
      }
      if ($length >= 256) {
        break;
      }
1075
      // Remember occurrence of key so we can skip over it if more occurrences
1076
1077
1078
1079
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
1080
1081
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
Steven Wittens's avatar
Steven Wittens committed
1082
      if (preg_match('/'. $boundary . $key . $boundary .'/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
1083
        $p = $match[0][1];
1084
1085
1086
1087
1088
1089
1090
1091
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== false) {
          $end = substr($text, $p, 80);
          if (($s = strrpos($end, ' ')) !== false) {
            $ranges[$q] = $p + $s;
            $length += $p + $s - $q;
            $included[$key] = $p + 1;
          }
          else {
1092
            unset($workkeys[$k]);
1093
1094
1095
          }
        }
        else {
1096
          unset($workkeys[$k]);
1097
1098
1099
        }
      }
      else {
1100
        unset($workkeys[$k]);
1101
1102
      }
    }
1103
  }
1104