search.module 42.8 KB
Newer Older
Dries's avatar
   
Dries committed
1
<?php
2
// $Id$
Dries's avatar
   
Dries committed
3

Dries's avatar
   
Dries committed
4
5
6
7
8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
10
 * Matches Unicode character classes to exclude from the search index.
Steven Wittens's avatar
Steven Wittens committed
11
 *
12
13
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
14
 * The index only contains the following character classes:
15
16
17
18
19
20
21
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lo     Letter, Other
 * Nd     Number, Decimal Digit
 * No     Number, Other
 */
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// The value is the inside of a PCRE character class, without the enclosing
// brackets. search_simplify() wraps it in [...]+ with the /u modifier and
// replaces every run of matching characters with a single space.
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
'\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
'\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
'\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
'\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
50
51

/**
Steven Wittens's avatar
Steven Wittens committed
52
 * Matches all 'N' Unicode character classes (numbers)
53
 */
54
55
56
57
58
59
60
61
62
63
// The value is the inside of a PCRE character class, without the enclosing
// brackets. search_simplify() uses it (with the /u modifier) to find groups
// of numerical characters that are separated only by punctuation.
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
64
65

/**
Steven Wittens's avatar
Steven Wittens committed
66
 * Matches all 'P' Unicode character classes (punctuation)
67
 */
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// The value is the inside of a PCRE character class, without the enclosing
// brackets. search_simplify() uses it (with the /u modifier) to drop
// punctuation that sits between two numerical characters.
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');

/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 * Contains kana and BMP ideographs.
 */
// The value is the inside of a PCRE character class, without the enclosing
// brackets. search_simplify() matches runs of these characters (with the /u
// modifier) and hands each run to search_expand_cjk().
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
92

Dries's avatar
   
Dries committed
93
94
95
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 * @return
 *   The translated help text for $section, or nothing when this module has no
 *   text for the requested section.
 */
function search_help($section) {
  // Full help page for the module.
  if ($section == 'admin/help#search') {
    $output = '<p>'. t('The search module adds the ability to search for content by keywords.  Search is often the only practical way to find content on a large site.  Search is useful for finding users and posts by searching on keywords.') .'</p>';
    $output .= '<p>'. t('The search engine works by maintaining an index of the words in your site\'s content. It indexes the posts and users.  You can adjust the settings to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.  The index percentage sets the maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.') .'</p>';
    $output .= t('<p>You can</p>
<ul>
<li>read about how your site uses cron in the <a href="%admin-help-system">administer &gt;&gt; help &gt;&gt; system</a>.</li>
<li>run your <a href="%file-cron">cron.php</a>.</li>
<li>read about <a href="%external-http-drupal-org-node-23714">configuring cron jobs</a>.</li>
<li><a href="%admin-settings-search">administer &gt;&gt; settings &gt;&gt; search</a>.</ul>
', array('%admin-help-system' => url('admin/help/system'), '%file-cron' => 'cron.php', '%external-http-drupal-org-node-23714' => 'http://drupal.org/node/23714', '%admin-settings-search' => url('admin/settings/search')));
    $output .= '<p>'. t('For more information please read the configuration and customization handbook <a href="%search">Search page</a>.', array('%search' => 'http://www.drupal.org/handbook/modules/search/')) .'</p>';
    return $output;
  }

  // One-line description shown in the module list.
  if ($section == 'admin/modules#description') {
    return t('Enables site-wide keyword searching.');
  }

  // Introduction shown above the search settings form.
  if ($section == 'admin/settings/search') {
    return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
  }

  // Hints shown when a search returns no results.
  if ($section == 'search#noresults') {
    return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
</ul></p>');
  }
}
Kjartan's avatar
Kjartan committed
124
125

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by the search module.
 */
function search_perm() {
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'administer search';
  return $permissions;
}

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   'list' to describe the available blocks, 'view' to render one.
 * @param $delta
 *   The block to act on. Not used: this module defines a single block.
 * @return
 *   For 'list', the block descriptions. For 'view', the search form block,
 *   provided the user may search and is not already on a search page.
 *   Nothing otherwise.
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks = array();
    $blocks[0]['info'] = t('Search form');
    return $blocks;
  }
  else if ($op == 'view' && user_access('search content') && arg(0) != 'search') {
    $block = array();
    $block['content'] = search_form('', '', null, '');
    $block['subject'] = t('Search');
    return $block;
  }
}

Dries's avatar
   
Dries committed
148
149
150
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   When true, the cacheable items (search page and index wipe callback) are
 *   returned. Otherwise, on search pages only, one local task per searchable
 *   module is returned.
 * @return
 *   An array of menu items.
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    $items[] = array(
      'path' => 'search',
      'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM
    );
    $items[] = array(
      'path' => 'admin/settings/search/wipe',
      'title' => t('Clear index'),
      'callback' => 'search_wipe_confirm',
      'access' => user_access('administer search'),
      'type' => MENU_CALLBACK
    );
    return $items;
  }

  if (arg(0) != 'search') {
    return $items;
  }

  // The current keywords are appended to every tab path so that they are
  // remembered when the user switches between search tabs.
  $suffix = search_get_keys();
  $suffix = strlen($suffix) ? '/'. drupal_urlencode($suffix) : '';

  foreach (module_list() as $module) {
    if (!module_hook($module, 'search')) {
      continue;
    }
    $tab_title = module_invoke($module, 'search', 'name');
    if (!$tab_title) {
      continue;
    }
    $items[] = array(
      'path' => 'search/'. $module . $suffix,
      'title' => $tab_title,
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_LOCAL_TASK
    );
  }

  return $items;
}

/**
 * Validation handler for the search settings form.
 *
 * Redirects to the wipe confirmation page when the 're-index' button was
 * pressed, and wipes the index when a setting that affects indexing changed.
 *
 * @param $form_id
 *   The form identifier (not used).
 * @param $form
 *   The submitted form values.
 */
function search_settings_form_validate($form_id, &$form) {
  $reindex_label = t('Re-index site');
  if ($_POST['op'] == $reindex_label) {
    drupal_goto('admin/settings/search/wipe');
  }

  // A different minimum word size or CJK setting invalidates the existing
  // index, so it has to be rebuilt.
  $rebuild = variable_get('minimum_word_size', 3) != $form['minimum_word_size'];
  if (!$rebuild) {
    $rebuild = variable_get('overlap_cjk', true) != $form['overlap_cjk'];
  }

  if ($rebuild) {
    drupal_set_message(t('The index will be rebuilt.'));
    search_wipe();
  }
}
Dries's avatar
   
Dries committed
196

197
198
199
200
/**
 * Menu callback; displays the search module settings page.
 *
 * @return
 *   The settings form array: an indexing status fieldset, the indexing
 *   throttle, the indexing settings, and whatever the modules return for
 *   hook_search('admin').
 */
function search_settings() {
  // Add up the indexing progress reported by every searchable module.
  $left = 0;
  $all = 0;
  foreach (module_list() as $name) {
    if (!module_hook($name, 'search')) {
      continue;
    }
    $report = module_invoke($name, 'search', 'status');
    $left += $report['remaining'];
    $all += $report['total'];
  }

  $count = format_plural($left, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max(1, ...) avoids dividing by zero when there is nothing to index.
  $done = (int)min(100, 100 * ($all - $left) / max(1, $all));
  $percentage = $done . '%';
  $status = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';

  $form = array();

  // Indexing status:
  $form['status'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing status'),
    'status' => array('#type' => 'markup', '#value' => $status),
    'wipe' => array('#type' => 'submit', '#value' => t('Re-index site'))
  );

  $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));

  // Indexing throttle:
  $form['indexing_throttle'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing throttle'),
    'search_cron_limit' => array(
      '#type' => 'select',
      '#title' => t('Items to index per cron run'),
      '#default_value' => variable_get('search_cron_limit', 100),
      '#options' => $items,
      '#description' => t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.')
    )
  );

  // Indexing settings:
  $form['indexing_settings'] = array(
    '#type' => 'fieldset',
    '#title' => t('Indexing settings'),
    'info' => array(
      '#type' => 'markup',
      '#value' => '<em>'. t('<p>Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>'
    ),
    'minimum_word_size' => array(
      '#type' => 'textfield',
      '#title' => t('Minimum word length to index'),
      '#default_value' => variable_get('minimum_word_size', 3),
      '#size' => 5,
      '#maxlength' => 3,
      '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).')
    ),
    'remove_short' => array(
      '#type' => 'textfield',
      '#title' => t('Minimum word length to search for'),
      '#default_value' => variable_get('remove_short', 3),
      '#size' => 5,
      '#maxlength' => 3,
      '#description' => t('The number of characters a word has to be to be searched for, including wildcard characters.')
    ),
    'overlap_cjk' => array(
      '#type' => 'checkbox',
      '#title' => t('Simple CJK handling'),
      '#default_value' => variable_get('overlap_cjk', true),
      '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.')
    )
  );

  // Per module settings
  $module_settings = module_invoke_all('search', 'admin');
  $form = array_merge($form, $module_settings);

  return $form;
}

235
236
237
238
239
/**
 * Menu callback: confirm wiping of the index.
 *
 * @return
 *   The confirmation form built by confirm_form().
 */
function search_wipe_confirm() {
  // The confirmation form needs no extra elements of its own.
  $form = array();
  // Cancelling returns to the search settings page, which is where the
  // 're-index' button lives and where a confirmed wipe redirects to as well.
  return confirm_form('search_wipe_confirm', $form, t('Are you sure you want to re-index the site?'),
                  'admin/settings/search', t(' The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed. This action cannot be undone.'), t('Re-index site'), t('Cancel'));
}

/**
 * Submit handler for the wipe confirmation form.
 *
 * When the form was confirmed, wipes the index, reports that it will be
 * rebuilt and returns to the search settings page.
 *
 * @param $form_id
 *   The form identifier (not used).
 * @param $form
 *   The submitted form values.
 */
function search_wipe_confirm_submit($form_id, &$form) {
  if (!$form['confirm']) {
    return;
  }
  search_wipe();
  drupal_set_message(t('The index will be rebuilt.'));
  drupal_goto('admin/settings/search');
}

Dries's avatar
Dries committed
254
/**
 * Wipes a part of or the entire search index.
 *
 * Without arguments, every module is asked to reset its index through
 * hook_search('reset'). Otherwise only the rows belonging to the given item
 * are deleted, including index entries that originate from it.
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  if ($type != NULL || $sid != NULL) {
    // Wipe a single item: its dataset row, its own index rows, and the index
    // rows it contributed to other items.
    db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
    return;
  }

  module_invoke_all('search', 'reset');
}

274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/**
 * Marks a word as dirty, or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words which are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. When omitted, nothing is marked.
 * @return
 *   When called without a word: an array whose keys are the dirty words.
 *   Nothing when a word is being marked.
 */
function search_dirty($word = null) {
  // Kept for the duration of the request.
  static $flagged = array();

  if ($word === null) {
    return $flagged;
  }
  $flagged[$word] = true;
}

Kjartan's avatar
Kjartan committed
289
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules, recomputes the totals of the
 * words marked dirty during indexing (see search_dirty()), and removes totals
 * for words that no longer occur in the index.
 */
function search_cron() {
  // Update word index
  $modules = module_list();
  foreach ($modules as $module) {
    module_invoke($module, 'update_index');
  }

  // Update word IDF (Inverse Document Frequency) counts for new/changed words
  foreach (array_keys(search_dirty()) as $word) {
    // Get total count
    $sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    // Apply Zipf's law to equalize the probability distribution
    $weight = log10(1 + 1/(max(1, $sum)));
    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $weight, $word);
    if (db_affected_rows()) {
      continue;
    }
    // Nothing was updated, so add a new row for this word.
    db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $weight);
  }

  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($row = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $row->realword);
  }
}

/**
 * Simplifies a string according to indexing rules.
 *
 * @param $text
 *   The text to simplify.
 * @return
 *   The text with entities decoded, lowercased, run through the preprocess
 *   hook, optionally CJK-tokenized, and with excluded characters turned into
 *   spaces.
 */
function search_simplify($text) {
  // Decode entities to UTF-8, then lowercase.
  $text = drupal_strtolower(decode_entities($text));

  // Call an external processor for word handling.
  search_preprocess($text);

  // Simple CJK handling
  if (variable_get('overlap_cjk', true)) {
    $cjk_runs = '/['. PREG_CLASS_CJK .']+/u';
    $text = preg_replace_callback($cjk_runs, 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $numeric_groups = '/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u';
  $text = preg_replace($numeric_groups, '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $boundaries = '/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u';
  return preg_replace($boundaries, ' ', $text);
}

/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * sequences of characters ('minimum_word_size' long).
 *
 * @param $matches
 *   The match array handed over by preg_replace_callback(); only the full
 *   match, $matches[0], is used.
 * @return
 *   The matched string itself when it is no longer than the minimum word
 *   size; otherwise the overlapping sequences, separated and surrounded by
 *   spaces.
 */
function search_expand_cjk($matches) {
  $size = variable_get('minimum_word_size', 3);
  $remainder = $matches[0];
  $length = drupal_strlen($remainder);

  // Passthrough short words
  if ($length <= $size) {
    return $remainder;
  }

  // Sliding window holding the most recent characters.
  $window = array();
  $output = ' ';
  $position = 0;
  while ($position < $length) {
    // Take one (multi-byte) character off the front of the string.
    $char = drupal_substr($remainder, 0, 1);
    $remainder = substr($remainder, strlen($char));
    array_push($window, $char);
    // Once the window is full, emit it and drop its oldest character.
    if ($position >= $size - 1) {
      $output .= implode('', $window) .' ';
      array_shift($window);
    }
    $position++;
  }
  return $output;
}

/**
 * Splits a string into tokens for indexing.
 *
 * The result of the most recent call is remembered, so that splitting the
 * same text twice in a row does not repeat the work.
 *
 * @param $text
 *   The text to split.
 * @return
 *   An array of simplified words, each truncated by _search_index_truncate().
 */
function search_index_split($text) {
  static $last = NULL;
  static $lastsplit = NULL;

  // Compare strictly: with a loose comparison the initial NULL equals an
  // empty string, and the very first call would return NULL for ''.
  if ($last !== NULL && $last === $text) {
    return $lastsplit;
  }

  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result. The key is the text as it was passed in, not
  // its simplified form, because that is what the next call is compared to.
  $last = $text;
  $lastsplit = $words;

  return $words;
}

408
/**
 * Helper function for array_walk in search_index_split.
 *
 * Shortens a single word in place using truncate_utf8().
 *
 * @param $text
 *   The word to truncate, passed by reference.
 */
function _search_index_truncate(&$text) {
  $limit = 50;
  $text = truncate_utf8($text, $limit);
}

415
416
417
418
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * Each implementing module receives the output of the previous one.
 *
 * @param $text
 *   The text to preprocess, passed by reference and modified in place.
 */
function search_preprocess(&$text) {
  $implementors = module_implements('search_preprocess');
  foreach ($implementors as $implementor) {
    $text = module_invoke($implementor, 'search_preprocess', $text);
  }
}

/**
 * Update the full-text search index for a particular item.
 *
 * The text is split into HTML tags and plain text. Words are scored by the
 * tags that enclose them, and words inside links to nodes on this site are
 * additionally indexed for the linked node. Existing index data for the item
 * is wiped before the new data is inserted.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  // The default must match the one used by search_settings() and
  // search_expand_cjk(). A larger default here would drop the tokens the CJK
  // tokenizer produces when the setting has never been saved.
  $minimum_word_size = variable_get('minimum_word_size', 3);

  // Link matching
  global $base_url;
  $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/)?(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 25,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 3,
                'b' => 3,
                'i' => 3,
                'strong' => 3,
                'em' => 3,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to
  $linktitle = ''; // Title of the node the current link points to
  $score = 1; // Starting score per word
  $accum = ' '; // Accumulator for cleaned up data
  $tagstack = array(); // Stack with open tags
  $tagwords = 0; // Counter for consecutive words
  $focus = 1; // Focus state

  $results = array(0 => array()); // Accumulator for words for index

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = drupal_strtolower($tagname);
      // Closing or opening tag?
      if (substr($tagname, 0, 1) == '/') {
        $tagname = substr($tagname, 1);
        // If we encounter unexpected tags, reset score to avoid incorrect boosting.
        if (!count($tagstack) || $tagstack[0] != $tagname) {
          $tagstack = array();
          $score = 1;
        }
        else {
          // Remove from tag stack and decrement score
          $score = max(1, $score - $tags[array_shift($tagstack)]);
        }
        if ($tagname == 'a') {
          $link = false;
        }
      }
      else {
        // Look at the innermost open tag without reading past an empty stack.
        $open = count($tagstack) ? $tagstack[0] : NULL;
        if ($open == $tagname) {
          // None of the tags we look for make sense when nested identically.
          // If they are, it's probably broken HTML.
          $tagstack = array();
          $score = 1;
        }
        else {
          // Add to open tag stack and increment score
          array_unshift($tagstack, $tagname);
          $score += $tags[$tagname];
        }
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                // Note: ignore links to uncachable nodes to avoid redirect bugs.
                $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
                // Skip links whose target could not be loaded instead of
                // reading properties from a failed fetch.
                if ($node && filter_format_allowcache($node->format)) {
                  $link = true;
                  $linktitle = $node->title;
                }
              }
            }
          }
        }
      }
      // A tag change occurred, reset counter.
      $tagwords = 0;
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        if ($link) {
          // Check to see if the node link text is its URL. If so, we use the target node title instead.
          if (preg_match('!^https?://!i', $value)) {
            $value = $linktitle;
          }
        }
        $words = search_index_split($value);
        foreach ($words as $word) {
          // Add word to accumulator
          $accum .= $word .' ';
          $num = is_numeric($word);
          // Check wordlength
          if ($num || drupal_strlen($word) >= $minimum_word_size) {
            // Normalize numbers
            if ($num) {
              $word = (int)ltrim($word, '-0');
            }

            if ($link) {
              if (!isset($results[$linknid])) {
                $results[$linknid] = array();
              }
              if (!isset($results[$linknid][$word])) {
                $results[$linknid][$word] = 0;
              }
              $results[$linknid][$word] += $score * $focus;
            }
            else {
              if (!isset($results[0][$word])) {
                $results[0][$word] = 0;
              }
              $results[0][$word] += $score * $focus;
              // Focus is a decaying value in terms of the amount of unique words up to this point.
              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
            }
          }
          $tagwords++;
          // Too many words inside a single tag probably mean a tag was accidentally left open.
          if (count($tagstack) && $tagwords >= 15) {
            $tagstack = array();
            $score = 1;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert cleaned up data into dataset
  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %d)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query.
 * @param $option
 *   The name of the option to look for. It is matched literally and
 *   case-insensitively.
 * @return
 *   The value following 'option:' (possibly an empty string), or NULL when
 *   the option does not occur in the query.
 */
function search_query_extract($keys, $option) {
  // Quote the option name so that characters such as '.' are not treated as
  // regular expression syntax.
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
  return NULL;
}

/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Any existing occurrence of the option is removed first, also when its
 * current value is empty or '0'.
 *
 * @param $keys
 *   The search query.
 * @param $option
 *   The name of the option. It is matched literally and case-insensitively.
 * @param $value
 *   (optional) The new value. When empty, the option is only removed.
 * @return
 *   The modified query.
 */
function search_query_insert($keys, $option, $value = '') {
  // Quote the option name so that characters such as '.' are not treated as
  // regular expression syntax.
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  $stripped = preg_replace($pattern, '', $keys);
  // Only touch the query when the option was really present, so that a query
  // without the option keeps its original whitespace.
  if (is_string($stripped) && $stripped !== (string)$keys) {
    $keys = trim($stripped);
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}

/**
 * Parse a search query into SQL conditions.
 *
 * We build a query that matches the dataset bodies.
 *
 * @param $text
 *   The search query. Tokens are separated by spaces; a token may be a quoted
 *   phrase, may be prefixed with '-' to negate it, and the bare token OR
 *   combines the keywords on either side of it into one group.
 *
 * @return
 *   NULL when the query contains no tokens. Otherwise an array with:
 *   - 0: SQL conditions on d.data, joined with AND.
 *   - 1: The arguments for those conditions.
 *   - 2: SQL conditions on i.word, joined with OR.
 *   - 3: The arguments for those conditions (the score words).
 *   - 4: The number of positive keywords/groups, used by the caller as the
 *        minimal number of index matches per item.
 */
function search_parse_query($text) {
  $keys = array('positive' => array(), 'negative' => array());

  // Tokenize query string
  preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);

  if (count($matches) < 1) {
    return NULL;
  }

  // Classify tokens
  $or = false;
  foreach ($matches as $match) {
    $phrase = false;
    // Strip off quotes. Square brackets are used for the string offset because
    // the curly brace form is no longer valid in current PHP versions.
    if ($match[2][0] == '"') {
      $match[2] = substr($match[2], 1, -1);
      $phrase = true;
    }
    // Simplify keyword according to indexing rules
    $words = search_simplify($match[2]);
    // Re-explode in case simplification added more words, except when matching a phrase
    $words = $phrase ? array($words) : explode(' ', $words);
    // Negative matches
    if ($match[1] == '-') {
      $keys['negative'] = array_merge($keys['negative'], $words);
    }
    // OR operator: instead of a single keyword, we store an array of all
    // OR'd keywords.
    elseif ($match[2] == 'OR' && count($keys['positive'])) {
      $last = array_pop($keys['positive']);
      // Only wrap a plain keyword. When the previous element is already an OR
      // group ("a OR b OR c"), extend it instead of nesting it in a new array.
      if (!is_array($last)) {
        $last = array($last);
      }
      $keys['positive'][] = $last;
      $or = true;
      continue;
    }
    // Plain keyword
    else {
      if ($or) {
        // Add to last element (which is an array)
        $last_index = count($keys['positive']) - 1;
        $keys['positive'][$last_index] = array_merge($keys['positive'][$last_index], $words);
      }
      else {
        $keys['positive'] = array_merge($keys['positive'], $words);
      }
    }
    $or = false;
  }

  // Convert keywords into SQL statements.
  $scorewords = array();
  $query = array();
  $query2 = array();
  $arguments = array();
  $arguments2 = array();
  $matches = 0; // Counts the minimal number of words per item must match in the index.
  // Positive matches
  foreach ($keys['positive'] as $key) {
    // Group of ORed terms
    if (is_array($key) && count($key)) {
      $queryor = array();
      foreach ($key as $term) {
        $q = _search_parse_query($term, $scorewords);
        if ($q) {
          $queryor[] = $q;
          $arguments[] = $term;
        }
      }
      if (count($queryor)) {
        $query[] = '('. implode(' OR ', $queryor) .')';
      }
    }
    // Single ANDed term
    else {
      $q = _search_parse_query($key, $scorewords);
      if ($q) {
        $query[] = $q;
        $arguments[] = $key;
      }
    }
    $matches++;
  }
  // Negative matches
  foreach ($keys['negative'] as $key) {
    $q = _search_parse_query($key, $scorewords, true);
    if ($q) {
      $query[] = $q;
      $arguments[] = $key;
    }
  }
  // We separate word-index conditions because they are not needed in the
  // counting query.
  foreach ($scorewords as $word) {
    $query2[] = "i.word = '%s'";
    $arguments2[] = $word;
  }
  $query = implode(' AND ', $query);
  $query2 = implode(' OR ', $query2);
  return array($query, $arguments, $query2, $arguments2, $matches);
}

/**
 * Helper function for search_parse_query();
 *
 * @param $word
 *   A single keyword, or a phrase of space-separated words.
 * @param $scores
 *   Array of score words, passed by reference. For positive matches, each
 *   piece of $word that is numeric or at least 'minimum_word_size' characters
 *   long is appended to it.
 * @param $not
 *   TRUE when $word is a negative match; no score words are collected then.
 *
 * @return
 *   An SQL LIKE (or NOT LIKE) condition on d.data containing one '%s'
 *   placeholder for the word.
 */
function _search_parse_query(&$word, &$scores, $not = false) {
  // Determine the scorewords of this word/phrase
  if (!$not) {
    $split = explode(' ', $word);
    foreach ($split as $s) {
      $num = is_numeric($s);
      if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 4)) {
        // Normalize the current piece ($s), not the whole phrase: using $word
        // here would yield the same (wrong) number for every numeric piece.
        $scores[] = $num ? ((int)ltrim($s, '-0')) : $s;
      }
    }
  }
  // Return matching snippet
  return "d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'";
}

Kjartan's avatar
Kjartan committed
745
/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that support the
 * indexed search (and thus, implements hook_update_index()).
 *
 * Two queries are performed which can be extended by the caller.
 *
 * The first query selects a set of possible matches based on the search index
 * and any extra given restrictions. This is the classic "OR" search.
 *
 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
 * FROM {search_index} i
 * INNER JOIN {search_total} t ON i.word = t.word
 * $join1
 * WHERE $where1 AND (...)
 * GROUP BY i.type, i.sid
 *
 * The second query further refines this set by verifying advanced text
 * conditions (such as AND, negative or phrase matches), and orders the results
 * on the column or expression 'score':
 *
 * SELECT i.type, i.sid, $select2
 * FROM temp_search_sids i
 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
 * $join2
 * WHERE (...)
 * ORDER BY score DESC
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *
 * @param $join1
 *   (optional) Inserted into the JOIN part of the first SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where1
 *   (optional) Inserted into the WHERE part of the first SQL query.
 *   For example "(n.status > %d)".
 *
 * @param $arguments1
 *   (optional) Extra SQL arguments belonging to the first query.
 *
 * @param $select2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
 *   a column selected as 'score'. Any occurrence of 'i.relevance' in it is
 *   replaced by the relevance normalized against the maximum relevance found.
 *   Defaults to 'i.relevance AS score'.
 *
 * @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
 *   For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
 *
 * @param $arguments2
 *   (optional) Extra SQL arguments belonging to the second query parameter.
 *
 * @return
 *   An array of SIDs for the search results, as returned by a pager query
 *   with a limit of 10 items. Empty when the keywords yield no usable
 *   conditions or nothing matches.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array()) {
  $query = search_parse_query($keywords);

  // Nothing to search for: no tokens, no dataset conditions or no index words.
  if ($query === NULL || $query[0] == '' || $query[2] == '') {
    return array();
  }

  // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
  // 'matches' is used to reject those items that cannot possibly match the query.
  $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
  // Argument order follows the placeholders: caller's, index words, type and
  // finally the minimum match count for the HAVING clause.
  $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
  $result = db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d", $arguments, 'temp_search_sids');

  // Calculate maximum relevance, to normalize it
  $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
  if (!$normalize) {
    return array();
  }
  $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);

  // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
  $conditions = '('. $query[0] .')';
  $arguments = array_merge($arguments2, $query[1]);
  $result = db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions ORDER BY score DESC", $arguments, 'temp_search_results');
  if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
    return array();
  }
  // The total is already known, so the pager's count query is a constant.
  $count_query = "SELECT $count";

  // Do actual search query
  $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query, $arguments);
  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item->sid;
  }
  return $results;
}

846
847
848
849
850
851
852
853
854
855
/**
 * Helper function for grabbing search keys.
 *
 * @return
 *   The part of $_GET['q'] after its second slash when present; otherwise
 *   $_REQUEST['keys'], or NULL when that is not set either.
 */
function search_get_keys() {
  // Extract keys as remainder of path
  // Note: support old GET format of searches for existing links.
  $q = isset($_GET['q']) ? $_GET['q'] : '';
  $path = explode('/', $q, 3);
  if (count($path) == 3) {
    return $path[2];
  }
  // Guard the lookup so a request without 'keys' does not raise a notice.
  return isset($_REQUEST['keys']) ? $_REQUEST['keys'] : NULL;
}

Dries's avatar
   
Dries committed
856
857
858
859
/**
 * Menu callback; presents the search form and/or search results.
 *
 * @return
 *   The search form followed by the rendered results (if any) when the user
 *   has the 'search content' permission. Otherwise drupal_access_denied() is
 *   called and nothing is returned.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (isset($_POST['op'])) {
    if ($type == '') {
      $type = 'node';
    }
    // Let the module rewrite the submitted keys; fall back to the raw keys
    // when it returns nothing.
    $keys = module_invoke($type, 'search', 'post', $_POST['edit']['keys']);
    drupal_goto('search/'. drupal_urlencode($type) .'/'. drupal_urlencode(is_null($keys) ? $_POST['edit']['keys'] : $keys));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Start with no results so the form can be shown on its own when no
    // keywords were given.
    $results = '';

    // Only perform search if there is non-whitespace search term:
    if (trim($keys)) {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. drupal_urlencode($type) .'/'. drupal_urlencode($keys))
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    return $output;
  }
  else {
    drupal_access_denied();
  }
}

917
918
919
920
921
922
923
924
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. When empty, the URL of 'search/' followed by $type is used.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for. Must be the name of a module
 *   which implements hook_search(); its 'form' operation may contribute extra
 *   form elements.
 *   NOTE(review): the parameter default is NULL and this function does not
 *   substitute 'node' for it; callers are expected to pass a type.
 * @param $prompt
 *   A piece of text to put before the form (e.g. "Enter your keywords").
 *   When NULL, the translated text 'Enter your keywords' is used.
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {

  if (!$action) {
    $action = url('search/'. $type);
  }
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $form = array();
  $form['#action'] = $action;
  $form['#attributes'] = array('class' => 'search-form');
  $form['basic'] = array('#type' => 'item', '#title' => $prompt);
  $form['basic']['inline'] = array('#type' => 'markup', '#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
  // The keyword field is wider when a prompt is shown.
  $form['basic']['inline']['keys'] = array('#type' => 'textfield', '#title' => '', '#default_value' => $keys, '#size' => $prompt ? 40 : 30, '#maxlength' => 255);
  $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));

  // Let the module owning this search type add its own form elements.
  $form_module = module_invoke($type, 'search', 'form', $keys);
  if (is_array($form_module)) {
    $form = array_merge($form, $form_module);
  }

  return drupal_get_form('search_form', $form);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keywords. When NULL, no search is performed.
 * @param $type
 *   The name of the module whose hook_search() implementation performs the
 *   search. Defaults to 'node'.
 *
 * @return
 *   An HTML string with the themed result items followed by a pager, or an
 *   empty string when there are no keys, the module does not implement
 *   hook_search(), or the search returns no results.
 */
function search_data($keys = NULL, $type = 'node') {
  $output = '';

  if (isset($keys)) {
    if (module_hook($type, 'search')) {
      $results = module_invoke($type, 'search', 'search', $keys);
      if (is_array($results) && count($results)) {
        $output .= '<dl class="search-results">';
        foreach ($results as $entry) {
          $output .= theme('search_item', $entry, $type);
        }
        $output .= '</dl>';
        $output .= theme('pager', NULL, 15, 0);
      }
    }
  }

  return $output;
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
 *   A string containing a search query.
 *
 * @param $text
 *   The text to extract fragments from.
 *
 * @return
 *   A string containing HTML for the excerpt. Matched keywords are wrapped in
 *   <strong> tags. When no keyword is found, the beginning of the text is
 *   returned, truncated to 256 and followed by ' ...'.
 */
function search_excerpt($keys, $text) {
  // Extract positive keywords and phrases. The lookahead rejects only the bare
  // OR operator, so keywords that merely start with "OR" are still extracted.
  preg_match_all('/ ("([^"]+)"|(?!OR(?: |$))([^" ]+))/', ' '. $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text
  $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;

  // Extract a fragment per keyword for at most 4 keywords.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
  // If the sum of all fragments is too short, we look for second occurrences.
  $ranges = array();
  $included = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      // Empty entries come from the unused alternative of the pattern above;
      // drop them so they do not end up in the highlight expression.
      if (strlen($key) == 0) {
        unset($workkeys[$k]);
        unset($keys[$k]);
        continue;
      }
      if ($length >= 256) {
        break;
      }
      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
      if (preg_match('/\b'. $key .'\b/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== false) {
          $end = substr($text, $p, 80);
          if (($s = strrpos($end, ' ')) !== false) {
            $ranges[$q] = $p + $s;
            $length += $p + $s - $q;
            // Continue the next lookup for this key just past this occurrence.
            $included[$key] = $p + 1;
          }
          else {
            unset($workkeys[$k]);
          }
        }
        else {
          unset($workkeys[$k]);
        }
      }
      else {
        unset($workkeys[$k]);
      }
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($text, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }
  // A leading ellipsis is added unless the first range starts at offset 0.
  $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  $text = preg_replace('/\b('. implode('|', $keys) .')\b/iu', '<strong>\0</strong>', $text);
  return $text;
}

1117
1118
1119
1120
/**
 * @} End of "defgroup search".
 */

1121
1122
1123
/**
 * Helper function for array_walk in search_excerpt.
 *
 * Quotes regular expression characters (including the '/' delimiter) in a
 * keyword so it can be embedded in a pattern.
 *
 * @param $text
 *   The keyword, passed by reference and modified in place.
 */
function _search_excerpt_replace(&$text) {
  $text = preg_quote($text, '/');
}

/**
 * Format a single result entry of a search query.
 *
 * Modules may implement hook_search_item() in order to override this default
 * function to display search results.
 *
 * @param $item
 *   A single search result as returned by hook_search(). The result should be
 *   an array with keys "link", "title", "type", "user", "date", and "snippet".
 *   Optionally, "extra" can be an array of extra info to show along with the
 *   result. Missing or empty "type", "user", "date", "snippet" and "extra"
 *   entries are skipped.
 * @param $type
 *   The type of item found, such as "user" or "node".
 *
 * @return
 *   The output of the module's hook_search_item() when it implements one,
 *   otherwise a <dt>/<dd> pair with the linked title, the snippet and an info
 *   line.
 *
 * @ingroup themeable
 */
function theme_search_item($item, $type) {
  if (module_hook($type, 'search_item')) {
    $output = module_invoke($type, 'search_item', $item);
  }
  else {
    $output = ' <dt class="title"><a href="'. check_url($item['link']) .'">'. check_plain($item['title']) .'</a></dt>';
    // Collect the pieces of the info line. The keys are checked with empty()
    // and isset() so that results lacking them do not raise notices.
    $info = array();
    if (!empty($item['type'])) {
      $info[] = $item['type'];
    }
    if (!empty($item['user'])) {
      $info[] = $item['user'];
    }
    if (!empty($item['date'])) {
      $info[] = format_date($item['date'], 'small');
    }
    if (isset($item['extra']) && is_array($item['extra'])) {
      $info = array_merge($info, $item['extra']);
    }
    $output .= ' <dd>'. (!empty($item['snippet']) ? '<p>'. $item['snippet'] . '</p>' : '') . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';
  }

  return $output;
}