search.module 30.6 KB
Newer Older
Dries's avatar
 
Dries committed
1
<?php
2
// $Id$
Dries's avatar
 
Dries committed
3

Dries's avatar
   
Dries committed
4
5
6
7
8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lm     Letter, Modifier
 * Lo     Letter, Other
 * Mn     Mark, Nonspacing
 * Mc     Mark, Spacing Combining
 * Nd     Number, Decimal Digit
 * Nl     Number, Letter
 * No     Number, Other
 * Sm     Symbol, Math
 * Sc     Symbol, Currency
 * Sk     Symbol, Modifier
 * So     Symbol, Other
 *
 * All character classes not in the list above (enclosing marks, punctuation, control codes and spacers):
 * 'Me', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Cs', 'Co'
 *
 * The value is a bare list of \x{...} code points and ranges without the
 * surrounding brackets: it is spliced into a [...] character class of a
 * pattern that carries the /u modifier (see search_keywords_split()).
 */
define('PREG_CLASS_SEARCH_EXCLUDE', '\x{0}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{7f}-\x{a1}\x{ab}\x{ad}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{488}\x{489}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{600}-\x{603}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{6dd}\x{6de}\x{700}-\x{70d}\x{70f}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17b4}\x{17b5}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{180e}\x{1944}\x{1945}\x{2000}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{205f}-\x{2063}\x{206a}-\x{206f}\x{207d}\x{207e}\x{208d}\x{208e}\x{20dd}-\x{20e0}\x{20e2}-\x{20e4}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3000}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{d800}\x{db7f}\x{db80}\x{dbff}\x{dc00}\x{dfff}\x{e000}\x{f8ff}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{feff}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{fff9}-\x{fffb}\x{10100}\x{10101}\x{1039f}\x{1d173}-\x{1d17a}\x{e0001}\x{e0020}-\x{e007f}\x{f0000}\x{ffffd}\x{100000}');

/**
 * Matches all 'N' Unicode character classes (numbers)
 *
 * The value is a bare list of \x{...} code points and ranges without the
 * surrounding brackets: it is spliced into a [...] character class of a
 * pattern that carries the /u modifier (see search_keywords_split()).
 */
define('PREG_CLASS_NUMBERS', '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}\x{10107}-\x{10133}\x{10320}-\x{10323}\x{1034a}\x{104a0}-\x{104a9}\x{1d7ce}-\x{1d7ff}');

/**
 * Matches all 'P' Unicode character classes (punctuation)
 *
 * The value is a bare list of \x{...} code points and ranges without the
 * surrounding brackets: it is spliced into a [...] character class of a
 * pattern that carries the /u modifier (see search_keywords_split()).
 */
define('PREG_CLASS_PUNCTUATION', '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{10100}\x{10101}\x{1039f}');

Dries's avatar
   
Dries committed
45
46
47
48
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested. Handled values are
 *   'admin/modules#description', 'admin/settings/search' and
 *   'search#noresults'.
 * @return
 *   The translated help text for a handled section. Nothing is returned for
 *   any other section.
 */
function search_help($section = 'admin/help#search') {
  switch ($section) {
    case 'admin/modules#description':
      return t('Enables site-wide keyword searching.');
    case 'admin/settings/search':
      return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
    case 'search#noresults':
      // %number is filled with the configured minimum indexed word length.
      return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Try using wildcards: <em>walk*</em> matches <em>walker</em>, <em>walking</em>, ...</li>
<li>Use longer words (words shorter than %number letters are ignored).</li>
</ul></p>', array('%number' => variable_get('minimum_word_size', 3)));
  }
}
Kjartan's avatar
Kjartan committed
64
65

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by this module.
 */
function search_perm() {
  return array('search content', 'administer search');
}

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   'list' returns the block definitions, 'view' returns the rendered block.
 *   Any other operation returns nothing.
 * @param $delta
 *   Not referenced here; only block 0 is defined.
 * @return
 *   For 'list', an array describing the search form block. For 'view', an
 *   array with 'content' and 'subject' keys, but only when the user has the
 *   'search content' permission and the first path argument is not 'search'.
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks[0]['info'] = t('Search form');
    return $blocks;
  }
  else if ($op == 'view' && user_access('search content') && arg(0) != 'search') {
    // An explicit empty prompt is passed, so search_form() does not fall
    // back to its default prompt text.
    $block['content'] = search_form('', '', null, '');
    $block['subject'] = t('Search');
    return $block;
  }
}

Dries's avatar
   
Dries committed
88
89
90
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   When true, the static items (the search page and the search settings
 *   page) are returned. Otherwise, and only while the first path argument is
 *   'search', one local task per module that implements hook_search() is
 *   returned.
 * @return
 *   An array of menu item definitions; empty when neither case applies.
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    $items[] = array('path' => 'search', 'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM);

    // NOTE(review): guarded by 'administer site configuration' rather than
    // the 'administer search' permission from search_perm() - confirm intent.
    $items[] = array('path' => 'admin/settings/search', 'title' => t('search'),
      'callback' => 'search_admin',
      'type' => MENU_NORMAL_ITEM,
      'access' => user_access('administer site configuration'));
  }
  else if (arg(0) == 'search') {
    // To remember the user's search keywords when switching across tabs,
    // we dynamically add the keywords to the search tabs' paths.
    $keys = search_get_keys();
    $keys = strlen($keys) ? '/'. $keys : '';
    foreach (module_list() as $name) {
      // A module only gets a tab when its hook_search('name') returns a title.
      if (module_hook($name, 'search') && $title = module_invoke($name, 'search', 'name')) {
        $items[] = array('path' => 'search/'. $name . $keys, 'title' => $title,
          'callback' => 'search_view',
          'access' => user_access('search content'),
          'type' => MENU_LOCAL_TASK);
      }
    }
  }

  return $items;
}

123

Dries's avatar
   
Dries committed
124
125
126
/**
 * Menu callback; displays the search module settings page.
 *
 * On a POST the submitted settings are saved; if the minimum indexed word
 * length differs from the stored value, the index is flagged for rebuilding
 * first.
 *
 * @return
 *   The settings form as produced by system_settings_form().
 */
function search_admin() {
  if ($_POST) {
    // If the word length settings change, the index needs to be rebuilt.
    if (variable_get('minimum_word_size', 3) != $_POST['edit']['minimum_word_size']) {
      drupal_set_message(t('The index will be rebuilt.'));
      search_wipe();
    }
    // Saving happens in both cases, after any rebuild message has been set.
    system_settings_save();
  }

  // Collect some stats
  $remaining = 0;
  $total = 0;
  foreach (module_list() as $module) {
    if (module_hook($module, 'search')) {
      $status = module_invoke($module, 'search', 'status');
      $remaining += $status['remaining'];
      $total += $status['total'];
    }
  }
  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max(1, $total) avoids a division by zero; min() caps the value at 100.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
  $status = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';
  // The group title is translated like the other group titles below.
  $output = form_group(t('Indexing status'), $status);

  // Indexing throttle:
  $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));
  $group = form_select(t('Items to index per cron run'), 'search_cron_limit', variable_get('search_cron_limit', 100), $items, t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'));
  $output .= form_group(t('Indexing throttle'), $group);
  // Indexing settings:
  $group = '<em>'. t('<p>Changing the setting below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>';
  $group .= form_textfield(t('Minimum word length to index'), 'minimum_word_size', variable_get('minimum_word_size', 3), 3, 3, t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
  $group .= form_textfield(t('Minimum word length to search for'), 'remove_short', variable_get('remove_short', 3), 3, 3, t('The number of characters a word has to be to be searched for, including wildcard characters.'));
  $output .= form_group(t('Indexing settings'), $group);

  return system_settings_form($output);
}

Dries's avatar
Dries committed
168
/**
 * Wipes a part of or the entire search index.
 *
 * Without arguments, hook_search('reset') is invoked in all modules. With
 * arguments, the index rows of the given item are deleted, as well as the
 * rows that were recorded with that item as their source (fromsid/fromtype).
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  // Loose comparison on purpose: 0, '' and FALSE also count as "not given".
  if ($type == NULL && $sid == NULL) {
    module_invoke_all('search', 'reset');
  }
  else {
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
  }
}

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/**
 * Registers a dirty word, or hands back all dirty words registered so far.
 *
 * Used during indexing (cron): a dirty word is one whose total count in the
 * search_total table is outdated and has to be recounted.
 *
 * @param $word
 *   (optional) The word to flag. When omitted (null), nothing is flagged and
 *   the collected list is returned instead.
 * @return
 *   When called without a word, an array keyed by the dirty words (every
 *   value is true). Nothing is returned when a word is flagged.
 */
function search_dirty($word = null) {
  // The list lives for the duration of the request.
  static $dirty_words = array();

  if ($word === null) {
    return $dirty_words;
  }

  $dirty_words[$word] = true;
}

Kjartan's avatar
Kjartan committed
202
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty).
 */
function search_cron() {
  // Update word index
  foreach (module_list() as $module) {
    module_invoke($module, 'update_index');
  }
  // Update word counts for new/changed words
  foreach (search_dirty() as $word => $dummy) {
    $total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    db_query("UPDATE {search_total} SET count = %d WHERE word = '%s'", $total, $word);
    if (!db_affected_rows()) {
      // Note: affected rows does not count matching rows that already had the right value!
      $exists = db_result(db_query("SELECT COUNT(*) FROM {search_total} WHERE word = '%s'", $word));
      if (!$exists) {
        db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %d)", $word, $total);
      }
    }
  }
  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $result = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($word = db_fetch_object($result)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $word->realword);
  }
}

/**
 * Splits a string into component words according to indexing rules.
 *
 * The result of the most recent call is remembered, so that splitting the
 * same input twice in a row does not repeat the work.
 *
 * @param $text
 *   The text to split. Entities are decoded and hook_search_preprocess() is
 *   applied before splitting.
 * @return
 *   An array of words, each truncated to at most 50 characters. Empty strings
 *   can occur in the array (for example for leading or trailing separators).
 */
function search_keywords_split($text) {
  static $last = null;
  static $lastsplit = null;

  // Only trust the cache once it has been filled, and compare strictly: a
  // loose comparison would treat an empty first input as equal to the initial
  // null and return null instead of an array.
  if ($lastsplit !== null && $last === $text) {
    return $lastsplit;
  }

  // Remember the unprocessed input; it is the cache key for the next call.
  $original = $text;

  // Decode entities to UTF-8
  $text = decode_entities($text);

  // Call an external processor for word handling.
  search_preprocess($text);

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);

  // Process words
  $words = explode(' ', $text);
  array_walk($words, '_search_keywords_truncate');

  // Save last keyword result, keyed on the input as it was passed in.
  $last = $original;
  $lastsplit = $words;

  return $words;
}

278
279
280
281
/**
 * Helper function for array_walk in search_keywords_split.
 *
 * @param $text
 *   A single word, passed by reference; it is replaced by the result of
 *   truncate_utf8() with a limit of 50.
 */
function _search_keywords_truncate(&$text) {
  $text = truncate_utf8($text, 50);
}

285
286
/**
 * Produces a looser variant of a keyword string by wrapping terms in wildcards.
 *
 * Every space-separated term gets a leading and trailing '*'; runs of
 * consecutive asterisks are then collapsed into a single one.
 *
 * @param $text
 *   The keywords as entered by the user.
 * @return
 *   The loosened keywords when they differ from the (trimmed) input, or NULL
 *   when the query is already as loose as it can be.
 */
function search_keywords_variation($text) {
  $text = trim($text);

  // Turning each space into '* *' closes the previous term and opens the next.
  $wrapped = '*'. str_replace(' ', '* *', $text) .'*';
  $loosened = preg_replace('/\*+/', '*', $wrapped);

  if ($loosened == $text) {
    return NULL;
  }
  return $loosened;
}

300
301
302
303
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to process, passed by reference. It is replaced in turn by the
 *   return value of every implementing module; nothing is returned.
 */
function search_preprocess(&$text) {
  foreach (module_implements('search_preprocess') as $module) {
    $text = module_invoke($module, 'search_preprocess', $text);
  }
}

309

Kjartan's avatar
Kjartan committed
310
/**
 * Update the full-text search index for a particular item.
 *
 * The text is reduced to a small set of scoring tags, split into words, and
 * every sufficiently long word is scored according to the tags it sits in.
 * Words inside a link that resolves to a node/book path are additionally
 * credited to the linked node. The item's old index rows are wiped before the
 * new ones are inserted.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  global $base_url;
  // '!' is the pattern delimiter, so it has to be quoted in the base URL too.
  $node_regexp = '!href=[\'"]?(?:'. preg_quote($base_url, '!') .'/)?(?:\?q=)?([^\'">]+)[\'">]!i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 21,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 5,
                'b' => 5,
                'strong' => 5,
                'em' => 5,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to, once one was recognised
  $score = 1; // Starting score per word

  $results = array(0 => array());

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = strtolower($tagname);
      if (substr($tagname, 0, 1) == '/') {
        // Closing tag: take back the bonus of the tag, if it has one.
        $closed = substr($tagname, 1);
        if (isset($tags[$closed])) {
          $score -= $tags[$closed];
        }
        if ($score < 1) { // possible due to bad HTML
          $score = 1;
        }
        if ($tagname == '/a') {
          $link = false;
        }
      }
      else {
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                $link = true;
              }
            }
          }
        }
        // Opening tag: add the bonus of the tag, if it has one.
        if (isset($tags[$tagname])) {
          $score += $tags[$tagname];
        }
      }
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        $words = search_keywords_split($value);
        foreach ($words as $word) {
          // Check wordlength
          if (string_length($word) >= $minimum_word_size) {
            // Note: strtolower can be used because the value is only used internally.
            $word = strtolower($word);
            // Bucket 0 collects the item's own words; any other bucket is
            // keyed by the node id the surrounding link points to.
            $bucket = $link ? $linknid : 0;
            if (!isset($results[$bucket][$word])) {
              $results[$bucket][$word] = 0;
            }
            $results[$bucket][$word] += $score;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %d)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that support the
 * indexed search (and thus, implements hook_update_index()).
 *
 * The final query is an SQL select on the search_index table. As a guide for
 * writing the optional extra SQL fragments (see below), use this query:
 *
 * SELECT i.type, i.sid, i.word, SUM(i.score/t.count) AS score
 * FROM {search_index} i
 * $join INNER JOIN {search_total} t ON i.word = t.word
 * WHERE $where AND (i.word = '...' OR ...)
 * GROUP BY i.type, i.sid
 * ORDER BY score DESC
 *
 * If nothing is found, the search is retried once with wildcards added to
 * every keyword (see search_keywords_variation()).
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module. It is not referenced by the
 *   query built here; it is only forwarded to the looser retry.
 *
 * @param $join
 *   (optional) A string to be inserted into the JOIN part of the SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where
 *   (optional) A string to be inserted into the WHERE part of the SQL query.
 *   For example "(n.status > 0)".
 *
 * @param $variation
 *   Used internally. Must not be specified.
 *
 * @return
 *   An array of SIDs for the search results.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join = '', $where = '1', $variation = true) {
  // UTF-8 bytes of U+FFFD (Replacement character), spelled as an escape so
  // the placeholder survives any re-encoding of this file.
  $wildcard = "\xEF\xBF\xBD";

  // Note, we replace the wildcards with U+FFFD (Replacement character) to pass
  // through the keyword extractor. Multiple wildcards are collapsed into one.
  $keys = preg_replace('!\*+!', $wildcard, $keywords);

  // Split into words
  $keys = search_keywords_split($keys);
  if (!is_array($keys)) {
    // Nothing to iterate over if the splitter did not yield a word list.
    $keys = array();
  }

  $words = array();
  $arguments = array();
  $refused = array();
  $minimum_length = variable_get('remove_short', 3);
  // Build WHERE clause
  foreach ($keys as $word) {
    if (string_length($word) < $minimum_length) {
      if ($word != '') {
        // Show the refused word the way the user typed it.
        $refused[] = str_replace($wildcard, '*', $word);
      }
      continue;
    }
    if (strpos($word, $wildcard) !== false) {
      // Note: strtolower can be used because the value is only used internally.
      $words[] = "i.word LIKE '%s'";
      $arguments[] = str_replace($wildcard, '%', strtolower($word));
    }
    else {
      $words[] = "i.word = '%s'";
      $arguments[] = strtolower($word);
    }
  }
  // Tell the user which words were excluded
  if (count($refused) && $variation) {
    $message = format_plural(count($refused),
                             'The word %words was not included because it is too short.',
                             'The words %words were not included because they were too short.');
    drupal_set_message(strtr($message, array('%words' => theme('placeholder', implode(', ', $refused)))));
  }

  if (count($words) == 0) {
    return array();
  }
  $conditions = $where .' AND ('. implode(' OR ', $words) .')';

  // Get result count (for pager)
  $count = db_num_rows(db_query("SELECT DISTINCT i.sid, i.type FROM {search_index} i $join WHERE $conditions", $arguments));
  if ($count == 0) {
    // Try out a looser search query if nothing was found.
    if ($variation && $loose = search_keywords_variation($keywords)) {
      return do_search($loose, $type, $join, $where, false);
    }
    else {
      return array();
    }
  }
  // The count is already known, so the pager's count query just selects it.
  $count_query = "SELECT $count";

  // Do pager query
  $query = "SELECT i.type, i.sid, SUM(i.score/t.count) AS score FROM {search_index} i $join INNER JOIN {search_total} t ON i.word = t.word WHERE $conditions GROUP BY i.type, i.sid ORDER BY score DESC";
  $result = pager_query($query, 15, 0, $count_query, $arguments);

  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item->sid;
  }

  return $results;
}

538
539
540
541
542
543
544
545
546
547
/**
 * Helper function for grabbing search keys.
 *
 * @return
 *   Everything after the second slash of $_GET['q'] when the path has at
 *   least three parts; otherwise the 'keys' request parameter, or NULL when
 *   that parameter is absent as well.
 */
function search_get_keys() {
  // Extract keys as remainder of path
  // Note: support old GET format of searches for existing links.
  $q = isset($_GET['q']) ? $_GET['q'] : '';
  // A limit of 3 keeps any further slashes inside the keywords part.
  $path = explode('/', $q, 3);
  if (count($path) == 3) {
    return $path[2];
  }
  return isset($_REQUEST['keys']) ? $_REQUEST['keys'] : NULL;
}

Dries's avatar
   
Dries committed
548
549
550
551
/**
 * Menu callback; presents the search form and/or search results.
 *
 * @return
 *   The search form followed by the themed results (or a "no results" box)
 *   when the user may search. Nothing is returned when access is denied.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (!empty($_POST['edit']['keys'])) {
    if ($type == '') {
      $type = 'node';
    }
    drupal_goto('search/'. urlencode($type) .'/'. urlencode($_POST['edit']['keys']));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Stays empty when no search is performed, so only the form is shown.
    $results = '';

    // Only perform search if there is non-whitespace search term:
    if (trim($keys)) {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. urlencode($type) .'/'. urlencode($keys))
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    return $output;
  }
  else {
    drupal_access_denied();
  }
}

608
609
610
611
612
613
614
615
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. Defaults to "search".
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the node for. Must be the name of module
 *   which implements hook_search(). Defaults to 'node'.
 * @param $prompt
 *   A piece of text to put before the form (e.g. "Enter your keywords")
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {
  if (!$action) {
    // Built from $type as passed in, before the 'node' default below applies.
    $action = url('search/'. $type);
  }
  if (!$type) {
    $type = 'node';
  }
  // Only a null prompt gets the default; an empty string means "no prompt".
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $output = ' <div class="search-form">';
  $box = '<div class="container-inline">';
  // The field is wider when a prompt is shown than when there is none.
  $box .= form_textfield('', 'keys', $keys, $prompt ? 40 : 20, 255);
  $box .= form_submit(t('Search'));
  $box .= '</div>';
  $output .= form_item($prompt, $box);
  $output .= '</div>';

  return form($output, 'post', $action);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The keywords to search for. When NULL (the default), no search is run.
 * @param $type
 *   The name of the module whose hook_search() implementation is invoked.
 *   Defaults to 'node'.
 * @return
 *   The themed result items wrapped in a <dl class="search-results"> list and
 *   followed by a themed pager, or an empty string when no keys were given,
 *   the module does not implement hook_search(), or nothing was found.
 */
function search_data($keys = NULL, $type = 'node') {
  // Nothing to do without keywords or without a search hook for this type.
  if (!isset($keys) || !module_hook($type, 'search')) {
    return '';
  }

  $found = module_invoke($type, 'search', 'search', $keys);
  if (!is_array($found) || !count($found)) {
    return '';
  }

  // Theme every hit, then wrap the list and append the pager.
  $items = '';
  foreach ($found as $hit) {
    $items .= theme('search_item', $hit, $type);
  }

  return '<dl class="search-results">'. $items .'</dl>'. theme('pager', NULL, 15, 0);
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
 *   A string containing keywords. They are split into words with
 *   search_keywords_split().
 *
 * @param $text
 *   The text to extract fragments from. Tags are stripped before excerpting.
 *
 * @return
 *   A string containing HTML for the excerpt: text fragments joined by ' ... '
 *   with each keyword wrapped in <strong>. If no keyword is found, the
 *   beginning of the text (truncated to 256) followed by ' ...' is returned.
 */
function search_excerpt($keys, $text) {
  $keys = search_keywords_split($keys);
  // Pad tags with spaces before stripping them, so that words separated only
  // by markup do not run together.
  $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));

  // Discard empty keywords up front. An empty alternative in the highlight
  // pattern below would match at every word boundary and litter the excerpt
  // with empty <strong></strong> pairs.
  $keys = array_filter($keys, 'strlen');
  // Escape the keywords for use inside a '/'-delimited regular expression.
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;

  // Collect ranges of text around each keyword, starting/ending at spaces,
  // until about 256 bytes have been gathered or no keyword yields any more
  // matches. If one pass over the keywords is too short, we look for second
  // (and further) occurrences.
  $ranges = array();
  $included = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      if ($length >= 256) {
        break;
      }
      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
      if (preg_match('/\b'. $key .'\b/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
        if (($q = strpos($text, ' ', max(0, $p - 60))) !== false) {
          $end = substr($text, $p, 80);
          if (($s = strrpos($end, ' ')) !== false) {
            $ranges[$q] = $p + $s;
            $length += $p + $s - $q;
            // Resume after the whole match. Advancing by a single byte could
            // land inside a multi-byte UTF-8 character, which makes the next
            // preg_match() with the /u modifier fail.
            $included[$key] = $p + strlen($match[0][0]);
          }
          else {
            unset($workkeys[$k]);
          }
        }
        else {
          unset($workkeys[$k]);
        }
      }
      else {
        unset($workkeys[$k]);
      }
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($text, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }
  // A leading ellipsis is only added when the excerpt does not start at the
  // very beginning of the text.
  $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  $text = preg_replace('/\b('. implode('|', $keys) .')\b/iu', '<strong>\0</strong>', $text);

  return $text;
}

803
804
805
806
/**
 * @} End of "defgroup search".
 */

807
808
809
/**
 * Helper function for array_walk() in search_excerpt().
 *
 * Escapes a keyword in place so that it can be embedded literally in a
 * regular expression delimited by '/'.
 *
 * @param $text
 *   The keyword to escape. Passed by reference and overwritten.
 */
function _search_excerpt_replace(&$text) {
  $escaped = preg_quote($text, '/');
  $text = $escaped;
}

/**
 * Format a single result entry of a search query.
 *
 * Modules may implement hook_search_item() in order to override this default
 * function to display search results.
 *
 * @param $item
 *   A single search result as returned by hook_search(). The result should be
 *   an array with keys "link", "title", "type", "user", "date", and "snippet".
 *   Empty or missing "type", "user", "date" and "snippet" entries are left out
 *   of the output. Optionally, "extra" can be an array of extra info to show
 *   along with the result.
 * @param $type
 *   The type of item found, such as "user" or "node".
 *
 * @return
 *   The output of the module's hook_search_item() when it implements one,
 *   otherwise an HTML <dt>/<dd> pair for the result.
 *
 * @ingroup themeable
 */
function theme_search_item($item, $type) {
  // A module that implements hook_search_item() fully controls the rendering.
  if (module_hook($type, 'search_item')) {
    return module_invoke($type, 'search_item', $item);
  }

  $output = ' <dt class="title"><a href="'. check_url($item['link']) .'">'. check_plain($item['title']) .'</a></dt>';

  // Gather the info line. The optional keys are tested with empty()/isset()
  // so that results which omit them do not raise undefined index notices.
  $info = array();
  if (!empty($item['type'])) {
    $info[] = $item['type'];
  }
  if (!empty($item['user'])) {
    $info[] = $item['user'];
  }
  if (!empty($item['date'])) {
    $info[] = format_date($item['date'], 'small');
  }
  if (isset($item['extra']) && is_array($item['extra'])) {
    $info = array_merge($info, $item['extra']);
  }

  $snippet = !empty($item['snippet']) ? '<p>'. $item['snippet'] . '</p>' : '';
  $output .= ' <dd>'. $snippet . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';

  return $output;
}


Dries's avatar
   
Dries committed
856
?>