search.module 28 KB
Newer Older
Dries's avatar
 
Dries committed
1
<?php
2
// $Id$
Dries's avatar
 
Dries committed
3

Dries's avatar
   
Dries committed
4
5
6
7
8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
 * Matches Unicode character classes to exclude from the search index.
 *
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lm     Letter, Modifier
 * Lo     Letter, Other
 * Mn     Mark, Nonspacing
 * Mc     Mark, Spacing Combining
 * Nd     Number, Decimal Digit
 * Nl     Number, Letter
 * No     Number, Other
 * Sm     Symbol, Math
 * Sc     Symbol, Currency
 * Sk     Symbol, Modifier
 * So     Symbol, Other
 *
 * All character classes not in the list above (enclosing marks, punctuation,
 * control codes and spacers) are excluded:
 * 'Me', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Cs', 'Co'
 *
 * Notes on the ranges at the end of the list:
 * - Surrogates ('Cs', U+D800..U+DFFF) are deliberately not listed. They cannot
 *   occur in well-formed UTF-8, and PCRE refuses to compile a /u pattern that
 *   names them, which would make every replacement using this class fail.
 * - Private use areas ('Co') are expressed as real ranges (U+E000-U+F8FF,
 *   U+F0000-U+FFFFD, U+100000-U+10FFFD) rather than as their isolated first
 *   and last code points.
 *
 * The value is the inside of a character class; wrap it in [...] and use the
 * /u modifier.
 */
define('PREG_CLASS_SEARCH_EXCLUDE', '\x{0}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{7f}-\x{a1}\x{ab}\x{ad}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{488}\x{489}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{600}-\x{603}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{6dd}\x{6de}\x{700}-\x{70d}\x{70f}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17b4}\x{17b5}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{180e}\x{1944}\x{1945}\x{2000}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{205f}-\x{2063}\x{206a}-\x{206f}\x{207d}\x{207e}\x{208d}\x{208e}\x{20dd}-\x{20e0}\x{20e2}-\x{20e4}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3000}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{e000}-\x{f8ff}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{feff}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{fff9}-\x{fffb}\x{10100}\x{10101}\x{1039f}\x{1d173}-\x{1d17a}\x{e0001}\x{e0020}-\x{e007f}\x{f0000}-\x{ffffd}\x{100000}-\x{10fffd}');

/**
 * Matches all 'N' Unicode character classes (numbers).
 *
 * The value is the inside of a character class: wrap it in [...] and use the
 * /u modifier. search_keywords_split() uses it to detect runs of numeric
 * characters that are separated only by punctuation.
 */
define('PREG_CLASS_NUMBERS', '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}\x{10107}-\x{10133}\x{10320}-\x{10323}\x{1034a}\x{104a0}-\x{104a9}\x{1d7ce}-\x{1d7ff}');

/**
 * Matches all 'P' Unicode character classes (punctuation).
 *
 * The value is the inside of a character class: wrap it in [...] and use the
 * /u modifier. search_keywords_split() uses it to collapse punctuation that
 * sits between two numeric characters.
 */
define('PREG_CLASS_PUNCTUATION', '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{10100}\x{10101}\x{1039f}');

Dries's avatar
   
Dries committed
45
46
47
48
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 * @return
 *   The translated help text for the sections this module knows about;
 *   nothing for any other section.
 */
function search_help($section = 'admin/help#search') {
  if ($section == 'admin/modules#description') {
    return t('Enables site-wide keyword searching.');
  }

  if ($section == 'admin/settings/search') {
    return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
  }

  if ($section == 'search#noresults') {
    // Shown by search_view() when a search comes back empty.
    $replacements = array('%number' => variable_get('minimum_word_size', 3));
    return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Try using wildcards: <em>walk*</em> matches <em>walker</em>, <em>walking</em>, ...</li>
<li>Use longer words (words shorter than %number letters are ignored).</li>
</ul></p>', $replacements);
  }
}
Kjartan's avatar
Kjartan committed
64
65

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names defined by the search module.
 */
function search_perm() {
  // NOTE(review): nothing in this file checks 'administer search'; the
  // settings page in search_menu() is gated by 'administer site
  // configuration' instead. Confirm whether that is intended.
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'administer search';
  return $permissions;
}

Dries's avatar
   
Dries committed
72
73
74
/**
 * Implementation of hook_menu().
 *
 * Registers the main search page, one tab per module implementing
 * hook_search(), and the search settings page. All items are cacheable, so
 * nothing is returned when $may_cache is false.
 */
function search_menu($may_cache) {
  $items = array();

  if (!$may_cache) {
    return $items;
  }

  // The search page itself.
  $items[] = array(
    'path' => 'search',
    'title' => t('search'),
    'callback' => 'search_view',
    'access' => user_access('search content'),
    'type' => MENU_SUGGESTED_ITEM,
  );

  // One local task (tab) per searchable module; 'node' is the default tab.
  foreach (module_list() as $name) {
    if (!module_hook($name, 'search')) {
      continue;
    }
    $items[] = array(
      'path' => 'search/'. $name,
      'title' => module_invoke($name, 'search', 'name'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => ($name == 'node' ? MENU_DEFAULT_LOCAL_TASK : MENU_LOCAL_TASK),
    );
  }

  // The administrative settings page.
  $items[] = array(
    'path' => 'admin/settings/search',
    'title' => t('search'),
    'callback' => 'search_admin',
    'type' => MENU_NORMAL_ITEM,
    'access' => user_access('administer site configuration'),
  );

  return $items;
}

/**
 * Menu callback; displays the search module settings page.
 *
 * On submission the settings are saved through system_settings_save(). When
 * either word-length setting differs from the stored value, the search index
 * is additionally reset via search_wipe() so that it gets rebuilt.
 */
function search_admin() {
  if ($_POST) {
    // Compare against the stored values *before* saving, otherwise the
    // comparison would always see the new values.
    $edit = isset($_POST['edit']) ? $_POST['edit'] : array();
    $new_minimum = isset($edit['minimum_word_size']) ? $edit['minimum_word_size'] : NULL;
    $new_short = isset($edit['remove_short']) ? $edit['remove_short'] : NULL;
    $rebuild = variable_get('minimum_word_size', 3) != $new_minimum ||
               variable_get('remove_short', 3) != $new_short;

    // Save first so that the messages appear in a logical order.
    system_settings_save();
    if ($rebuild) {
      drupal_set_message(t('The index will be rebuilt.'));
      search_wipe();
    }
  }

  // Collect some stats
  $remaining = 0;
  $total = 0;
  foreach (module_list() as $module) {
    if (module_hook($module, 'search')) {
      $status = module_invoke($module, 'search', 'status');
      $remaining += $status['remaining'];
      $total += $status['total'];
    }
  }
  // max(1, ...) avoids a division by zero when nothing is indexable yet.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
  $status = '<p><strong>'. t('%percentage of the site has been indexed. There are %count items left to index.', array('%percentage' => $percentage, '%count' => $remaining)) .'</strong></p>';
  // The group label is user-facing and must be translatable like the others.
  $output = form_group(t('Indexing status'), $status);

  // Indexing throttle:
  $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));
  $group = form_select(t('Items to index per cron run'), 'search_cron_limit', variable_get('search_cron_limit', 100), $items, t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'));
  $output .= form_group(t('Indexing throttle'), $group);

  // Indexing settings:
  $group = '<em>'. t('<p>Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>';
  $group .= form_textfield(t('Minimum word length to index'), 'minimum_word_size', variable_get('minimum_word_size', 3), 3, 3, t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
  $group .= form_textfield(t('Minimum word length to search for'), 'remove_short', variable_get('remove_short', 3), 3, 3, t('The number of characters a word has to be to be searched for, including wildcard characters.'));
  $output .= form_group(t('Indexing settings'), $group);

  print theme('page', system_settings_form($output));
}

Dries's avatar
Dries committed
146
/**
 * Wipes a part of or the entire search index.
 *
 * Called without arguments, every module implementing hook_search() is asked
 * to 'reset'. Otherwise the index rows of the given item are deleted, along
 * with the rows that were indexed from it (its outgoing link entries).
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  if ($type != NULL || $sid != NULL) {
    // Remove the item's own words, then the words it contributed to others.
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
    return;
  }

  module_invoke_all('search', 'reset');
}

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/**
 * Marks a word as dirty, or retrieves the list of dirty words.
 *
 * Dirty words are collected while items are indexed (see search_index()) and
 * consumed by search_cron(), which recounts their totals in the search_total
 * table.
 *
 * @param $word
 *   (optional) The word to mark as dirty. When omitted, nothing is marked.
 * @return
 *   When called without a word: an array whose keys are the dirty words.
 *   When called with a word: nothing.
 */
function search_dirty($word = null) {
  static $dirty = array();

  if ($word === null) {
    return $dirty;
  }

  $dirty[$word] = true;
}

Kjartan's avatar
Kjartan committed
180
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules, recounts the totals of the words
 * flagged through search_dirty(), and removes totals for words that no longer
 * occur in the index.
 */
function search_cron() {
  // Update word index
  $modules = module_list();
  foreach ($modules as $name) {
    module_invoke($name, 'update_index');
  }

  // Update word counts for new/changed words
  foreach (array_keys(search_dirty()) as $word) {
    $sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    db_query("UPDATE {search_total} SET count = %d WHERE word = '%s'", $sum, $word);
    if (db_affected_rows()) {
      continue;
    }
    // Note: affected rows does not count matching rows that already had the
    // right value, so check for existence before inserting.
    $present = db_result(db_query("SELECT COUNT(*) FROM {search_total} WHERE word = '%s'", $word));
    if (!$present) {
      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %d)", $word, $sum);
    }
  }

  // Find words that were deleted from search_index, but are still in
  // search_total. The LEFT JOIN keeps only the rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($row = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $row->realword);
  }
}

/**
 * Splits a string into component words according to indexing rules.
 *
 * The result of the most recent call is remembered, so splitting the same
 * input twice in a row (e.g. once for the query and once for the excerpt)
 * does the work only once.
 *
 * @param $text
 *   The text to split. Entities are decoded and hook_search_preprocess()
 *   implementations are applied before splitting.
 * @return
 *   An array of words, each truncated to 50 characters. The array may contain
 *   empty strings (e.g. for leading or trailing separators).
 */
function search_keywords_split($text) {
  static $last = null;
  static $lastsplit = null;

  // The cache is keyed on the raw input and compared strictly: a loose
  // comparison would treat '' as equal to the initial null and hand back
  // null instead of an array.
  if ($last !== null && $last === $text) {
    return $lastsplit;
  }
  $input = $text;

  // Decode entities to UTF-8
  $text = decode_entities($text);

  // Call an external processor for word handling.
  search_preprocess($text);

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);

  // Process words
  $words = explode(' ', $text);
  array_walk($words, '_search_keywords_truncate');

  // Save last keyword result, keyed on what the caller actually passed in.
  $last = $input;
  $lastsplit = $words;

  return $words;
}

256
257
258
259
/**
 * Helper function for array_walk in search_keywords_split.
 *
 * Shortens a single word in place to at most 50 characters using
 * truncate_utf8().
 */
function _search_keywords_truncate(&$text) {
  $shortened = truncate_utf8($text, 50);
  $text = $shortened;
}

263
264
265
266
267
268
269
270
271
272
273
274
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to process; replaced in place by the return value of each
 *   implementing module in turn.
 */
function search_preprocess(&$text) {
  static $modules = null;

  // This function gets called a lot during reindexing, so the list of
  // implementing modules is determined once and then reused.
  if (!is_array($modules)) {
    $modules = array();
    foreach (module_list() as $name) {
      if (module_hook($name, 'search_preprocess')) {
        $modules[] = $name;
      }
    }
  }

  // Chain the text through every implementation.
  foreach ($modules as $name) {
    $text = module_invoke($name, 'search_preprocess', $text);
  }
}

286

Kjartan's avatar
Kjartan committed
287
/**
 * Update the search index for a particular item.
 *
 * The text is reduced to a small set of scoring HTML tags, split into words,
 * and every word of at least 'minimum_word_size' characters is scored
 * according to the tags it appears in. Words inside a link that points to a
 * node on this site are credited to the linked node instead of to the item
 * itself. The item's previous index entries are wiped before the new ones are
 * inserted, and every inserted word is flagged via search_dirty().
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  global $base_url;
  // '!' is the pattern delimiter, so preg_quote() has to escape it as well.
  $node_regexp = '!href=[\'"]?(?:'. preg_quote($base_url, '!') .'/)?(?:\?q=)?([^\'">]+)[\'">]!i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 21,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 5,
                'b' => 5,
                'strong' => 5,
                'em' => 5,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to (valid while $link is true)
  $score = 1; // Starting score per word

  // Key 0 collects the item's own words; any other key is a linked node id.
  $results = array(0 => array());

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = strtolower($tagname);
      // substr() instead of a string offset: safe on an empty tag name and
      // valid on every PHP version (the {0} offset syntax no longer parses).
      if (substr($tagname, 0, 1) == '/') {
        $closed = substr($tagname, 1);
        if (isset($tags[$closed])) {
          $score -= $tags[$closed];
        }
        if ($score < 1) { // possible due to bad HTML
          $score = 1;
        }
        if ($tagname == '/a') {
          $link = false;
        }
      }
      else {
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                $link = true;
              }
            }
          }
        }
        // Malformed markup (e.g. "b/") can yield names that are not in $tags.
        if (isset($tags[$tagname])) {
          $score += $tags[$tagname];
        }
      }
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        $words = search_keywords_split($value);
        foreach ($words as $word) {
          // Check wordlength
          if (string_length($word) >= $minimum_word_size) {
            $word = strtolower($word);
            // Credit the word to the linked node while inside an internal
            // link, otherwise to the item itself.
            $target = $link ? $linknid : 0;
            if (!isset($results[$target])) {
              $results[$target] = array();
            }
            if (!isset($results[$target][$word])) {
              $results[$target][$word] = 0;
            }
            $results[$target][$word] += $score;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %d)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

/**
 * Perform a search on a word or words.
 *
 * This function is called by each module that supports the indexed search
 * (and thus, implements hook_update_index()).
 *
 * The final query is an SQL select on the search_index table. As a guide for
 * writing the optional extra SQL fragments (see below), use this query:
 *
 * SELECT i.type, i.sid, i.word, SUM(i.score/t.count) AS score
 * FROM {search_index} i
 * $join INNER JOIN {search_total} t ON i.word = t.word
 * WHERE $where AND (i.word = '...' OR ...)
 * GROUP BY i.type, i.sid
 * ORDER BY score DESC";
 *
 * Keywords shorter than the 'remove_short' setting are ignored. A '*' in a
 * keyword acts as a wildcard. Results are paged, 15 per page.
 *
 * @param $keys
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *   NOTE(review): this value is not used in the query below; results are
 *   restricted by item type only if $join/$where do so. Confirm with callers.
 *
 * @param $join
 *   (optional) A string to be inserted into the JOIN part of the SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where
 *   (optional) A string to be inserted into the WHERE part of the SQL query.
 *   For example "(n.status > 0)".
 *
 * @return
 *   An array of SIDs for the search results.
 *
 * @ingroup search
 */
function do_search($keys, $type, $join = '', $where = '1') {
  // Wildcards are swapped for U+FFFD (Replacement character) so that they
  // survive the keyword extractor, which would otherwise drop the '*'.
  $keys = str_replace('*', '�', $keys);

  // Split into lowercased words.
  $keys = search_keywords_split($keys);
  foreach ($keys as $index => $keyword) {
    $keys[$index] = strtolower($keyword);
  }

  // Build one condition (and one query argument) per usable keyword.
  $conditions = array();
  $arguments = array();
  foreach ($keys as $word) {
    if (string_length($word) >= variable_get('remove_short', 3)) {
      $wildcard = (strpos($word, '�') !== false);
      $conditions[] = $wildcard ? "i.word LIKE '%s'" : "i.word = '%s'";
      $arguments[] = $wildcard ? str_replace('�', '%', $word) : $word;
    }
  }
  if (!count($conditions)) {
    return array();
  }
  $where .= ' AND ('. implode(' OR ', $conditions) .')';

  // Get result count (for pager)
  $count = db_result(db_query("SELECT COUNT(DISTINCT i.sid, i.type) FROM {search_index} i $join WHERE $where", $arguments));
  if ($count == 0) {
    return array();
  }
  // The pager only needs the number, so hand it a constant query.
  $count_query = "SELECT $count";

  // Do pager query
  $query = "SELECT i.type, i.sid, i.word, SUM(i.score/t.count) AS score FROM {search_index} i $join INNER JOIN {search_total} t ON i.word = t.word WHERE $where GROUP BY i.type, i.sid ORDER BY score DESC";
  $rows = pager_query($query, 15, 0, $count_query, $arguments);

  $sids = array();
  while ($row = db_fetch_object($rows)) {
    $sids[] = $row->sid;
  }

  return $sids;
}

Dries's avatar
   
Dries committed
496
497
498
499
/**
 * Menu callback; presents the search form and/or search results.
 *
 * The keywords are taken from the 'keys' query parameter or, failing that,
 * from the submitted form. The search type is taken from the second path
 * argument, the 'type' query parameter or the submitted form, in that order,
 * and defaults to 'node'.
 */
function search_view() {
  if (isset($_GET['keys'])) {
    $keys = $_GET['keys'];
  }
  else if (isset($_POST['edit']['keys'])) {
    $keys = $_POST['edit']['keys'];
  }
  else {
    $keys = '';
  }

  if (arg(1)) {
    $type = arg(1);
  }
  else if (isset($_GET['type'])) {
    $type = $_GET['type'];
  }
  else if (!empty($_POST['edit']['type'])) {
    $type = $_POST['edit']['type'];
  }
  else {
    $type = 'node';
  }

  if (user_access('search content')) {
    // Stays empty when no search was performed.
    $results = '';

    // Only perform search if there is non-whitespace search term. The
    // explicit comparison keeps a lone "0" from being treated as empty.
    if (trim($keys) != '') {
      // Log the search keys. They are user input and end up as HTML in the
      // log message, so they have to be escaped here.
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => '<em>'. htmlspecialchars($keys, ENT_QUOTES) .'</em>', '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search', NULL, 'keys='. urlencode($keys) . '&type='. urlencode($type))
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    print theme('page', $output);
  }
  else {
    drupal_access_denied();
  }
}

541
542
543
544
545
546
547
548
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself without hook_search(). In that case, you should
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. Defaults to "search".
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the node for. Must be the name of module
 *   which implements hook_search(). Defaults to 'node'.
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null) {
  if (!$action) {
    $action = url('search');
  }
  if (!$type) {
    $type = 'node';
  }

  $output = ' <div class="search-form">';
  // Keep the text field and the button on one line.
  $box = '<div class="container-inline">';
  $box .= form_textfield('', 'keys', $keys, 40, 255);
  $box .= form_submit(t('Search'));
  $box .= '</div>';
  $output .= form_item(t('Enter your keywords'), $box);
  // Carried along so the submission searches the same type.
  $output .= form_hidden('type', $type);
  $output .= '</div>';

  return form($output, 'post', $action);
}

/**
 * Perform a search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keywords. Nothing is searched when this is not set.
 * @param $type
 *   The module whose hook_search() implementation performs the search.
 * @return
 *   The themed result list followed by a pager, or an empty string when the
 *   module has no hook_search() or the search returned no results.
 */
function search_data($keys = NULL, $type = 'node') {
  if (!isset($keys) || !module_hook($type, 'search')) {
    return '';
  }

  $found = module_invoke($type, 'search', 'search', $keys);
  if (!is_array($found) || !count($found)) {
    return '';
  }

  $items = '';
  foreach ($found as $entry) {
    $items .= theme('search_item', $entry, $type);
  }

  $pager = theme('pager', NULL, 15, 0, array('keys' => $keys, 'type' => $type));
  return '<dl class="search-results">'. $items .'</dl>'. $pager;
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
 *   A string containing keywords. They are split into words using the same
 *   rules as search indexing.
 *
 * @param $text
 *   The text to extract fragments from.
 *
 * @return
 *   A string containing HTML for the excerpt.
 */
function search_excerpt($keys, $text) {
  $split = search_keywords_split($keys);
  $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));

  // Drop empty keywords (the splitter yields them for leading, trailing or
  // repeated separators). An empty alternative in the highlight pattern below
  // would match at every word boundary and litter the excerpt with empty
  // <strong></strong> pairs.
  $keys = array();
  if (is_array($split)) {
    foreach ($split as $key) {
      if (strlen($key) > 0) {
        $keys[] = $key;
      }
    }
  }
  array_walk($keys, '_search_excerpt_replace');
  $workkeys = $keys;

  // Extract fragments around the keywords until about 256 characters have
  // been collected. First we collect ranges of text around each keyword,
  // starting/ending at spaces.
  // If the sum of all fragments is too short, we look for second occurences.
  $ranges = array();
  $included = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      if ($length >= 256) {
        break;
      }
      // Remember occurence of key so we can skip over it if more occurences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }

      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s)
      if (!preg_match('/\b'. $key .'\b/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        // No (further) occurence of this keyword.
        unset($workkeys[$k]);
        continue;
      }
      $p = $match[0][1];

      $start = max(0, $p - 60);
      $q = strpos($text, ' ', $start);
      if ($q !== false && $q > $p) {
        // The first space lies behind the keyword, so there is none in front
        // of it within reach. Starting the fragment there would cut off the
        // keyword itself; begin at the start of the text when it is in
        // reach, otherwise at the keyword.
        $q = ($start == 0) ? 0 : $p;
      }

      $end = substr($text, $p, 80);
      $s = strrpos($end, ' ');
      if ($q === false || $s === false) {
        // No usable boundaries around this occurence.
        unset($workkeys[$k]);
        continue;
      }

      $ranges[$q] = $p + $s;
      $length += $p + $s - $q;
      $included[$key] = $p + 1;
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($text, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  foreach ($ranges as $from2 => $to2) {
    if (!isset($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }
  // No leading ellipsis when the first fragment starts at the very beginning.
  $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  if (count($keys)) {
    $highlighted = preg_replace('/\b('. implode('|', $keys) .')\b/iu', '<strong>\0</strong>', $text);
    // preg_replace() yields NULL on failure (e.g. malformed UTF-8); keep the
    // unhighlighted excerpt in that case rather than returning nothing.
    if ($highlighted !== null) {
      $text = $highlighted;
    }
  }

  return $text;
}

732
733
734
735
/**
 * @} End of "defgroup search".
 */

736
737
738
/**
 * Helper function for array_walk in search_excerpt.
 *
 * Escapes a keyword in place so it can be embedded literally in a regular
 * expression that uses '/' as its delimiter.
 */
function _search_excerpt_replace(&$text) {
  $escaped = preg_quote($text, '/');
  $text = $escaped;
}

/**
 * Format a single result entry of a search query.
 *
 * Modules may implement hook_search_item() in order to override this default
 * function to display search results.
 *
 * @param $item
 *   A single search result as returned by hook_search(). The result should be
 *   an array with keys "link", "title", "type", "user", "date", and "snippet".
 *   Optionally, "extra" can be an array of extra info to show along with the
 *   result.
 * @param $type
 *   The type of item found, such as "user" or "node".
 * @return
 *   The HTML for the result: the module's own rendering when it implements
 *   hook_search_item(), otherwise a <dt>/<dd> pair.
 *
 * @ingroup themeable
 */
function theme_search_item($item, $type) {
  // A module-specific renderer takes precedence over the default markup.
  if (module_hook($type, 'search_item')) {
    return module_invoke($type, 'search_item', $item);
  }

  $output = ' <dt class="title"><a href="'. $item['link'] .'">'. $item['title'] .'</a></dt>';

  // Collect the pieces of the info line; empty values are left out.
  $info = array();
  foreach (array('type', 'user') as $field) {
    if ($item[$field]) {
      $info[] = $item[$field];
    }
  }
  if ($item['date']) {
    $info[] = format_date($item['date'], 'small');
  }
  if (is_array($item['extra'])) {
    $info = array_merge($info, $item['extra']);
  }

  $snippet = $item['snippet'] ? '<p>'. $item['snippet'] . '</p>' : '';
  $output .= ' <dd>'. $snippet . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';

  return $output;
}


Dries's avatar
   
Dries committed
785
?>