search.module 30.7 KB
Newer Older
Dries's avatar
 
Dries committed
1
<?php
2
// $Id$
Dries's avatar
 
Dries committed
3

Dries's avatar
 
Dries committed
4 5 6 7 8
/**
 * @file
 * Enables site-wide keyword searching.
 */

9
/**
10
 * Matches Unicode character classes to exclude from the search index.
Steven Wittens's avatar
Steven Wittens committed
11
 *
12 13
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
14
 * The index only contains the following character classes:
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 * Lu     Letter, Uppercase
 * Ll     Letter, Lowercase
 * Lt     Letter, Titlecase
 * Lm     Letter, Modifier
 * Lo     Letter, Other
 * Mn     Mark, Nonspacing
 * Mc     Mark, Spacing Combining
 * Nd     Number, Decimal Digit
 * Nl     Number, Letter
 * No     Number, Other
 * Sm     Symbol, Math
 * Sc     Symbol, Currency
 * Sk     Symbol, Modifier
 * So     Symbol, Other
 *
30
 * All character classes not in the list above are excluded from the index (enclosing marks, punctuation, control codes and spacers):
31 32 33 34 35
 * 'Me', 'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Cs', 'Co'
 */
define('PREG_CLASS_SEARCH_EXCLUDE', '\x{0}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{7f}-\x{a1}\x{ab}\x{ad}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{488}\x{489}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{600}-\x{603}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{6dd}\x{6de}\x{700}-\x{70d}\x{70f}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17b4}\x{17b5}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{180e}\x{1944}\x{1945}\x{2000}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{205f}-\x{2063}\x{206a}-\x{206f}\x{207d}\x{207e}\x{208d}\x{208e}\x{20dd}-\x{20e0}\x{20e2}-\x{20e4}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3000}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{d800}\x{db7f}\x{db80}\x{dbff}\x{dc00}\x{dfff}\x{e000}\x{f8ff}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{feff}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{fff9}-\x{fffb}\x{10100}\x{10101}\x{1039f}\x{1d173}-\x{1d17a}\x{e0001}\x{e0020}-\x{e007f}\x{f0000}\x{ffffd}\x{100000}');

/**
Steven Wittens's avatar
Steven Wittens committed
36
 * Matches all 'N' Unicode character classes (numbers).
37 38 39 40
 */
define('PREG_CLASS_NUMBERS', '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}\x{10107}-\x{10133}\x{10320}-\x{10323}\x{1034a}\x{104a0}-\x{104a9}\x{1d7ce}-\x{1d7ff}');

/**
Steven Wittens's avatar
Steven Wittens committed
41
 * Matches all 'P' Unicode character classes (punctuation).
42 43 44
 */
define('PREG_CLASS_PUNCTUATION', '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-\x{ff65}\x{10100}\x{10101}\x{1039f}');

Dries's avatar
 
Dries committed
45 46 47 48
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 * @return
 *   The translated help text for the sections this module knows about;
 *   nothing for any other section.
 */
function search_help($section = 'admin/help#search') {
  // One-line module description.
  if ($section == 'admin/modules#description') {
    return t('Enables site-wide keyword searching.');
  }

  // Introductory text for the search settings page.
  if ($section == 'admin/settings/search') {
    return t('
<p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
');
  }

  // Hints shown when a search returned nothing.
  if ($section == 'search#noresults') {
    $replacements = array('%number' => variable_get('minimum_word_size', 3));
    return t('<p><ul>
<li>Check if your spelling is correct.</li>
<li>Try using wildcards: <em>walk*</em> matches <em>walker</em>, <em>walking</em>, ...</li>
<li>Use longer words (words shorter than %number letters are ignored).</li>
</ul></p>', $replacements);
  }
}
Kjartan's avatar
Kjartan committed
64 65

/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names declared by the search module.
 */
function search_perm() {
  $permissions = array();
  $permissions[] = 'search content';
  $permissions[] = 'administer search';

  return $permissions;
}

72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   The operation to perform: 'list' returns the block descriptions, 'view'
 *   returns the block itself.
 * @param $delta
 *   Which block to act on. Not used: this module declares a single block.
 * @return
 *   For 'list', an array describing the search form block. For 'view', an
 *   array with the block 'content' and 'subject', but only when the user has
 *   the 'search content' permission and arg(0) is not 'search' (presumably
 *   to avoid a second form on the search page itself). Nothing otherwise.
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks = array();
    $blocks[0]['info'] = t('Search form');
    return $blocks;
  }
  else if ($op == 'view' && user_access('search content') && arg(0) != 'search') {
    $block = array();
    // An explicit empty prompt keeps the block compact: no label, short field.
    $block['content'] = search_form('', '', null, '');
    $block['subject'] = t('Search');
    return $block;
  }
}

Dries's avatar
 
Dries committed
88 89 90
/**
 * Implementation of hook_menu().
 *
 * @param $may_cache
 *   When true, the fixed items (search page and settings page) are returned.
 *   When false, per-request search tabs are returned, and only while arg(0)
 *   is 'search'.
 * @return
 *   An array of menu item definitions.
 */
function search_menu($may_cache) {
  $items = array();

  if ($may_cache) {
    // The search page itself.
    $items[] = array(
      'path' => 'search',
      'title' => t('search'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_SUGGESTED_ITEM);

    // The settings page.
    // NOTE(review): search_perm() declares 'administer search', yet this item
    // is gated on 'administer site configuration' -- confirm this is intended.
    $items[] = array(
      'path' => 'admin/settings/search',
      'title' => t('search'),
      'callback' => 'search_admin',
      'type' => MENU_NORMAL_ITEM,
      'access' => user_access('administer site configuration'));

    return $items;
  }

  if (arg(0) != 'search') {
    return $items;
  }

  // To remember the user's search keywords when switching across tabs,
  // we dynamically add the keywords to the search tabs' paths.
  $keys = search_get_keys();
  $suffix = '';
  if (strlen($keys)) {
    $suffix = '/'. $keys;
  }

  // One tab per module that implements hook_search().
  foreach (module_list() as $module) {
    if (!module_hook($module, 'search')) {
      continue;
    }
    $items[] = array(
      'path' => 'search/'. $module . $suffix,
      'title' => module_invoke($module, 'search', 'name'),
      'callback' => 'search_view',
      'access' => user_access('search content'),
      'type' => MENU_LOCAL_TASK);
  }

  return $items;
}

123

Dries's avatar
 
Dries committed
124 125 126
/**
 * Menu callback; displays the search module settings page.
 *
 * On POST the settings are saved through system_settings_save(). If the
 * submitted minimum word size differs from the stored one, a message is set
 * and search_wipe() is called with no arguments as well. The page then shows
 * the indexing status gathered from every module implementing hook_search(),
 * followed by the throttle and word-length settings.
 */
function search_admin() {
  if ($_POST) {
    // Read the submitted value defensively; a POST without this field is
    // compared as NULL, which is how the unguarded read behaved as well.
    $submitted = isset($_POST['edit']['minimum_word_size']) ? $_POST['edit']['minimum_word_size'] : NULL;

    // If the word length settings change, the index needs to be rebuilt.
    if (variable_get('minimum_word_size', 3) != $submitted) {
      // Note: ensure logical order of messages
      system_settings_save();
      drupal_set_message(t('The index will be rebuilt.'));
      search_wipe();
    }
    else {
      system_settings_save();
    }
  }

  // Collect some stats
  $remaining = 0;
  $total = 0;
  foreach (module_list() as $module) {
    if (module_hook($module, 'search')) {
      $status = module_invoke($module, 'search', 'status');
      $remaining += $status['remaining'];
      $total += $status['total'];
    }
  }
  $count = format_plural($remaining, 'There is 1 item left to index.', 'There are %count items left to index.');
  // max(1, $total) avoids a division by zero when nothing is indexable;
  // min(100, ...) caps the displayed percentage.
  $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
  $status = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';
  // The group title goes through t() like the other group titles below.
  $output = form_group(t('Indexing status'), $status);

  // Indexing throttle:
  $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));
  $group = form_select(t('Items to index per cron run'), 'search_cron_limit', variable_get('search_cron_limit', 100), $items, t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'));
  $output .= form_group(t('Indexing throttle'), $group);

  // Indexing settings:
  $group = '<em>'. t('<p>Changing the setting below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>';
  $group .= form_textfield(t('Minimum word length to index'), 'minimum_word_size', variable_get('minimum_word_size', 3), 3, 3, t('The number of characters a word has to be to be indexed. Words shorter than this will not be searchable.'));
  $group .= form_textfield(t('Minimum word length to search for'), 'remove_short', variable_get('remove_short', 3), 3, 3, t('The number of characters a word has to be to be searched for, including wildcard characters.'));
  $output .= form_group(t('Indexing settings'), $group);

  print theme('page', system_settings_form($output));
}

Dries's avatar
Dries committed
169
/**
 * Wipes a part of or the entire search index.
 *
 * With no arguments, every module is asked to reset its index through
 * hook_search('reset'). Otherwise the index rows for the given item are
 * deleted, together with the rows that were created from links found in
 * that item (matched on fromsid/fromtype).
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 */
function search_wipe($sid = NULL, $type = NULL) {
  if ($type != NULL || $sid != NULL) {
    // Remove the item's own words ...
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    // ... and the words it contributed to other items through links.
    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
    return;
  }

  module_invoke_all('search', 'reset');
}

188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
/**
 * Marks a word as dirty, or retrieves the list of dirty words.
 *
 * This is used during indexing (cron). Words which are dirty have outdated
 * total counts in the search_total table, and need to be recounted.
 *
 * @param $word
 *   (optional) The word to mark as dirty. When omitted, nothing is marked.
 * @return
 *   When called without a word: an array whose keys are the words marked so
 *   far during this request. Nothing when a word is passed.
 */
function search_dirty($word = null) {
  static $dirty = array();

  // Getter mode: hand back everything collected so far.
  if ($word === null) {
    return $dirty;
  }

  // Setter mode: the word is stored as a key, so duplicates collapse.
  $dirty[$word] = true;
}

Kjartan's avatar
Kjartan committed
203
/**
 * Implementation of hook_cron().
 *
 * Fires hook_update_index() in all modules and cleans up dirty words (see
 * search_dirty): their totals in search_total are recomputed, and words that
 * no longer occur in search_index are removed from search_total.
 */
function search_cron() {
  // Update word index
  foreach (module_list() as $name) {
    module_invoke($name, 'update_index');
  }

  // Update word counts for new/changed words
  foreach (array_keys(search_dirty()) as $word) {
    $sum = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    db_query("UPDATE {search_total} SET count = %d WHERE word = '%s'", $sum, $word);
    // Note: affected rows does not count matching rows that already had the
    // right value! So only insert once we are sure the row is really missing.
    if (!db_affected_rows() && !db_result(db_query("SELECT COUNT(*) FROM {search_total} WHERE word = '%s'", $word))) {
      db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %d)", $word, $sum);
    }
  }

  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $orphans = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($orphan = db_fetch_object($orphans)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $orphan->realword);
  }
}

/**
 * Splits a string into component words according to indexing rules.
 *
 * The result of the most recent call is remembered, so that splitting the
 * same input twice in a row is free.
 *
 * @param $text
 *   The text to split. HTML entities are decoded before splitting.
 * @return
 *   An array of words. It may contain empty strings, because the text is
 *   split on single spaces. Each word is passed through
 *   _search_keywords_truncate().
 */
function search_keywords_split($text) {
  static $last = null;
  static $lastsplit = null;

  // The cache is keyed on the unprocessed input and compared strictly.
  // A loose comparison would treat '' as equal to the initial null (and
  // return null instead of an array) and would confuse numeric-looking
  // strings such as '10' and '10.0'.
  if ($lastsplit !== null && $last === $text) {
    return $lastsplit;
  }
  $input = $text;

  // Decode entities to UTF-8
  $text = decode_entities($text);

  // Call an external processor for word handling.
  search_preprocess($text);

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behaviour with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);

  // Process words
  $words = explode(' ', $text);
  array_walk($words, '_search_keywords_truncate');

  // Save last keyword result, keyed on what the caller actually passed in.
  $last = $input;
  $lastsplit = $words;

  return $words;
}

279 280 281 282
/**
 * Helper function for array_walk in search_keywords_split.
 *
 * @param $text
 *   A single word, passed by reference and overwritten with the result of
 *   truncate_utf8($text, 50).
 */
function _search_keywords_truncate(&$text) {
  $shortened = truncate_utf8($text, 50);
  $text = $shortened;
}

286 287
/**
 * Loosens up a set of search keywords by adding wildcards, if possible.
 *
 * Every space-separated keyword is wrapped in '*' on both sides; runs of
 * several '*' are then collapsed into one.
 *
 * @param $text
 *   The keywords as entered by the user.
 * @return
 *   If more wildcards can be added, the adjusted keywords are returned.
 *   If the query is already as loose as possible, NULL is returned.
 */
function search_keywords_variation($text) {
  $text = trim($text);

  // Wrap each keyword in wildcards.
  $loosened = array();
  foreach (explode(' ', $text) as $keyword) {
    $loosened[] = '*'. $keyword .'*';
  }

  // Collapse wildcards that were doubled up by the wrapping.
  $new = preg_replace('/\*+/', '*', implode(' ', $loosened));

  if ($new == $text) {
    return NULL;
  }
  return $new;
}

301 302 303 304
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, passed by reference. It is replaced by the return
 *   value of each implementing module in turn, so every module sees the
 *   output of the previous one.
 */
function search_preprocess(&$text) {
  $implementations = module_implements('search_preprocess');
  foreach ($implementations as $implementation) {
    $text = module_invoke($implementation, 'search_preprocess', $text);
  }
}

310

Kjartan's avatar
Kjartan committed
311
/**
 * Update the full-text search index for a particular item.
 *
 * The text is reduced to a fixed set of scoring tags, split into tags and
 * plain text, and every word of at least 'minimum_word_size' characters is
 * scored according to the tags that enclose it. Words inside a link whose
 * path matches node/N or book/N (optionally with view/) are credited to
 * node N instead of to the item being indexed. The item's previous index
 * rows are wiped before the new ones are inserted, and every inserted word
 * is marked dirty (see search_dirty).
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
  $minimum_word_size = variable_get('minimum_word_size', 3);

  global $base_url;
  // '!' is the delimiter of this pattern, so preg_quote() must escape it too.
  $node_regexp = '!href=[\'"]?(?:'. preg_quote($base_url, '!') .'/)?(?:\?q=)?([^\'">]+)[\'">]!i';

  // Multipliers for scores of words inside certain HTML tags.
  // Note: 'a' must be included for link ranking to work.
  $tags = array('h1' => 21,
                'h2' => 18,
                'h3' => 15,
                'h4' => 12,
                'h5' => 9,
                'h6' => 6,
                'u' => 5,
                'b' => 5,
                'strong' => 5,
                'em' => 5,
                'a' => 10);

  // Strip off all ignored tags to speed up processing, but insert space before/after
  // them to keep word boundaries.
  $text = str_replace(array('<', '>'), array(' <', '> '), $text);
  $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');

  // Split HTML tags from plain text.
  $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  // Note: PHP ensures the array consists of alternating delimiters and literals
  // and begins and ends with a literal (inserting $null as required).

  $tag = false; // Odd/even counter. Tag or no tag.
  $link = false; // State variable for link analyser
  $linknid = 0; // Node id the current link points to; only used while $link is true
  $score = 1; // Starting score per word

  // Index 0 collects the item's own words; any other index is the id of a
  // node that this item links to.
  $results = array(0 => array());

  foreach ($split as $value) {
    if ($tag) {
      // Increase or decrease score per word based on tag
      list($tagname) = explode(' ', $value, 2);
      $tagname = strtolower($tagname);
      if ($tagname !== '' && $tagname[0] == '/') {
        // Closing tag: take back the bonus that the opening tag added.
        // Unknown tag names contribute nothing.
        $closed = (string)substr($tagname, 1);
        $score -= isset($tags[$closed]) ? $tags[$closed] : 0;
        if ($score < 1) { // possible due to bad HTML
          $score = 1;
        }
        if ($tagname == '/a') {
          $link = false;
        }
      }
      else {
        if ($tagname == 'a') {
          // Check if link points to a node on this site
          if (preg_match($node_regexp, $value, $match)) {
            $path = drupal_get_normal_path($match[1]);
            if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
              $linknid = $match[1];
              if ($linknid > 0) {
                $link = true;
              }
            }
          }
        }
        $score += isset($tags[$tagname]) ? $tags[$tagname] : 0;
      }
    }
    else {
      // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
      if ($value != '') {
        $words = search_keywords_split($value);
        foreach ($words as $word) {
          // Check wordlength
          if (string_length($word) >= $minimum_word_size) {
            // Note: strtolower can be used because the value is only used internally.
            $word = strtolower($word);
            // Words inside a node link count for the linked node.
            $target = $link ? $linknid : 0;
            if (!isset($results[$target][$word])) {
              $results[$target][$word] = 0;
            }
            $results[$target][$word] += $score;
          }
        }
      }
    }
    $tag = !$tag;
  }

  search_wipe($sid, $type);

  // Insert results into search index
  foreach ($results[0] as $word => $score) {
    db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %d)", $word, $sid, $type, $score);
    search_dirty($word);
  }
  unset($results[0]);

  // Now insert links to nodes
  foreach ($results as $nid => $words) {
    foreach ($words as $word => $score) {
      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %d)", $word, $nid, 'node', $sid, $type, $score);
      search_dirty($word);
    }
  }
}

/**
 * Do a query on the full-text search index for a word or words.
 *
 * This function is normally only called by each module that support the
 * indexed search (and thus, implements hook_update_index()).
 *
 * The final query is an SQL select on the search_index table. As a guide for
 * writing the optional extra SQL fragments (see below), use this query:
 *
 * SELECT i.type, i.sid, SUM(i.score/t.count) AS score
 * FROM {search_index} i
 * $join INNER JOIN {search_total} t ON i.word = t.word
 * WHERE $where AND (i.word = '...' OR ...)
 * GROUP BY i.type, i.sid
 * ORDER BY score DESC
 *
 * If nothing is found, the search is retried once with wildcards added
 * around every keyword (see search_keywords_variation).
 *
 * @param $keywords
 *   A search string as entered by the user.
 *
 * @param $type
 *   A string identifying the calling module.
 *
 * @param $join
 *   (optional) A string to be inserted into the JOIN part of the SQL query.
 *   For example "INNER JOIN {node} n ON n.nid = i.sid".
 *
 * @param $where
 *   (optional) A string to be inserted into the WHERE part of the SQL query.
 *   For example "(n.status > 0)".
 *
 * @param $variation
 *   Used internally. Must not be specified.
 *
 * @return
 *   An array of SIDs for the search results.
 *
 * @ingroup search
 */
function do_search($keywords, $type, $join = '', $where = '1', $variation = true) {
  // NOTE(review): $type is only passed on to the recursive call; neither query
  // below filters on i.type. Confirm that callers restrict the type through
  // $join/$where.

  // Note, we replace the wildcards with U+FFFD (Replacement character) to pass
  // through the keyword extractor. Multiple wildcards are collapsed into one.
  $wildcard = '�';
  $keys = preg_replace('!\*+!', $wildcard, $keywords);

  // Split into words
  $keys = search_keywords_split($keys);
  if (!is_array($keys)) {
    // The splitter can hand back its empty cache for an empty string.
    $keys = array();
  }

  // The minimum length is the same for every keyword; look it up once.
  $minimum_length = variable_get('remove_short', 3);

  $words = array();
  $arguments = array();
  $refused = array();
  // Build WHERE clause
  foreach ($keys as $word) {
    if (string_length($word) < $minimum_length) {
      if ($word != '') {
        // Show the wildcard the way the user typed it.
        $refused[] = str_replace($wildcard, '*', $word);
      }
      continue;
    }
    if (strpos($word, $wildcard) !== false) {
      // Note: strtolower can be used because the value is only used internally.
      $words[] = "i.word LIKE '%s'";
      $arguments[] = str_replace($wildcard, '%', strtolower($word));
    }
    else {
      $words[] = "i.word = '%s'";
      $arguments[] = strtolower($word);
    }
  }
  // Tell the user which words were excluded. This is skipped on the loosened
  // retry ($variation is false there), so the message is not repeated.
  if (count($refused) && $variation) {
    $message = format_plural(count($refused),
                             'The word %words was not included because it is too short.',
                             'The words %words were not included because they were too short.');
    drupal_set_message(strtr($message, array('%words' => theme('placeholder', implode(', ', $refused)))));
  }

  if (count($words) == 0) {
    return array();
  }
  $conditions = $where .' AND ('. implode(' OR ', $words) .')';

  // Get result count (for pager)
  $count = db_num_rows(db_query("SELECT DISTINCT i.sid, i.type FROM {search_index} i $join WHERE $conditions", $arguments));
  if ($count == 0) {
    // Try out a looser search query if nothing was found.
    if ($variation && $loose = search_keywords_variation($keywords)) {
      return do_search($loose, $type, $join, $where, false);
    }
    else {
      return array();
    }
  }
  // The count is already known, so the pager gets a constant count query.
  $count_query = "SELECT $count";

  // Do pager query
  $query = "SELECT i.type, i.sid, SUM(i.score/t.count) AS score FROM {search_index} i $join INNER JOIN {search_total} t ON i.word = t.word WHERE $conditions GROUP BY i.type, i.sid ORDER BY score DESC";
  $result = pager_query($query, 15, 0, $count_query, $arguments);

  $results = array();
  while ($item = db_fetch_object($result)) {
    $results[] = $item->sid;
  }

  return $results;
}

539 540 541 542 543 544 545 546 547 548
/**
 * Helper function for grabbing search keys.
 *
 * @return
 *   Everything after the second slash of $_GET['q'] when the path has at
 *   least three parts; otherwise $_REQUEST['keys'].
 */
function search_get_keys() {
  // Extract keys as remainder of path. The limit of 3 keeps any further
  // slashes inside the keywords.
  $parts = explode('/', $_GET['q'], 3);
  if (count($parts) == 3) {
    return $parts[2];
  }

  // Note: support old GET format of searches for existing links.
  return $_REQUEST['keys'];
}

Dries's avatar
 
Dries committed
549 550 551 552
/**
 * Menu callback; presents the search form and/or search results.
 *
 * A submitted form is redirected to search/<type>/<keywords>, and a request
 * without a type is redirected to search/node. Otherwise, users with the
 * 'search content' permission get the search form, followed by the results
 * when keywords are present; everybody else gets an access denied page.
 */
function search_view() {
  $type = arg(1);

  // Search form submits with POST but redirects to GET. This way we can keep
  // the search query URL clean as a whistle:
  // search/type/keyword+keyword
  if (!empty($_POST['edit']['keys'])) {
    if ($type == '') {
      $type = 'node';
    }
    drupal_goto('search/'. urlencode($type) .'/'. urlencode($_POST['edit']['keys']));
  }
  else if ($type == '') {
    // Note: search/node can not be a default tab because it would take on the
    // path of its parent (search). It would prevent remembering keywords when
    // switching tabs. This is why we drupal_goto to it from the parent instead.
    drupal_goto('search/node');
  }
  $keys = search_get_keys();

  if (user_access('search content')) {
    // Stays empty when no search is performed, so the concatenation below
    // never touches an undefined variable.
    $results = '';

    // Only perform search if there is non-whitespace search term:
    if (trim($keys)) {
      // Log the search keys:
      watchdog('search',
        t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))),
        WATCHDOG_NOTICE,
        l(t('results'), 'search/'. urlencode($type) .'/'. urlencode($keys))
        );

      // Collect the search results:
      $results = search_data($keys, $type);

      if ($results) {
        $results = theme('box', t('Search results'), $results);
      }
      else {
        $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
      }
    }
    else if (isset($_POST['edit'])) {
      form_set_error('keys', t('Please enter some keywords.'));
    }

    // Construct the search form.
    // Note, we do this last because of the form_set_error() above.
    $output = search_form(NULL, $keys, $type);

    $output .= $results;

    print theme('page', $output);
  }
  else {
    drupal_access_denied();
  }
}

609 610 611 612 613 614 615 616
/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by search.module, so this must be enabled
 * for all of the search features to work.
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631
 *
 * There are three ways to interact with the search system:
 * - Specifically for searching nodes, you can implement nodeapi('update index')
 *   and nodeapi('search result'). However, note that the search system already
 *   indexes all visible output of a node, i.e. everything displayed normally
 *   by hook_view() and hook_nodeapi('view'). This is usually sufficient.
 *   You should only use this mechanism if you want additional, non-visible data
 *   to be indexed.
 * - Implement hook_search(). This will create a search tab for your module on
 *   the /search page with a simple keyword search form. You may optionally
 *   implement hook_search_item() to customize the display of your results.
 * - Implement hook_update_index(). This allows your module to use Drupal's
 *   HTML indexing mechanism for searching full text efficiently.
 *
 * If your module needs to provide a more complicated search form, then you need
Dries's avatar
Dries committed
632
 * to implement it yourself without hook_search(). In that case, you should
633 634
 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
 * so that users can easily find it.
635 636 637 638 639 640 641 642 643
 */

/**
 * Render a search form.
 *
 * @param $action
 *   Form action. When empty, it defaults to the URL of 'search/' followed by
 *   $type exactly as passed in (so possibly empty), because the action is
 *   built before $type receives its default.
 * @param $keys
 *   The search string entered by the user, containing keywords for the search.
 * @param $type
 *   The type of search to render the form for. Must be the name of module
 *   which implements hook_search(). Defaults to 'node'.
 * @param $prompt
 *   A piece of text to put before the form (e.g. "Enter your keywords").
 *   NULL selects the default prompt; an empty string gives a form without
 *   prompt and with a shorter text field.
 * @return
 *   An HTML string containing the search form.
 */
function search_form($action = '', $keys = '', $type = null, $prompt = null) {
  if (!$action) {
    $action = url('search/'. $type);
  }
  if (!$type) {
    $type = 'node';
  }
  if (is_null($prompt)) {
    $prompt = t('Enter your keywords');
  }

  $output = ' <div class="search-form">';
  $box = '<div class="container-inline">';
  // A form with a prompt gets the wide field (40), one without the narrow one (20).
  $box .= form_textfield('', 'keys', $keys, $prompt ? 40 : 20, 255);
  $box .= form_submit(t('Search'));
  $box .= '</div>';
  $output .= form_item($prompt, $box);
  $output .= '</div>';

  return form($output, 'post', $action);
}

/**
 * Perform a standard search on the given keys, and return the formatted results.
 *
 * @param $keys
 *   The search keywords. When not set (NULL), no search is performed.
 * @param $type
 *   Name of the module whose hook_search() implementation is invoked.
 *   Defaults to 'node'.
 * @return
 *   An HTML string: the themed results wrapped in a definition list and
 *   followed by a pager. An empty string is returned when no keys are given,
 *   when the module does not implement hook_search(), or when the hook does
 *   not return a non-empty array.
 */
function search_data($keys = NULL, $type = 'node') {
  // Nothing to search for, or nobody to ask.
  if (!isset($keys) || !module_hook($type, 'search')) {
    return '';
  }

  $found = module_invoke($type, 'search', 'search', $keys);
  if (!is_array($found) || !count($found)) {
    return '';
  }

  // Theme every result, then wrap the list and append the pager.
  $items = '';
  foreach ($found as $result) {
    $items .= theme('search_item', $result, $type);
  }

  return '<dl class="search-results">'. $items .'</dl>'. theme('pager', NULL, 15, 0);
}

/**
 * Returns snippets from a piece of text, with certain keywords highlighted.
 * Used for formatting search results.
 *
 * @param $keys
 *   A string containing keywords. They are split into words using the same
 *   rules as search indexing.
 *
 * @param $text
 *   The text to extract fragments from. Tags are stripped before excerpting.
 *
 * @return
 *   A string containing HTML for the excerpt. When none of the keywords can be
 *   located, the beginning of the stripped text is returned instead.
 */
function search_excerpt($keys, $text) {
  // Escape the keywords for use in regular expressions and drop empty ones:
  // an empty alternative in the highlighting pattern below would match at
  // every word boundary and scatter empty <strong></strong> pairs around.
  $keys = search_keywords_split($keys);
  array_walk($keys, '_search_excerpt_replace');
  $keys = array_filter($keys, 'strlen');
  $workkeys = $keys;

  // Separate adjacent elements with spaces before stripping the tags, so that
  // words from neighbouring elements do not run together.
  $plain = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
  // Pad with a space on both sides so that a keyword in the very first or very
  // last word still has a space in front of and behind it to anchor on.
  $text = ' '. $plain .' ';

  // Extract a fragment per keyword.
  // First we collect ranges of text around each keyword, starting/ending
  // at spaces.
  // If the sum of all fragments is too short, we look for second occurrences.
  $ranges = array();
  $included = array();
  $length = 0;
  while ($length < 256 && count($workkeys)) {
    foreach ($workkeys as $k => $key) {
      if ($length >= 256) {
        break;
      }
      // Remember occurrence of key so we can skip over it if more occurrences
      // are desired.
      if (!isset($included[$key])) {
        $included[$key] = 0;
      }
      // Locate a keyword (position $p), then locate a space in front (position
      // $q) and behind it (position $s, relative to $p).
      $located = FALSE;
      if (preg_match('/\b'. $key .'\b/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
        $p = $match[0][1];
        $q = strpos($text, ' ', max(0, $p - 60));
        $s = strrpos(substr($text, $p, 80), ' ');
        if ($q !== false && $s !== false) {
          // The first space after ($p - 60) may lie behind the keyword (e.g.
          // after a long run without spaces). Start at the keyword itself in
          // that case so that the fragment always contains it.
          $q = min($q, $p);
          $to = $p + $s;
          // Several keywords can share the same starting space; keep the
          // longest fragment instead of overwriting it.
          $ranges[$q] = isset($ranges[$q]) ? max($ranges[$q], $to) : $to;
          $length += $to - $q;
          $included[$key] = $p + 1;
          $located = TRUE;
        }
      }
      // No (further) usable occurrence of this keyword: stop looking for it.
      if (!$located) {
        unset($workkeys[$k]);
      }
    }
  }

  // If we didn't find anything, return the beginning.
  if (count($ranges) == 0) {
    return truncate_utf8($plain, 256) . ' ...';
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
  $newranges = array();
  $from1 = NULL;
  $to1 = NULL;
  foreach ($ranges as $from2 => $to2) {
    if (is_null($from1)) {
      $from1 = $from2;
      $to1 = $to2;
      continue;
    }
    if ($from2 <= $to1) {
      $to1 = max($to1, $to2);
    }
    else {
      $newranges[$from1] = $to1;
      $from1 = $from2;
      $to1 = $to2;
    }
  }
  $newranges[$from1] = $to1;

  // Fetch text
  $out = array();
  foreach ($newranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }
  // Only prefix an ellipsis when the first fragment does not start at the
  // beginning of the text.
  $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';

  // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
  $text = preg_replace('/\b('. implode('|', $keys) .')\b/iu', '<strong>\0</strong>', $text);
  return $text;
}

804 805 806 807
/**
 * @} End of "defgroup search".
 */

808 809 810
/**
 * Helper function for array_walk() in search_excerpt().
 *
 * Escapes a keyword in place so that it can be embedded literally in a
 * regular expression that uses '/' as its delimiter.
 *
 * @param $text
 *   The keyword to escape. Passed by reference and overwritten.
 */
function _search_excerpt_replace(&$text) {
  $escaped = preg_quote($text, '/');
  $text = $escaped;
}

/**
 * Format a single result entry of a search query.
 *
 * Modules may implement hook_search_item() in order to override this default
 * function to display search results.
 *
 * @param $item
 *   A single search result as returned by hook_search(). The result should be
 *   an array with keys "link", "title", "type", "user", "date", and "snippet".
 *   Optionally, "extra" can be an array of extra info to show along with the
 *   result. "link" and "title" are escaped here; "type", "user", "snippet"
 *   and the "extra" entries are output as given. Empty or missing info
 *   entries are simply left out.
 * @param $type
 *   The type of item found, such as "user" or "node".
 * @return
 *   An HTML string for the result: the output of the module's
 *   hook_search_item() when it implements one, otherwise a <dt>/<dd> pair.
 *
 * @ingroup themeable
 */
function theme_search_item($item, $type) {
  // Let the module that produced the result render it, if it wants to.
  if (module_hook($type, 'search_item')) {
    return module_invoke($type, 'search_item', $item);
  }

  $output = ' <dt class="title"><a href="'. check_url($item['link']) .'">'. check_plain($item['title']) .'</a></dt>';

  // Collect the info line. !empty()/isset() are used so that absent keys,
  // such as the optional "extra", do not raise undefined-index notices.
  $info = array();
  if (!empty($item['type'])) {
    $info[] = $item['type'];
  }
  if (!empty($item['user'])) {
    $info[] = $item['user'];
  }
  if (!empty($item['date'])) {
    $info[] = format_date($item['date'], 'small');
  }
  if (isset($item['extra']) && is_array($item['extra'])) {
    $info = array_merge($info, $item['extra']);
  }

  $snippet = !empty($item['snippet']) ? '<p>'. $item['snippet'] . '</p>' : '';
  $output .= ' <dd>'. $snippet . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';

  return $output;
}


Dries's avatar
 
Dries committed
857
?>