Commit 909d6928 authored by Steven Wittens's avatar Steven Wittens
Browse files

- #28159: Advanced search features (hello from DrupalCon)

Presentation about it:
http://www.acko.net/files/drupal-search-slim.pdf
parent 782d5c98
......@@ -562,6 +562,16 @@ CREATE TABLE role (
UNIQUE KEY name (name)
) TYPE=MyISAM;
--
-- Table structure for table 'search_dataset'
--
-- One row per (sid, type) pair with its associated text in `data`;
-- sid_type is a composite, non-unique index over (sid, type).
CREATE TABLE search_dataset (
  sid INT(10) UNSIGNED NOT NULL DEFAULT '0',
  type VARCHAR(16) DEFAULT NULL,
  data LONGTEXT NOT NULL,
  INDEX sid_type (sid, type)
) TYPE=MyISAM;
--
-- Table structure for table 'search_index'
--
......@@ -572,9 +582,9 @@ CREATE TABLE search_index (
type varchar(16) default NULL,
fromsid int(10) unsigned NOT NULL default '0',
fromtype varchar(16) default NULL,
score int(10) unsigned default NULL,
KEY sid (sid),
KEY fromsid (fromsid),
score float default NULL,
KEY sid_type (sid, type),
KEY from_sid_type (fromsid, fromtype),
KEY word (word)
) TYPE=MyISAM;
......@@ -584,7 +594,7 @@ CREATE TABLE search_index (
CREATE TABLE search_total (
word varchar(50) NOT NULL default '',
count int(10) unsigned default NULL,
count float default NULL,
PRIMARY KEY (word)
) TYPE=MyISAM;
......
......@@ -571,6 +571,16 @@ CREATE TABLE role (
UNIQUE (name)
);
--
-- Table structure for table 'search_dataset'
--
-- PostgreSQL does not accept MySQL's inline KEY clause inside CREATE TABLE,
-- so the (sid, type) index is created by a separate statement below.
CREATE TABLE search_dataset (
  sid integer NOT NULL default '0',
  type varchar(16) default NULL,
  data text NOT NULL default ''
);
CREATE INDEX search_dataset_sid_type_idx ON search_dataset(sid, type);
--
-- Table structure for search_index
--
......@@ -581,10 +591,10 @@ CREATE TABLE search_index (
type varchar(16) default NULL,
fromsid integer NOT NULL default '0',
fromtype varchar(16) default NULL,
score integer default NULL
score float default NULL
);
CREATE INDEX search_index_sid_idx ON search_index(sid);
CREATE INDEX search_index_fromsid_idx ON search_index(fromsid);
CREATE INDEX search_index_sid_type_idx ON search_index(sid, type);
CREATE INDEX search_index_from_sid_type_idx ON search_index(fromsid, fromtype);
CREATE INDEX search_index_word_idx ON search_index(word);
--
......
......@@ -66,7 +66,8 @@
"2005-08-25" => "update_146",
"2005-09-07" => "update_147",
"2005-09-18" => "update_148",
"2005-09-27" => "update_149"
"2005-09-27" => "update_149",
"2005-10-15" => "update_150"
);
function update_110() {
......@@ -846,6 +847,78 @@ function update_149() {
return $ret;
}
/**
 * Update 150: rebuilds the search tables for the reworked search index.
 *
 * Deletes the 'node_cron_last', 'minimum_word_size' and 'remove_short'
 * variables, removes {node_counter} rows with nid 0, drops {search_index} and
 * {search_total}, and then creates {search_dataset}, {search_index} and
 * {search_total} with their new definitions for the active database type.
 *
 * Note: for a database type other than mysql, mysqli or pgsql the old tables
 * are still dropped but nothing is recreated.
 *
 * @return
 *   An array holding the value returned by update_sql() for each statement,
 *   in execution order.
 */
function update_150() {
  $ret = array();

  // Clear the indexing bookmark and the obsolete search settings.
  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'node_cron_last'");
  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'minimum_word_size'");
  $ret[] = update_sql("DELETE FROM {variable} WHERE name = 'remove_short'");

  $ret[] = update_sql("DELETE FROM {node_counter} WHERE nid = 0");

  // The old index data is discarded; the tables are recreated below.
  $ret[] = update_sql('DROP TABLE {search_index}');
  $ret[] = update_sql('DROP TABLE {search_total}');

  switch ($GLOBALS['db_type']) {
    case 'mysqli':
    case 'mysql':
      $ret[] = update_sql("CREATE TABLE {search_dataset} (
sid int(10) unsigned NOT NULL default '0',
type varchar(16) default NULL,
data longtext NOT NULL,
KEY sid_type (sid, type)
)");

      $ret[] = update_sql("CREATE TABLE {search_index} (
word varchar(50) NOT NULL default '',
sid int(10) unsigned NOT NULL default '0',
type varchar(16) default NULL,
fromsid int(10) unsigned NOT NULL default '0',
fromtype varchar(16) default NULL,
score float default NULL,
KEY sid_type (sid, type),
KEY from_sid_type (fromsid, fromtype),
KEY word (word)
)");

      $ret[] = update_sql("CREATE TABLE {search_total} (
word varchar(50) NOT NULL default '',
count float default NULL,
PRIMARY KEY word (word)
)");
      break;

    case 'pgsql':
      // PostgreSQL has no inline KEY clause in CREATE TABLE (it is a MySQL
      // extension), so every secondary index is created with CREATE INDEX.
      $ret[] = update_sql("CREATE TABLE {search_dataset} (
sid integer NOT NULL default '0',
type varchar(16) default NULL,
data text NOT NULL default ''
)");
      $ret[] = update_sql("CREATE INDEX search_dataset_sid_type_idx ON {search_dataset}(sid, type)");

      $ret[] = update_sql("CREATE TABLE {search_index} (
word varchar(50) NOT NULL default '',
sid integer NOT NULL default '0',
type varchar(16) default NULL,
fromsid integer NOT NULL default '0',
fromtype varchar(16) default NULL,
score float default NULL
)");
      $ret[] = update_sql("CREATE INDEX search_index_sid_type_idx ON {search_index}(sid, type)");
      $ret[] = update_sql("CREATE INDEX search_index_from_sid_type_idx ON {search_index}(fromsid, fromtype)");
      $ret[] = update_sql("CREATE INDEX search_index_word_idx ON {search_index}(word)");

      $ret[] = update_sql("CREATE TABLE {search_total} (
word varchar(50) NOT NULL default '',
count float default NULL
)");
      $ret[] = update_sql("CREATE INDEX search_total_word_idx ON {search_total}(word)");
      break;

    default:
      break;
  }

  return $ret;
}
function update_sql($sql) {
$edit = $_POST["edit"];
$result = db_query($sql);
......
......@@ -238,6 +238,50 @@ function db_query_range($query) {
return _db_query($query);
}
/**
 * Runs a SELECT query and stores its results in a temporary table.
 *
 * Use this as a substitute for db_query() when the results need to be stored
 * in a temporary table. Temporary tables exist for the duration of the page
 * request.
 * User-supplied arguments to the query should be passed in as separate parameters
 * so that they can be properly escaped to avoid SQL injection attacks.
 *
 * Note that if you need to know how many results were returned, you should do
 * a SELECT COUNT(*) on the temporary table afterwards. db_num_rows() and
 * db_affected_rows() do not give consistent results across different database
 * types in this case.
 *
 * @param $query
 *   A string containing a normal SELECT SQL query. Leading whitespace before
 *   the SELECT keyword is allowed.
 * @param ...
 *   A variable number of arguments which are substituted into the query using
 *   printf() syntax. Instead of a variable number of query arguments, you may
 *   also pass a single array containing the query arguments.
 * @param $table
 *   The name of the temporary table to select into; always passed as the last
 *   argument. This name will not be prefixed as there is no risk of collision.
 *   It is inserted into the SQL as-is, so it must never come from user input.
 * @return
 *   A database query result resource, or FALSE if the query was not executed
 *   correctly.
 */
function db_query_temporary($query) {
  $args = func_get_args();
  // The temporary table name is always the final argument.
  $tablename = array_pop($args);
  // Rewrite the leading SELECT into CREATE TEMPORARY TABLE ... SELECT. Leading
  // whitespace is tolerated so that multi-line query strings are converted
  // too, instead of silently running as a plain SELECT.
  $query = preg_replace('/^\s*SELECT/i', 'CREATE TEMPORARY TABLE '. $tablename .' SELECT', db_prefix_tables($query));
  if (count($args) > 1) {
    // Check for array (alternative syntax).
    if (is_array($args[1])) {
      $args = array_merge(array($query), $args[1]);
    }
    $args = array_map('db_escape_string', $args);
    // Slot 0 holds the format string; restore the unescaped, rewritten query.
    $args[0] = $query;
    $query = call_user_func_array('sprintf', $args);
  }
  return _db_query($query);
}
/**
* Returns a properly formatted Binary Large OBject value.
*
......
......@@ -205,6 +205,11 @@ function db_affected_rows() {
* User-supplied arguments to the query should be passed in as separate parameters
* so that they can be properly escaped to avoid SQL injection attacks.
*
* Note that if you need to know how many results were returned, you should do
* a SELECT COUNT(*) on the temporary table afterwards. db_num_rows() and
* db_affected_rows() do not give consistent results across different database
* types in this case.
*
* @param $query
* A string containing an SQL query.
* @param ...
......@@ -238,6 +243,50 @@ function db_query_range($query) {
return _db_query($query);
}
/**
 * Runs a SELECT query and stores its results in a temporary table.
 *
 * Use this as a substitute for db_query() when the results need to be stored
 * in a temporary table. Temporary tables exist for the duration of the page
 * request.
 * User-supplied arguments to the query should be passed in as separate parameters
 * so that they can be properly escaped to avoid SQL injection attacks.
 *
 * Note that if you need to know how many results were returned, you should do
 * a SELECT COUNT(*) on the temporary table afterwards. db_num_rows() and
 * db_affected_rows() do not give consistent results across different database
 * types.
 *
 * @param $query
 *   A string containing a normal SELECT SQL query. Leading whitespace before
 *   the SELECT keyword is allowed.
 * @param ...
 *   A variable number of arguments which are substituted into the query using
 *   printf() syntax. Instead of a variable number of query arguments, you may
 *   also pass a single array containing the query arguments.
 * @param $table
 *   The name of the temporary table to select into; always passed as the last
 *   argument. This name will not be prefixed as there is no risk of collision.
 *   It is inserted into the SQL as-is, so it must never come from user input.
 * @return
 *   A database query result resource, or FALSE if the query was not executed
 *   correctly.
 */
function db_query_temporary($query) {
  $args = func_get_args();
  // The temporary table name is always the final argument.
  $tablename = array_pop($args);
  // Rewrite the leading SELECT into CREATE TEMPORARY TABLE ... SELECT. Leading
  // whitespace is tolerated so that multi-line query strings are converted
  // too, instead of silently running as a plain SELECT.
  $query = preg_replace('/^\s*SELECT/i', 'CREATE TEMPORARY TABLE '. $tablename .' SELECT', db_prefix_tables($query));
  if (count($args) > 1) {
    // Check for array (alternative syntax).
    if (is_array($args[1])) {
      $args = array_merge(array($query), $args[1]);
    }
    $args = array_map('db_escape_string', $args);
    // Slot 0 holds the format string; restore the unescaped, rewritten query.
    $args[0] = $query;
    $query = call_user_func_array('sprintf', $args);
  }
  return _db_query($query);
}
/**
* Returns a properly formatted Binary Large OBject value.
*
......
......@@ -223,6 +223,50 @@ function db_query_range($query) {
return _db_query($query);
}
/**
 * Runs a SELECT query and stores its results in a temporary table.
 *
 * Use this as a substitute for db_query() when the results need to be stored
 * in a temporary table. Temporary tables exist for the duration of the page
 * request.
 * User-supplied arguments to the query should be passed in as separate parameters
 * so that they can be properly escaped to avoid SQL injection attacks.
 *
 * Note that if you need to know how many results were returned, you should do
 * a SELECT COUNT(*) on the temporary table afterwards. db_num_rows() and
 * db_affected_rows() do not give consistent results across different database
 * types in this case.
 *
 * @param $query
 *   A string containing a normal SELECT SQL query. Leading whitespace before
 *   the SELECT keyword is allowed.
 * @param ...
 *   A variable number of arguments which are substituted into the query using
 *   printf() syntax. Instead of a variable number of query arguments, you may
 *   also pass a single array containing the query arguments.
 * @param $table
 *   The name of the temporary table to select into; always passed as the last
 *   argument. This name will not be prefixed as there is no risk of collision.
 *   It is inserted into the SQL as-is, so it must never come from user input.
 * @return
 *   A database query result resource, or FALSE if the query was not executed
 *   correctly.
 */
function db_query_temporary($query) {
  $args = func_get_args();
  // The temporary table name is always the final argument.
  $tablename = array_pop($args);
  // Rewrite the leading SELECT into CREATE TEMPORARY TABLE ... AS SELECT. The
  // SELECT keyword has to be kept in the replacement: CREATE TABLE AS must be
  // followed by a complete query. Leading whitespace is tolerated so that
  // multi-line query strings are converted too.
  $query = preg_replace('/^\s*SELECT/i', 'CREATE TEMPORARY TABLE '. $tablename .' AS SELECT', db_prefix_tables($query));
  if (count($args) > 1) {
    // Check for array (alternative syntax).
    if (is_array($args[1])) {
      $args = array_merge(array($query), $args[1]);
    }
    $args = array_map('db_escape_string', $args);
    // Slot 0 holds the format string; restore the unescaped, rewritten query.
    $args[0] = $query;
    $query = call_user_func_array('sprintf', $args);
  }
  return _db_query($query);
}
/**
* Returns a properly formatted Binary Large OBject value.
*
......
......@@ -445,6 +445,13 @@ img.screenshot {
.search-results .search-info {
font-size: 0.85em;
}
/* Advanced search form: criteria groups sit side by side, 2em apart. */
.search-advanced .criterium {
  margin-right: 2em;
  float: left;
}
/* The action row clears the floated criteria so it starts on its own line. */
.search-advanced .action {
  clear: left;
}
#tracker td.replies {
text-align: center;
}
......
......@@ -597,17 +597,112 @@ function node_search($op = 'search', $keys = null) {
switch ($op) {
case 'name':
return t('content');
case 'reset':
variable_del('node_cron_last');
return;
case 'status':
$last = variable_get('node_cron_last', 0);
$total = db_result(db_query('SELECT COUNT(*) FROM {node} WHERE status = 1 AND moderate = 0'));
$remaining = db_result(db_query('SELECT COUNT(*) FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND n.moderate = 0 AND (n.created > %d OR n.changed > %d OR c.last_comment_timestamp > %d)', $last, $last, $last));
return array('remaining' => $remaining, 'total' => $total);
case 'admin':
$form = array();
// Output form for defining rank factor weights.
$form['content_ranking'] = array('#type' => 'fieldset', '#title' => t('Content ranking'));
$form['content_ranking']['#theme'] = 'node_search_admin';
$form['content_ranking']['info'] = array('#type' => 'markup', '#value' => '<em>'. t('The following numbers control which properties the content search should favor when ordering the results. Higher numbers mean more influence. Zero means the property is ignored.') .'</em>');
$ranking = array('node_rank_relevance' => t('Keyword relevance'),
'node_rank_recent' => t('Recently posted'));
if (module_exist('comment')) {
$ranking['node_rank_comments'] = t('Number of comments');
}
if (module_exist('statistics') && variable_get('statistics_count_content_views', 0)) {
$ranking['node_rank_views'] = t('Number of views');
}
// Note: reversed to reflect that higher number = higher ranking.
$options = drupal_map_assoc(range(0, 10));
foreach ($ranking as $var => $title) {
$form['content_ranking']['factors'][$var] = array('#title' => $title, '#type' => 'select', '#options' => $options, '#default_value' => variable_get($var, 5));
}
return $form;
case 'search':
list($join, $where) = _db_rewrite_sql();
$find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join .' INNER JOIN {users} u ON n.uid = u.uid', 'n.status = 1'. (empty($where) ? '' : ' AND '. $where));
// Build matching conditions
list($join1, $where1) = _db_rewrite_sql();
$arguments1 = array();
$conditions1 = 'n.status = 1';
if ($type = search_query_extract($keys, 'type')) {
$types = array();
foreach (explode(',', $type) as $t) {
$types[] = "n.type = '%s'";
$arguments1[] = $t;
}
$conditions1 .= ' AND ('. implode(' OR ', $types) .')';
$keys = search_query_insert($keys, 'type');
}
if ($category = search_query_extract($keys, 'category')) {
$categories = array();
foreach (explode(',', $category) as $c) {
$categories[] = "tn.tid = %d";
$arguments1[] = $c;
}
$conditions1 .= ' AND ('. implode(' OR ', $categories) .')';
$join1 .= ' INNER JOIN {term_node} tn ON n.nid = tn.nid';
$keys = search_query_insert($keys, 'category');
}
// Build ranking expression (we try to map each parameter to a
// uniform distribution in the range 0..1).
$ranking = array();
$arguments2 = array();
$join2 = '';
// Used to avoid joining on node_comment_statistics twice
$stats_join = false;
if ($weight = (int)variable_get('node_rank_relevance', 5)) {
// Average relevance values hover around 0.15
$ranking[] = '%d * i.relevance';
$arguments2[] = $weight;
}
if ($weight = (int)variable_get('node_rank_recent', 5)) {
// Exponential decay with half-life of 6 months, starting at last indexed node
$ranking[] = '%d * POW(2, (GREATEST(n.created, n.changed, c.last_comment_timestamp) - %d) * 6.43e-8)';
$arguments2[] = $weight;
$arguments2[] = (int)variable_get('node_cron_last', 0);
$join2 .= ' INNER JOIN {node} n ON n.nid = i.sid LEFT JOIN {node_comment_statistics} c ON c.nid = i.sid';
$stats_join = true;
}
if (module_exist('comment') && $weight = (int)variable_get('node_rank_comments', 5)) {
// Inverse law that maps the highest reply count on the site to 1 and 0 to 0.
$scale = variable_get('node_cron_comments_scale', 0.0);
$ranking[] = '%d * (2.0 - 2.0 / (1.0 + c.comment_count * %f))';
$arguments2[] = $weight;
$arguments2[] = $scale;
if (!$stats_join) {
$join2 .= ' LEFT JOIN {node_comment_statistics} c ON c.nid = i.sid';
}
}
if (module_exist('statistics') && variable_get('statistics_count_content_views', 0) &&
$weight = (int)variable_get('node_rank_views', 5)) {
// Inverse law that maps the highest view count on the site to 1 and 0 to 0.
$scale = variable_get('node_cron_views_scale', 0.0);
$ranking[] = '%d * (2.0 - 2.0 / (1.0 + nc.totalcount * %f))';
$arguments2[] = $weight;
$arguments2[] = $scale;
$join2 .= ' LEFT JOIN {node_counter} nc ON n.nid = nc.nid';
}
$select2 = (count($ranking) ? implode(' + ', $ranking) : 'i.relevance') . ' AS score';
// Do search
$find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join1 .' INNER JOIN {users} u ON n.uid = u.uid', $conditions1 . (empty($where1) ? '' : ' AND '. $where1), $arguments1, $select2, $join2, $arguments2);
// Load results
$results = array();
foreach ($find as $item) {
$node = node_load($item);
......@@ -622,19 +717,86 @@ function node_search($op = 'search', $keys = null) {
// Allow modules to change $node->body before viewing.
node_invoke_nodeapi($node, 'view', false, false);
// Fetch comments for snippet
$node->body .= module_invoke('comment', 'nodeapi', $node, 'update index');
$extra = node_invoke_nodeapi($node, 'search result');
$results[] = array('link' => url('node/'. $item),
'type' => node_get_name($node),
'title' => $node->title,
'user' => theme('username', $node),
'date' => $node->changed,
'node' => $node,
'extra' => $extra,
'snippet' => search_excerpt($keys, $node->body));
}
return $results;
case 'form':
$form = array();
// Keyword boxes
$form['advanced'] = array('#type' => 'fieldset', '#title' => t('Advanced search'), '#collapsible' => true, '#collapsed' => true, '#attributes' => array('class' => 'search-advanced'));
$form['advanced']['keywords'] = array('#type' => 'markup', '#prefix' => '<div class="criterium">', '#suffix' => '</div>');
$form['advanced']['keywords']['or'] = array('#type' => 'textfield', '#title' => t('Containing any of the words'), '#size' => 30, '#maxlength' => 255);
$form['advanced']['keywords']['phrase'] = array('#type' => 'textfield', '#title' => t('Containing the phrase'), '#size' => 30, '#maxlength' => 255);
$form['advanced']['keywords']['negative'] = array('#type' => 'textfield', '#title' => t('Containing none of the words'), '#size' => 30, '#maxlength' => 255);
// Taxonomy box
if ($taxonomy = module_invoke('taxonomy', 'form_all')) {
$form['advanced']['category'] = array('#type' => 'select', '#title' => t('Only in the category'), '#prefix' => '<div class="criterium">', '#suffix' => '</div>', '#options' => $taxonomy, '#extra' => 'size="10"', '#multiple' => true);
}
// Node types
$types = node_get_types();
$form['advanced']['type'] = array('#type' => 'checkboxes', '#title' => t('Only of the type'), '#prefix' => '<div class="criterium">', '#suffix' => '</div>', '#options' => $types, '#multiple' => true);
$form['advanced']['submit'] = array('#type' => 'submit', '#value' => t('Advanced Search'), '#prefix' => '<div class="action">', '#suffix' => '</div>');
return $form;
case 'post':
// Insert extra restrictions into the search keywords string.
$edit = &$_POST['edit'];
if (is_array($edit['type'])) {
$keys = search_query_insert($keys, 'type', implode(',', array_keys($edit['type'])));
}
if (is_array($edit['category'])) {
$keys = search_query_insert($keys, 'category', implode(',', $edit['category']));
}
if ($edit['or'] != '') {
if (preg_match_all('/ ("[^"]+"|[^" ]+)/i', ' '. $edit['or'], $matches)) {
$keys = $keys .' '. implode(' OR ', $matches[1]);
}
}
if ($edit['negative'] != '') {
if (preg_match_all('/ ("[^"]+"|[^" ]+)/i', ' '. $edit['negative'], $matches)) {
$keys = $keys .' -'. implode(' -', $matches[1]);
}
}
if ($edit['phrase'] != '') {
$keys .= ' "'. str_replace('"', ' ', $edit['phrase']) .'"';
}
return trim($keys);
}
}
/**
 * Themes the content ranking settings as a "Factor / Weight" table.
 *
 * Renders the 'info' element first, then one table row per child of
 * $form['factors'] (title in the first column, the rendered element in the
 * second), and finally passes the form to form_render() once more.
 *
 * @param $form
 *   The form element to theme; expected to contain 'info' and 'factors'.
 * @return
 *   The rendered output as a string.
 */
function theme_node_search_admin($form) {
  $output = form_render($form['info']);

  $header = array(t('Factor'), t('Weight'));
  // Start with an empty list so theme('table') always receives an array, even
  // when there are no ranking factors to show.
  $rows = array();
  foreach (element_children($form['factors']) as $key) {
    $row = array();
    $row[] = $form['factors'][$key]['#title'];
    // The title already occupies the first column; remove it before rendering
    // the element itself into the second column.
    unset($form['factors'][$key]['#title']);
    $row[] = form_render($form['factors'][$key]);
    $rows[] = $row;
  }
  $output .= theme('table', $header, $rows);

  // NOTE(review): presumably form_render() skips elements that were already
  // rendered above, so this only outputs what is left -- confirm.
  $output .= form_render($form);
  return $output;
}
/**
* Menu callback; presents general node configuration options.
*/
......@@ -1864,6 +2026,10 @@ function node_update_index() {
$last = variable_get('node_cron_last', 0);
$limit = (int)variable_get('search_cron_limit', 100);
// Store the maximum possible comments per thread (used for ranking by reply count)
variable_set('node_cron_comments_scale', 1.0 / max(1, db_result(db_query('SELECT MAX(comment_count) FROM {node_comment_statistics}'))));
variable_set('node_cron_views_scale', 1.0 / max(1, db_result(db_query('SELECT MAX(totalcount) FROM {node_counter}'))));
$result = db_query_range('SELECT n.nid, c.last_comment_timestamp FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND n.moderate = 0 AND (n.created > %d OR n.changed > %d OR c.last_comment_timestamp > %d) ORDER BY GREATEST(n.created, n.changed, c.last_comment_timestamp) ASC', $last, $last, $last, 0, $limit);
while ($node = db_fetch_object($result)) {
......
......@@ -597,17 +597,112 @@ function node_search($op = 'search', $keys = null) {
switch ($op) {
case 'name':
return t('content');
case 'reset':
variable_del('node_cron_last');
return;
case 'status':
$last = variable_get('node_cron_last', 0);
$total = db_result(db_query('SELECT COUNT(*) FROM {node} WHERE status = 1 AND moderate = 0'));
$remaining = db_result(db_query('SELECT COUNT(*) FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND n.moderate = 0 AND (n.created > %d OR n.changed > %d OR c.last_comment_timestamp > %d)', $last, $last, $last));
return array('remaining' => $remaining, 'total' => $total);
case 'admin':
$form = array();
// Output form for defining rank factor weights.
$form['content_ranking'] = array('#type' => 'fieldset', '#title' => t('Content ranking'));
$form['content_ranking']['#theme'] = 'node_search_admin';
$form['content_ranking']['info'] = array('#type' => 'markup', '#value' => '<em>'. t('The following numbers control which properties the content search should favor when ordering the results. Higher numbers mean more influence. Zero means the property is ignored.') .'</em>');
$ranking = array('node_rank_relevance' => t('Keyword relevance'),
'node_rank_recent' => t('Recently posted'));
if (module_exist('comment')) {
$ranking['node_rank_comments'] = t('Number of comments');
}
if (module_exist('statistics') && variable_get('statistics_count_content_views', 0)) {
$ranking['node_rank_views'] = t('Number of views');
}
// Note: reversed to reflect that higher number = higher ranking.
$options = drupal_map_assoc(range(0, 10));
foreach ($ranking as $var => $title) {
$form['content_ranking']['factors'][$var] = array('#title' => $title, '#type' => 'select', '#options' => $options, '#default_value' => variable_get($var, 5));
}
return $form;
case 'search':
list($join, $where) = _db_rewrite_sql();
$find = do_search($keys, 'node', 'INNER JOIN {node} n ON n.nid = i.sid '. $join .' INNER JOIN {users} u ON n.uid = u.uid', 'n.status = 1'. (empty($where) ? '' : ' AND '. $where));
// Build matching conditions
list($join1, $where1) = _db_rewrite_sql();
$arguments1 = array();
$conditions1 = 'n.status = 1';
if ($type = search_query_extract($keys, 'type')) {
$types = array();
foreach (explode(',', $type) as $t) {
$types[] = "n.type = '%s'";
$arguments1[] = $t;
}
$conditions1 .= ' AND ('. implode(' OR ', $types) .')';
$keys = search_query_insert($keys, 'type');
}
if ($category = search_query_extract($keys, 'category')) {
$categories = array();
foreach (explode(',', $category) as $c) {
$categories[] = "tn.tid = %d";
$arguments1[] = $c;
}