/**
 * Matches Unicode character classes to exclude from the search index.
 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The index only contains the following character classes:
 * Lu  Letter, Uppercase
 * Ll  Letter, Lowercase
 * Lt  Letter, Titlecase
 * Lo  Letter, Other
 * Nd  Number, Decimal Digit
 * No  Number, Other
 *
 * The value is the inside of a PCRE character class; it is meant to be wrapped
 * in '[' ... ']' and used with the /u modifier.
 *
 * The private-use range starts at U+E000 rather than U+D800: surrogate code
 * points cannot occur in valid UTF-8 subjects, and PCRE refuses to compile
 * \x{d800}-\x{dfff} in UTF mode.
 */
define('PREG_CLASS_SEARCH_EXCLUDE',
'\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
'\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
'\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
'\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
'\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
'\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
'\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
'\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
'\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
'\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
'\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
'\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
'\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
'\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
'\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
'\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
'\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
'\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
'\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
'\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
'\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
'\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
'\x{3099}-\x{309e}\x{30a0}\x{30fb}\x{30fd}\x{30fe}\x{3190}-\x{319f}\x{31c0}-'.
'\x{31cf}\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}'.
'\x{a806}\x{a80b}\x{a823}-\x{a82b}\x{e000}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}'.
'\x{fd3f}\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}'.
'\x{ff5b}-\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
/**
 * Matches all 'N' Unicode character classes (numbers)
 *
 * The value is the inside of a PCRE character class; wrap it in '[' ... ']'
 * and use the /u modifier.
 */
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
/**
 * Matches all 'P' Unicode character classes (punctuation)
 *
 * The value is the inside of a PCRE character class; wrap it in '[' ... ']'
 * and use the /u modifier.
 */
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
'\x{ff65}');
/**
 * Matches all CJK characters that are candidates for auto-splitting
 * (Chinese, Japanese, Korean).
 *
 * Contains kana and BMP ideographs. The value is the inside of a PCRE
 * character class: search_simplify() wraps it in '[' ... ']+' with the /u
 * modifier and hands each matched run to search_expand_cjk().
 */
define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
'\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
Gábor Hojtsy
committed
function search_help($path, $arg) {
switch ($path) {
Dries Buytaert
committed
case 'admin/help#search':
Gábor Hojtsy
committed
$output = '<p>'. t('The search module adds the ability to search for content by keywords. Search is often the only practical way to find content on a large site, and is useful for finding both users and posts.') .'</p>';
$output .= '<p>'. t('To provide keyword searching, the search engine maintains an index of words found in your site\'s content. To build and maintain this index, a correctly configured <a href="@cron">cron maintenance task</a> is required. Indexing behavior can be adjusted using the <a href="@searchsettings">search settings page</a>; for example, the <em>Number of items to index per cron run</em> sets the maximum number of items indexed in each pass of a <a href="@cron">cron maintenance task</a>. If necessary, reduce this number to prevent timeouts and memory errors when indexing.', array('@cron' => url('admin/reports/status'), '@searchsettings' => url('admin/settings/search'))) .'</p>';
Gábor Hojtsy
committed
$output .= '<p>'. t('For more information, see the online handbook entry for <a href="@search">Search module</a>.', array('@search' => 'http://drupal.org/handbook/modules/search/')) .'</p>';
Dries Buytaert
committed
return $output;
case 'admin/settings/search':
Gábor Hojtsy
committed
return '<p>'. t('The search engine maintains an index of words found in your site\'s content. To build and maintain this index, a correctly configured <a href="@cron">cron maintenance task</a> is required. Indexing behavior can be adjusted using the settings below.', array('@cron' => url('admin/reports/status'))) .'</p>';
case 'search#noresults':
return t('<ul>
<li>Check if your spelling is correct.</li>
<li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
<li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
Dries Buytaert
committed
/**
 * Implementation of hook_theme()
 *
 * @return
 *   An array keyed by theme hook name. Each entry lists the hook's arguments
 *   and template; the two result hooks also name the include file that holds
 *   their code.
 */
function search_theme() {
  return array(
    'search_theme_form' => array(
      'arguments' => array('form' => NULL),
      'template' => 'search-theme-form',
    ),
    'search_block_form' => array(
      'arguments' => array('form' => NULL),
      'template' => 'search-block-form',
    ),
    'search_result' => array(
      'arguments' => array('result' => NULL, 'type' => NULL),
      'file' => 'search.pages.inc',
      'template' => 'search-result',
    ),
    'search_results' => array(
      'arguments' => array('results' => NULL, 'type' => NULL),
      'file' => 'search.pages.inc',
      'template' => 'search-results',
    ),
  );
}
return array('search content', 'use advanced search', 'administer search');
/**
 * Implementation of hook_block().
 *
 * @param $op
 *   The operation: 'list' returns the block definitions, 'view' returns the
 *   rendered block. Any other value yields no return value.
 * @param $delta
 *   Unused; this module defines a single block.
 * @return
 *   For 'list', an array of block definitions. For 'view', when the current
 *   user has the 'search content' permission, an array with the block's
 *   'content' and 'subject'.
 */
function search_block($op = 'list', $delta = 0) {
  if ($op == 'list') {
    $blocks = array();
    $blocks[0]['info'] = t('Search form');
    // Not worth caching.
    $blocks[0]['cache'] = BLOCK_NO_CACHE;
    // Without this return the definitions built above never reach the caller.
    return $blocks;
  }
  else if ($op == 'view' && user_access('search content')) {
    $block = array();
    $block['content'] = drupal_get_form('search_block_form');
    $block['subject'] = t('Search');
    return $block;
  }
}
Dries Buytaert
committed
/**
 * Implementation of hook_menu().
 *
 * Registers the search page, the settings form, the index-wipe confirmation
 * form, the search phrases report, and one local task under 'search' for
 * every module implementing hook_search().
 *
 * @return
 *   An array of menu items keyed by path.
 */
function search_menu() {
  $items = array();
  $items['search'] = array(
    'title' => 'Search',
    'page callback' => 'search_view',
    'access arguments' => array('search content'),
    'type' => MENU_SUGGESTED_ITEM,
    'file' => 'search.pages.inc',
  );
  $items['admin/settings/search'] = array(
    'title' => 'Search settings',
    'description' => 'Configure relevance settings for search and other indexing options',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('search_admin_settings'),
    'access arguments' => array('administer search'),
    'type' => MENU_NORMAL_ITEM,
    'file' => 'search.admin.inc',
  );
  $items['admin/settings/search/wipe'] = array(
    'title' => 'Clear index',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('search_wipe_confirm'),
    'access arguments' => array('administer search'),
    'type' => MENU_CALLBACK,
    'file' => 'search.admin.inc',
  );
  // The report page is served by dblog's callback, so its include file is
  // resolved relative to the dblog module directory.
  $items['admin/reports/search'] = array(
    'title' => 'Top search phrases',
    'description' => 'View most popular search phrases.',
    'page callback' => 'dblog_top',
    'page arguments' => array('search'),
    'file' => 'dblog.admin.inc',
    'file path' => drupal_get_path('module', 'dblog'),
  );

  // One tab per module that implements hook_search().
  foreach (module_implements('search') as $name) {
    $items['search/'. $name .'/%menu_tail'] = array(
      'title callback' => 'module_invoke',
      'title arguments' => array($name, 'search', 'name', TRUE),
      'page callback' => 'search_view',
      'page arguments' => array($name),
      'access callback' => '_search_menu',
      'access arguments' => array($name),
      'type' => MENU_LOCAL_TASK,
      'parent' => 'search',
      'file' => 'search.pages.inc',
    );
  }
  return $items;
}
/**
 * Access callback for the per-module search tabs.
 *
 * @param $name
 *   The name of the module whose search tab is being checked.
 * @return
 *   A truthy value when the current user has the 'search content' permission
 *   and the module's hook_search('name') returns a non-empty value.
 */
function _search_menu($name) {
  return user_access('search content') && module_invoke($name, 'search', 'name');
}
/**
 * Wipes a part of or the entire search index.
 *
 * @param $sid
 *  (optional) The SID of the item to wipe. If specified, $type must be passed
 *  too.
 * @param $type
 *  (optional) The type of item to wipe.
 * @param $reindex
 *  (optional) When TRUE, the item's rows in {search_node_links} are kept so
 *  that link data survives re-indexing. Defaults to FALSE.
 */
function search_wipe($sid = NULL, $type = NULL, $reindex = FALSE) {
  if ($type == NULL && $sid == NULL) {
    // No item given: ask every module implementing hook_search() to reset.
    module_invoke_all('search', 'reset');
  }
  else {
    db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
    // Don't remove links if re-indexing.
    if (!$reindex) {
      db_query("DELETE FROM {search_node_links} WHERE sid = %d AND type = '%s'", $sid, $type);
    }
  }
}
/**
 * Records a word as dirty, or hands back every word recorded so far.
 *
 * Dirty words are collected while indexing; their totals in the search_total
 * table are out of date and have to be recounted.
 *
 * @param $word
 *   (optional) The word to flag. Leave out (or pass NULL) to read the list.
 * @return
 *   When called without a word, an array whose keys are the dirty words (each
 *   mapped to TRUE). When called with a word, nothing is returned.
 */
function search_dirty($word = NULL) {
  // The registry lives for the duration of the request.
  static $dirty = array();

  if ($word === NULL) {
    return $dirty;
  }
  $dirty[$word] = TRUE;
}
* Fires hook_update_index() in all modules and cleans up dirty words (see
* search_dirty).
Steven Wittens
committed
// We register a shutdown function to ensure that search_total is always up
// to date.
register_shutdown_function('search_update_totals');
module_invoke($module, 'update_index');
}
Steven Wittens
committed
}
/**
 * This function is called on shutdown to ensure that search_total is always
 * up to date (even if cron times out or otherwise fails).
 *
 * For every word flagged through search_dirty() the summed score is read from
 * {search_index}, damped, and written to {search_total}. Words that no longer
 * occur in {search_index} are then removed from {search_total}.
 */
function search_update_totals() {
  // Update word IDF (Inverse Document Frequency) counts for new/changed words
  foreach (search_dirty() as $word => $dummy) {
    // Get total count
    $total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
    // Apply Zipf's law to equalize the probability distribution
    $total = log10(1 + 1/(max(1, $total)));
    db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $total, $word);
    // No row was changed by the UPDATE, so try to create one. Errors are
    // suppressed because the row may already exist with an identical count.
    if (!db_affected_rows()) {
      @db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $total);
    }
  }
  // Find words that were deleted from search_index, but are still in
  // search_total. We use a LEFT JOIN between the two tables and keep only the
  // rows which fail to join.
  $result = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
  while ($word = db_fetch_object($result)) {
    db_query("DELETE FROM {search_total} WHERE word = '%s'", $word->realword);
  }
}
/**
 * Simplifies a string according to indexing rules.
 *
 * @param $text
 *   The text to simplify.
 * @return
 *   The text with entities decoded, lowercased, run through the preprocess
 *   hooks, with CJK runs expanded (when 'overlap_cjk' is enabled), numeric
 *   groups joined, dots/underscores/dashes removed and every run of excluded
 *   characters collapsed to a single space.
 */
function search_simplify($text) {
  // Decode entities to UTF-8
  $text = decode_entities($text);

  // Lowercase
  $text = drupal_strtolower($text);

  // Call an external processor for word handling.
  search_invoke_preprocess($text);

  // Simple CJK handling
  if (variable_get('overlap_cjk', TRUE)) {
    $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'search_expand_cjk', $text);
  }

  // To improve searching for numerical data such as dates, IP addresses
  // or version numbers, we consider a group of numerical characters
  // separated only by punctuation characters to be one piece.
  // This also means that searching for e.g. '20/03/1984' also returns
  // results with '20-03-1984' in them.
  // Readable regexp: ([number]+)[punctuation]+(?=[number])
  $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);

  // The dot, underscore and dash are simply removed. This allows meaningful
  // search behavior with acronyms and URLs.
  $text = preg_replace('/[._-]+/', '', $text);

  // With the exception of the rules above, we consider all punctuation,
  // marks, spacers, etc, to be a word boundary.
  $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE .']+/u', ' ', $text);

  return $text;
}
/**
 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
 * sequences of characters ('minimum_word_size' long).
 *
 * Used as a preg_replace_callback() callback by search_simplify().
 *
 * @param $matches
 *   The match array; only element 0 (the matched run) is read.
 * @return
 *   The replacement text: a space, then each token followed by a space. Runs
 *   no longer than 'minimum_word_size' are returned whole, padded with spaces.
 */
function search_expand_cjk($matches) {
  $min = variable_get('minimum_word_size', 3);
  $str = $matches[0];
  $l = drupal_strlen($str);
  // Passthrough short words. Without this return a run shorter than $min
  // would produce no token at all in the loop below and vanish from the text.
  if ($l <= $min) {
    return ' '. $str .' ';
  }
  $tokens = ' ';
  // FIFO queue of characters
  $chars = array();
  // Begin loop
  for ($i = 0; $i < $l; ++$i) {
    // Grab next character
    $current = drupal_substr($str, 0, 1);
    // Drop that character from the front; byte-wise functions are used here
    // on purpose, with strlen() giving the character's byte length.
    $str = substr($str, strlen($current));
    $chars[] = $current;
    if ($i >= $min - 1) {
      $tokens .= implode('', $chars) .' ';
      array_shift($chars);
    }
  }
  return $tokens;
}
/**
 * Splits a string into tokens for indexing.
 *
 * The result for the most recent input is remembered, so that calling the
 * function twice in a row with the same text does the work only once.
 *
 * @param $text
 *   The text to split.
 * @return
 *   An array of words: the simplified text exploded on single spaces, after
 *   _search_index_truncate() has been applied to each element.
 */
function search_index_split($text) {
  static $last = NULL;
  static $lastsplit = NULL;

  // Strict comparison: with '==' an empty string matched the initial NULL and
  // the function returned NULL instead of an array, and numeric-looking
  // strings such as '1e3' and '1000' were treated as the same input.
  if ($last === $text) {
    return $lastsplit;
  }
  // Process words
  $words = explode(' ', search_simplify($text));
  array_walk($words, '_search_index_truncate');

  // Save last keyword result. The key is the text as passed in (not its
  // simplified form), so a repeated call with the same argument is a hit.
  $last = $text;
  $lastsplit = $words;

  return $words;
}
* Helper function for array_walk in search_index_split.
function _search_index_truncate(&$text) {
/**
 * Invokes hook_search_preprocess() in modules.
 *
 * @param $text
 *   The text to preprocess, passed by reference. Each implementing module
 *   receives the current text and its return value replaces it.
 */
function search_invoke_preprocess(&$text) {
  foreach (module_implements('search_preprocess') as $module) {
    $text = module_invoke($module, 'search_preprocess', $text);
  }
}
/**
 * Update the full-text search index for a particular item.
 *
 * @param $sid
 *   A number identifying this particular item (e.g. node id).
 *
 * @param $type
 *   A string defining this type of item (e.g. 'node')
 *
 * @param $text
 *   The content of this item. Must be a piece of HTML text.
 *
 * @ingroup search
 */
function search_index($sid, $type, $text) {
$minimum_word_size = variable_get('minimum_word_size', 3);
// Link matching
global $base_url;
$node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/|'. preg_quote(base_path(), '@') .')(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';
// Multipliers for scores of words inside certain HTML tags.
// Note: 'a' must be included for link ranking to work.
$tags = array('h1' => 25,
'h2' => 18,
'h3' => 15,
'h4' => 12,
'h5' => 9,
'h6' => 6,
'u' => 3,
'b' => 3,
'i' => 3,
'strong' => 3,
'em' => 3,
'a' => 10);
// Strip off all ignored tags to speed up processing, but insert space before/after
// them to keep word boundaries.
$text = str_replace(array('<', '>'), array(' <', '> '), $text);
$text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');
// Split HTML tags from plain text.
$split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
// Note: PHP ensures the array consists of alternating delimiters and literals
// and begins and ends with a literal (inserting $null as required).
$tag = FALSE; // Odd/even counter. Tag or no tag.
$link = FALSE; // State variable for link analyser
$score = 1; // Starting score per word
$accum = ' '; // Accumulator for cleaned up data
$tagstack = array(); // Stack with open tags
$tagwords = 0; // Counter for consecutive words
$focus = 1; // Focus state
$results = array(0 => array()); // Accumulator for words for index
foreach ($split as $value) {
if ($tag) {
// Increase or decrease score per word based on tag
list($tagname) = explode(' ', $value, 2);
Steven Wittens
committed
$tagname = drupal_strtolower($tagname);
// Closing or opening tag?
if ($tagname[0] == '/') {
$tagname = substr($tagname, 1);
// If we encounter unexpected tags, reset score to avoid incorrect boosting.
if (!count($tagstack) || $tagstack[0] != $tagname) {
$tagstack = array();
$score = 1;
}
else {
// Remove from tag stack and decrement score
$score = max(1, $score - $tags[array_shift($tagstack)]);
}
if ($tagname == 'a') {
$link = FALSE;
Steven Wittens
committed
if (isset($tagstack[0]) && $tagstack[0] == $tagname) {
// None of the tags we look for make sense when nested identically.
// If they are, it's probably broken HTML.
$tagstack = array();
}
else {
// Add to open tag stack and increment score
array_unshift($tagstack, $tagname);
$score += $tags[$tagname];
}
if ($tagname == 'a') {
// Check if link points to a node on this site
if (preg_match($node_regexp, $value, $match)) {
$path = drupal_get_normal_path($match[1]);
if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
$linknid = $match[1];
if ($linknid > 0) {
// Note: ignore links to uncachable nodes to avoid redirect bugs.
$node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
if (filter_format_allowcache($node->format)) {
$link = TRUE;
$linktitle = $node->title;
}
// A tag change occurred, reset counter.
$tagwords = 0;
}
else {
// Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
if ($value != '') {
if ($link) {
// Check to see if the node link text is its URL. If so, we use the target node title instead.
if (preg_match('!^https?://!i', $value)) {
$value = $linktitle;
}
}
$words = search_index_split($value);
foreach ($words as $word) {
// Add word to accumulator
$accum .= $word .' ';
$num = is_numeric($word);
// Check wordlength
if ($num || drupal_strlen($word) >= $minimum_word_size) {
// Normalize numbers
if ($num) {
$word = (int)ltrim($word, '-0');
}
Dries Buytaert
committed
// Links score mainly for the target.
if ($link) {
if (!isset($results[$linknid])) {
$results[$linknid] = array();
}
Dries Buytaert
committed
$results[$linknid][] = $word;
// Reduce score of the link caption in the source.
$focus *= 0.2;
Dries Buytaert
committed
// Fall-through
if (!isset($results[0][$word])) {
$results[0][$word] = 0;
Dries Buytaert
committed
$results[0][$word] += $score * $focus;
// Focus is a decaying value in terms of the amount of unique words up to this point.
// From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
$focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
$tagwords++;
// Too many words inside a single tag probably mean a tag was accidentally left open.
if (count($tagstack) && $tagwords >= 15) {
$tagstack = array();
$score = 1;
}
$tag = !$tag;
search_wipe($sid, $type, TRUE);
// Insert cleaned up data into dataset
Dries Buytaert
committed
db_query("INSERT INTO {search_dataset} (sid, type, data, reindex) VALUES (%d, '%s', '%s', %d)", $sid, $type, $accum, 0);
// Insert results into search index
foreach ($results[0] as $word => $score) {
// Try inserting first because this will succeed most times, but because
// the database collates similar words (accented and non-accented), the
// insert can fail, in which case we need to add the word scores together.
@db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
if (!db_affected_rows()) {
db_query("UPDATE {search_index} SET score = score + %f WHERE word = '%s' AND sid = %d AND type = '%s'", $score, $word, $sid, $type);
}
search_dirty($word);
}
unset($results[0]);
Dries Buytaert
committed
// Get all previous links from this item.
$result = db_query("SELECT nid, caption FROM {search_node_links} WHERE sid = %d AND type = '%s'", $sid, $type);
$links = array();
while ($link = db_fetch_object($result)) {
$links[$link->nid] = $link->caption;
}
// Now store links to nodes.
foreach ($results as $nid => $words) {
Dries Buytaert
committed
$caption = implode(' ', $words);
if (isset($links[$nid])) {
if ($links[$nid] != $caption) {
// Update the existing link and mark the node for reindexing.
db_query("UPDATE {search_node_links} SET caption = '%s' WHERE sid = %d AND type = '%s' AND nid = %d", $caption, $sid, $type, $nid);
search_touch_node($nid);
}
// Unset the link to mark it as processed.
unset($links[$nid]);
}
else {
// Insert the existing link and mark the node for reindexing.
db_query("INSERT INTO {search_node_links} (caption, sid, type, nid) VALUES ('%s', %d, '%s', %d)", $caption, $sid, $type, $nid);
search_touch_node($nid);
Dries Buytaert
committed
// Any left-over links in $links no longer exist. Delete them and mark the nodes for reindexing.
Gábor Hojtsy
committed
foreach ($links as $nid => $caption) {
Dries Buytaert
committed
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
db_query("DELETE FROM {search_node_links} WHERE sid = %d AND type = '%s' AND nid = %d", $sid, $type, $nid);
search_touch_node($nid);
}
}
/**
 * Flags a node for reindexing.
 *
 * Writes the current time into the 'reindex' column of the node's row in
 * {search_dataset}.
 *
 * @param $nid
 *   The nid of the node that needs reindexing.
 */
function search_touch_node($nid) {
  $now = time();
  db_query("UPDATE {search_dataset} SET reindex = %d WHERE sid = %d AND type = 'node'", $now, $nid);
}
/**
 * Implementation of hook_nodeapi().
 *
 * @param $node
 *   The node being acted on; only its nid is read.
 * @param $op
 *   The operation. 'update index' and 'update' are handled, anything else is
 *   ignored.
 * @param $teaser
 *   Unused.
 * @param $page
 *   Unused.
 * @return
 *   For 'update index', when captions of links pointing at this node are
 *   stored in {search_node_links}: those captions, comma-separated, in
 *   parentheses and wrapped in an <a> element. Otherwise nothing.
 */
function search_nodeapi(&$node, $op, $teaser = NULL, $page = NULL) {
  switch ($op) {
    // Transplant links to a node into the target node.
    case 'update index':
      $result = db_query("SELECT caption FROM {search_node_links} WHERE nid = %d", $node->nid);
      $output = array();
      while ($link = db_fetch_object($result)) {
        $output[] = $link->caption;
      }
      if (count($output)) {
        return '<a>('. implode(', ', $output) .')</a>';
      }
      break;
    // Reindex the node when it is updated. The node is automatically indexed
    // when it is added, simply by being added to the node table.
    case 'update':
      search_touch_node($node->nid);
      break;
  }
}
/**
 * Implementation of hook_comment().
 *
 * @param $a1
 *   The comment, either as an array with a 'nid' key or as an object with a
 *   nid property.
 * @param $op
 *   The operation performed on the comment. Operations other than the ones
 *   listed below are ignored.
 */
function search_comment($a1, $op) {
  switch ($op) {
    // Reindex the node when comments are added or changed
    case 'insert':
    case 'update':
    case 'delete':
    case 'publish':
    case 'unpublish':
      search_touch_node(is_array($a1) ? $a1['nid'] : $a1->nid);
      break;
  }
}
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
/**
 * Extract a module-specific search option from a search query. e.g. 'type:book'
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The option name to look for (the part before the colon). It is matched
 *   literally and case-insensitively.
 * @return
 *   The text following 'option:' up to the next space (possibly an empty
 *   string), or NULL when the option does not occur in $keys.
 */
function search_query_extract($keys, $option) {
  // Quote the option so that characters such as '.' or '/' in its name are
  // matched literally instead of being interpreted as regular expression
  // syntax (or terminating the pattern).
  $pattern = '/(^| )'. preg_quote($option, '/') .':([^ ]*)( |$)/i';
  if (preg_match($pattern, $keys, $matches)) {
    return $matches[2];
  }
  return NULL;
}
/**
 * Return a query with the given module-specific search option inserted in.
 * e.g. 'type:book'.
 *
 * Every existing occurrence of the option is removed first, so the result
 * holds the option at most once.
 *
 * @param $keys
 *   The search query string.
 * @param $option
 *   The option name (the part before the colon), matched literally and
 *   case-insensitively.
 * @param $value
 *   (optional) The value to set. When it compares equal to '' the option is
 *   only removed.
 * @return
 *   The updated query string.
 */
function search_query_insert($keys, $option, $value = '') {
  $pattern = '/(^| )'. preg_quote($option, '/') .':[^ ]*/i';
  // Test for the option's presence rather than for a truthy extracted value:
  // 'type:0' and 'type:' must be replaced too, not left in place next to the
  // newly appended option.
  if (preg_match($pattern, $keys)) {
    $keys = trim(preg_replace($pattern, '', $keys));
  }
  if ($value != '') {
    $keys .= ' '. $option .':'. $value;
  }
  return $keys;
}
/**
 * Parse a search query into SQL conditions.
 *
 * We build two queries that match the dataset bodies. See do_search() for
 * more about these.
 *
 * @param $text
 *   The search keys.
 * @return
 *   NULL when $text contains no keywords. Otherwise a list of seven elements.
 *    * A series of statements AND'd together which will be used to provide all
 *      possible matches.
 *    * Arguments for this query part.
 *    * A series of exact word matches OR'd together.
 *    * Arguments for this query part.
 *    * The match count that do_search() passes to its
 *      "HAVING COUNT(*) >= %d" condition.
 *    * A bool indicating whether this is a simple query or not. Negative
 *      terms, presence of both AND / OR make this FALSE.
 *    * A warning string: the literal 'or', 'and' or 'AND' token found in the
 *      query (maybe the user wanted to use OR), or '' when there is none.
 */
function search_parse_query($text) {
$keys = array('positive' => array(), 'negative' => array());
// Tokenize query string
preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);
if (count($matches) < 1) {
return NULL;
}
// Classify tokens
$or = FALSE;
Gábor Hojtsy
committed
$warning = '';
Dries Buytaert
committed
$simple = TRUE;
foreach ($matches as $match) {
$phrase = FALSE;
if ($match[2]{0} == '"') {
$match[2] = substr($match[2], 1, -1);
$phrase = TRUE;
Dries Buytaert
committed
$simple = FALSE;
// Simplify keyword according to indexing rules and external preprocessors
Steven Wittens
committed
$words = search_simplify($match[2]);
// Re-explode in case simplification added more words, except when matching a phrase
$words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
// Negative matches
if ($match[1] == '-') {
Steven Wittens
committed
$keys['negative'] = array_merge($keys['negative'], $words);
}
// OR operator: instead of a single keyword, we store an array of all
// OR'd keywords.
elseif ($match[2] == 'OR' && count($keys['positive'])) {
$last = array_pop($keys['positive']);
// Starting a new OR?
if (!is_array($last)) {
$last = array($last);
}
$keys['positive'][] = $last;
$or = TRUE;
continue;
}
Gábor Hojtsy
committed
// AND operator: implied, so just ignore it
elseif ($match[2] == 'AND' || $match[2] == 'and') {
$warning = $match[2];
continue;
}
// Plain keyword
else {
Dries Buytaert
committed
if ($match[2] == 'or') {
Gábor Hojtsy
committed
$warning = $match[2];
Dries Buytaert
committed
}
Steven Wittens
committed
// Add to last element (which is an array)
$keys['positive'][count($keys['positive']) - 1] = array_merge($keys['positive'][count($keys['positive']) - 1], $words);
Steven Wittens
committed
$keys['positive'] = array_merge($keys['positive'], $words);
$or = FALSE;
}
// Convert keywords into SQL statements.
$query = array();
$query2 = array();
$arguments = array();
$arguments2 = array();
Dries Buytaert
committed
$simple_and = FALSE;
$simple_or = FALSE;
// Positive matches
foreach ($keys['positive'] as $key) {
// Group of ORed terms
if (is_array($key) && count($key)) {
Dries Buytaert
committed
$simple_or = TRUE;
$queryor = array();
$any = FALSE;
foreach ($key as $or) {
list($q, $num_new_scores) = _search_parse_query($or, $arguments2);
$any |= $num_new_scores;
if ($q) {
$queryor[] = $q;
$arguments[] = $or;
}
}
if (count($queryor)) {
$query[] = '('. implode(' OR ', $queryor) .')';
// A group of OR keywords only needs to match once
$matches += ($any > 0);
}
}
// Single ANDed term
else {
Dries Buytaert
committed
$simple_and = TRUE;
list($q, $num_new_scores, $num_valid_words) = _search_parse_query($key, $arguments2);
if ($q) {
$query[] = $q;
$arguments[] = $key;
if (!$num_valid_words) {
$simple = FALSE;
}
$matches += $num_new_scores;
}
}
}
Dries Buytaert
committed
if ($simple_and && $simple_or) {
$simple = FALSE;
}
foreach ($keys['negative'] as $key) {
list($q) = _search_parse_query($key, $arguments2, TRUE);
if ($q) {
$query[] = $q;
$arguments[] = $key;
Dries Buytaert
committed
$simple = FALSE;
}
}
$query = implode(' AND ', $query);
$query2 = substr(str_repeat("i.word = '%s' OR ", count($arguments2)), 0, -4);
Gábor Hojtsy
committed
return array($query, $arguments, $query2, $arguments2, $matches, $simple, $warning);
}
/**
* Helper function for search_parse_query();
*/
function _search_parse_query(&$word, &$scores, $not = FALSE) {
$num_new_scores = 0;
$num_valid_words = 0;
// Determine the scorewords of this word/phrase
if (!$not) {
$split = explode(' ', $word);
foreach ($split as $s) {
$num = is_numeric($s);
if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 3)) {
$s = $num ? ((int)ltrim($s, '-0')) : $s;
if (!isset($scores[$s])) {
$scores[$s] = $s;
$num_new_scores++;
$num_valid_words++;
}
}
}
return array("d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'", $num_new_scores, $num_valid_words);
* Do a query on the full-text search index for a word or words.
* This function is normally only called by each module that support the
* indexed search (and thus, implements hook_update_index()).
Dries Buytaert
committed
* Results are retrieved in two logical passes. However, the two passes are
* joined together into a single query. And in the case of most simple
* queries the second pass is not even used.
Dries Buytaert
committed
* The first pass selects a set of all possible matches, which has the benefit
* of also providing the exact result set for simple "AND" or "OR" searches.
Dries Buytaert
committed
* The second portion of the query further refines this set by verifying
 * advanced text conditions (such as negative or phrase matches).
* @param $keywords
* A search string as entered by the user.
*
* @param $type
* A string identifying the calling module.
* @param $join1
* (optional) Inserted into the JOIN part of the first SQL query.
* For example "INNER JOIN {node} n ON n.nid = i.sid".
*
* @param $where1
* (optional) Inserted into the WHERE part of the first SQL query.
* For example "(n.status > %d)".
*
* @param $arguments1
* (optional) Extra SQL arguments belonging to the first query.
*
Dries Buytaert
committed
* @param $columns2
 *   (optional) Inserted into the SELECT part of the second query. Must contain
* a column selected as 'score'.
* defaults to 'i.relevance AS score'
* @param $join2
 *   (optional) Inserted into the JOIN part of the second SQL query.
* For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
*
* @param $arguments2
* (optional) Extra SQL arguments belonging to the second query parameter.
*
* @param $sort_parameters
* (optional) SQL arguments for sorting the final results.
* Default: 'ORDER BY score DESC'
*
* @return
Gábor Hojtsy
committed
* An array of objects for the search results.
*
* @ingroup search
Dries Buytaert
committed
function do_search($keywords, $type, $join1 = '', $where1 = '1 = 1', $arguments1 = array(), $columns2 = 'i.relevance AS score', $join2 = '', $arguments2 = array(), $sort_parameters = 'ORDER BY score DESC') {
$query = search_parse_query($keywords);
form_set_error('keys', t('You must include at least one positive keyword with @count characters or more.', array('@count' => variable_get('minimum_word_size', 3))));
Dries Buytaert
committed
if ($query[6]) {
Gábor Hojtsy
committed
if ($query[6] == 'or') {
drupal_set_message(t('Search for either of the two terms with uppercase <strong>OR</strong>. For example, <strong>cats OR dogs</strong>.'));
}
Dries Buytaert
committed
}
if ($query === NULL || $query[0] == '' || $query[2] == '') {
return array();
Dries Buytaert
committed
// Build query for keyword normalization.
$conditions = "$where1 AND ($query[2]) AND i.type = '%s'";
$arguments1 = array_merge($arguments1, $query[3], array($type));
$join = "INNER JOIN {search_total} t ON i.word = t.word $join1";
if (!$query[5]) {
$conditions .= " AND ($query[0])";
$arguments1 = array_merge($arguments1, $query[1]);
$join .= " INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type";
}
Dries Buytaert
committed
// Calculate maximum keyword relevance, to normalize it.
$select = "SELECT SUM(i.score * t.count) AS score FROM {search_index} i $join WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d ORDER BY score DESC";
Dries Buytaert
committed
$arguments = array_merge($arguments1, array($query[4]));
$normalize = db_result(db_query_range($select, $arguments, 0, 1));
if (!$normalize) {
return array();
}
Dries Buytaert
committed
$columns2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * SUM(i.score * t.count))', $columns2);
Dries Buytaert
committed
// Build query to retrieve results.
$select = "SELECT i.type, i.sid, $columns2 FROM {search_index} i $join $join2 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d";
$count_select = "SELECT COUNT(*) FROM ($select) n1";
$arguments = array_merge($arguments2, $arguments1, array($query[4]));
// Do actual search query
Dries Buytaert
committed
$result = pager_query("$select $sort_parameters", 10, 0, $count_select, $arguments);
$results = array();
while ($item = db_fetch_object($result)) {
}
return $results;
/**
 * Helper function for grabbing search keys.
 *
 * The keys are determined once per request and then cached.
 *
 * @return
 *   Everything after the second slash of $_GET['q'] when the path has at
 *   least three parts; otherwise $_REQUEST['keys'], or '' when that is empty.
 */
function search_get_keys() {
  static $return;
  if (!isset($return)) {
    // Extract keys as remainder of path
    // Note: support old GET format of searches for existing links.
    // A missing 'q' is treated as an empty path instead of raising a notice.
    $path = explode('/', isset($_GET['q']) ? $_GET['q'] : '', 3);
    $keys = empty($_REQUEST['keys']) ? '' : $_REQUEST['keys'];
    $return = count($path) == 3 ? $path[2] : $keys;
  }
  return $return;
}
/**
* @defgroup search Search interface
* @{
* The Drupal search interface manages a global search mechanism.
*
* Modules may plug into this system to provide searches of different types of
* data. Most of the system is handled by search.module, so this must be enabled
* for all of the search features to work.
*
* There are three ways to interact with the search system:
Gábor Hojtsy
committed
* - Specifically for searching nodes, you can implement
* hook_nodeapi('update index') and hook_nodeapi('search result'). However,
* note that the search system already indexes all visible output of a node,
* i.e. everything displayed normally by hook_view() and hook_nodeapi('view').
* This is usually sufficient. You should only use this mechanism if you want
* additional, non-visible data to be indexed.
* - Implement hook_search(). This will create a search tab for your module on
Gábor Hojtsy
committed
* the /search page with a simple keyword search form.
* - Implement hook_update_index(). This allows your module to use Drupal's
* HTML indexing mechanism for searching full text efficiently.
*