summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNathaniel Catchpole2013-12-20 12:15:04 (GMT)
committerNathaniel Catchpole2013-12-20 12:15:04 (GMT)
commitf82a06840fa05278fffadc567fc9bc65c9c86bc9 (patch)
treebb36b64610d16590554247269a6f866fdb28bc93
parente16798a856b8c2b88a078ff5dd0a10066bca9ab9 (diff)
Issue #916086 by jhodgdon, mcarbone, pwolanin, Albert Volkman: Search_excerpt() doesn't highlight words that are matched via search_simplify().
-rw-r--r--core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php55
-rw-r--r--core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php8
-rw-r--r--core/modules/search/search.module295
-rw-r--r--core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module13
4 files changed, 215 insertions, 156 deletions
diff --git a/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php b/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php
index 8deeef8..ff7eaeb 100644
--- a/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php
+++ b/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php
@@ -19,7 +19,7 @@ class SearchExcerptTest extends WebTestBase {
*
* @var array
*/
- public static $modules = array('search');
+ public static $modules = array('search', 'search_langcode_test');
public static function getInfo() {
return array(
@@ -48,10 +48,19 @@ class SearchExcerptTest extends WebTestBase {
$this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Entire string is returned when keyword is not found in short string');
$result = preg_replace('| +|', ' ', search_excerpt('fox', $text));
- $this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog ...', 'Found keyword is highlighted');
+ $this->assertEqual($result, 'The quick brown <strong>fox</strong> &amp; jumps over the lazy dog', 'Found keyword is highlighted');
+
+ $expected = '<strong>The</strong> quick brown fox &amp; jumps over <strong>the</strong> lazy dog';
+ $result = preg_replace('| +|', ' ', search_excerpt('The', $text));
+ $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at beginning of short string');
+
+ $expected = 'The quick brown fox &amp; jumps over the lazy <strong>dog</strong>';
+ $result = preg_replace('| +|', ' ', search_excerpt('dog', $text));
+ $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at end of short string');
$longtext = str_repeat($text . ' ', 10);
$result = preg_replace('| +|', ' ', search_excerpt('nothing', $longtext));
+ $expected = 'The quick brown fox &amp; jumps over the lazy dog';
$this->assertTrue(strpos($result, $expected) === 0, 'When keyword is not found in long string, return value starts as expected');
$entities = str_repeat('k&eacute;sz&iacute;t&eacute;se ', 20);
@@ -101,10 +110,10 @@ class SearchExcerptTest extends WebTestBase {
// Test phrases with characters which are being truncated.
$result = preg_replace('| +|', ' ', search_excerpt('"ipsum _"', $text));
- $this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');
+ $this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.');
$result = preg_replace('| +|', ' ', search_excerpt('"ipsum 0000"', $text));
- $this->assertTrue(strpos($result, '<strong>ipsum </strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');
+ $this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.');
// Test combination of the valid keyword and keyword containing only
// characters which are being truncated during simplification.
@@ -113,5 +122,43 @@ class SearchExcerptTest extends WebTestBase {
$result = preg_replace('| +|', ' ', search_excerpt('ipsum 0000', $text));
$this->assertTrue(strpos($result, '<strong>ipsum</strong>') !== FALSE, 'Only valid keyword is highlighted and invalid keyword "0000" is ignored.');
+
+ // Test using the hook_search_preprocess() from the test module.
+ // The hook replaces "finding" or "finds" with "find".
+ // So, if we search for "find" or "finds" or "finding", we should
+ // highlight "finding".
+ $text = "this tests finding a string";
+ $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for finds');
+ $result = preg_replace('| +|', ' ', search_excerpt('find', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, search for find');
+
+ // Just to be sure, test with the replacement at the beginning and end.
+ $text = "finding at the beginning";
+ $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at start');
+
+ $text = "at the end finding";
+ $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>finding</strong>') !== FALSE, 'Search excerpt works with preprocess hook, text at end');
+
+ // Testing with a one-to-many replacement: the test module replaces DIC
+ // with Dependency Injection Container.
+ $text = "something about the DIC is happening";
+ $result = preg_replace('| +|', ' ', search_excerpt('Dependency', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym first word');
+
+ $result = preg_replace('| +|', ' ', search_excerpt('Injection', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym second word');
+
+ $result = preg_replace('| +|', ' ', search_excerpt('Container', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>DIC</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym third word');
+
+ // Testing with a many-to-one replacement: the test module replaces
+ // hypertext markup language with HTML.
+ $text = "we always use hypertext markup language to describe things";
+ $result = preg_replace('| +|', ' ', search_excerpt('html', $text, 'ex'));
+ $this->assertTrue(strpos($result, '<strong>hypertext markup language</strong>') !== FALSE, 'Search excerpt works with preprocess hook, acronym many to one');
+
}
}
diff --git a/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php b/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php
index 6ae7e88..13f5a8d 100644
--- a/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php
+++ b/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php
@@ -53,11 +53,13 @@ class SearchPreprocessLangcodeTest extends SearchTestBase {
// function manually is needed to finish the indexing process.
search_update_totals();
- // Search for the title of the node with a POST query.
- $edit = array('or' => $node->label());
+ // Search for the additional text that is added by the preprocess
+ // function. If you search for text that is in the node, preprocess is
+ // not invoked on the node during the search excerpt generation.
+ $edit = array('or' => 'Additional text');
$this->drupalPostForm('search/node', $edit, t('Advanced search'));
- // Checks if the langcode has been passed by hook_search_preprocess().
+ // Checks if the langcode message has been set by hook_search_preprocess().
$this->assertText('Langcode Preprocess Test: en');
}
diff --git a/core/modules/search/search.module b/core/modules/search/search.module
index 4afd6dc..ca160bf 100644
--- a/core/modules/search/search.module
+++ b/core/modules/search/search.module
@@ -621,24 +621,23 @@ function search_mark_for_reindex($type, $sid) {
*/
/**
- * Returns snippets from a piece of text, with certain keywords highlighted.
+ * Returns snippets from a piece of text, with search keywords highlighted.
*
* Used for formatting search results.
*
- * @param $keys
+ * @param string $keys
* A string containing a search query.
- *
- * @param $text
+ * @param string $text
* The text to extract fragments from.
*
- * @return
+ * @return string
* A string containing HTML for the excerpt.
*/
function search_excerpt($keys, $text, $langcode = NULL) {
// We highlight around non-indexable or CJK characters.
$boundary = '(?:(?<=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']))';
- // Extract positive keywords and phrases
+ // Extract positive keywords and phrases.
preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
$keys = array_merge($matches[2], $matches[3]);
@@ -646,78 +645,77 @@ function search_excerpt($keys, $text, $langcode = NULL) {
$text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
$text = decode_entities($text);
- // Slash-escape quotes in the search keyword string.
- array_walk($keys, '_search_excerpt_replace');
- $workkeys = $keys;
+ // Make a list of unique keywords that are actually found in the text,
+ // which could be items in $keys or replacements that are equivalent through
+ // search_simplify().
+ $temp_keys = array();
+ foreach ($keys as $key) {
+ $key = _search_find_match_with_simplify($key, $text, $boundary, $langcode);
+ if (isset($key)) {
+ // Quote slashes so they can be used in regular expressions.
+ $temp_keys[] = preg_quote($key, '/');
+ }
+ }
+ // Several keywords could have simplified down to the same thing, so pick
+ // out the unique ones.
+ $keys = array_unique($temp_keys);
- // Extract fragments around keywords.
- // First we collect ranges of text around each keyword, starting/ending
- // at spaces, trying to get to 256 characters.
- // If the sum of all fragments is too short, we look for second occurrences.
+ // Extract fragments of about 60 characters around keywords, bounded by word
+ // boundary characters. Try to reach 256 characters, using second occurrences
+ // if necessary.
$ranges = array();
- $included = array();
- $foundkeys = array();
$length = 0;
- while ($length < 256 && count($workkeys)) {
- foreach ($workkeys as $k => $key) {
- if (strlen($key) == 0) {
- unset($workkeys[$k]);
- unset($keys[$k]);
- continue;
- }
+ $look_start = array();
+ $remaining_keys = $keys;
+
+ while ($length < 256 && !empty($remaining_keys)) {
+ $found_keys = array();
+ foreach ($remaining_keys as $key) {
if ($length >= 256) {
break;
}
- // Remember occurrence of key so we can skip over it if more occurrences
- // are desired.
- if (!isset($included[$key])) {
- $included[$key] = 0;
- }
- // Locate a keyword (position $p, always >0 because $text starts with a
- // space). First try bare keyword, but if that doesn't work, try to find a
- // derived form from search_simplify().
- $p = 0;
- if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
- $p = $match[0][1];
- }
- else {
- $info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary, $langcode);
- if ($info['where']) {
- $p = $info['where'];
- if ($info['keyword']) {
- $foundkeys[] = $info['keyword'];
- }
- }
+
+ // Remember where we last found $key, in case we are coming through a
+ // second time.
+ if (!isset($look_start[$key])) {
+ $look_start[$key] = 0;
}
- // Now locate a space in front (position $q) and behind it (position $s),
- // leaving about 60 characters extra before and after for context.
- // Note that a space was added to the front and end of $text above.
- if ($p) {
- if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) {
- $end = substr($text . ' ', $p, 80);
- if (($s = strrpos($end, ' ')) !== FALSE) {
- // Account for the added spaces.
- $q = max($q - 1, 0);
- $s = min($s, strlen($end) - 1);
- $ranges[$q] = $p + $s;
- $length += $p + $s - $q;
- $included[$key] = $p + 1;
- }
- else {
- unset($workkeys[$k]);
+
+ // See if we can find $key after where we found it the last time. Since
+ // we are requiring a match on a word boundary, make sure $text starts
+ // and ends with a space.
+ $matches = array();
+ if (preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
+ $found_position = $matches[0][1];
+ $look_start[$key] = $found_position + 1;
+ // Keep track of which keys we found this time, in case we need to
+ // pass through again to find more text.
+ $found_keys[] = $key;
+
+ // Locate a space before and after this match, leaving about 60
+ // characters of context on each end.
+ $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
+ if ($before !== FALSE && $before <= $found_position) {
+ $after = strrpos(' ' . $text . ' ', ' ', min($found_position + 61, strlen($text) + 1));
+ if ($after !== FALSE && $after > $found_position) {
+ // Account for the spaces we added.
+ $before = max($before - 1, 0);
+ $after = min($after - 1, strlen($text));
+ if ($before < $after) {
+ // Save this range.
+ $ranges[$before] = $after;
+ $length += $after - $before;
+ }
}
}
- else {
- unset($workkeys[$k]);
- }
- }
- else {
- unset($workkeys[$k]);
}
}
+ // Next time through this loop, only look for keys we found this time,
+ // if any.
+ $remaining_keys = $found_keys;
}
- if (count($ranges) == 0) {
+ if (empty($ranges)) {
// We didn't find any keyword matches, so just return the first part of the
// text. We also need to re-encode any HTML special characters that we
// entity-decoded above.
@@ -727,43 +725,46 @@ function search_excerpt($keys, $text, $langcode = NULL) {
// Sort the text ranges by starting position.
ksort($ranges);
- // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
- $newranges = array();
- foreach ($ranges as $from2 => $to2) {
- if (!isset($from1)) {
- $from1 = $from2;
- $to1 = $to2;
+ // Collapse overlapping text ranges into one. The sorting makes it O(n).
+ $new_ranges = array();
+ $max_end = 0;
+ foreach ($ranges as $this_from => $this_to) {
+ $max_end = max($max_end, $this_to);
+ if (!isset($working_from)) {
+ // This is the first time through this loop: initialize.
+ $working_from = $this_from;
+ $working_to = $this_to;
continue;
}
- if ($from2 <= $to1) {
- $to1 = max($to1, $to2);
+ if ($this_from <= $working_to) {
+ // The ranges overlap: combine them.
+ $working_to = max($working_to, $this_to);
}
else {
- $newranges[$from1] = $to1;
- $from1 = $from2;
- $to1 = $to2;
+ // The ranges do not overlap: save the working range and start a new one.
+ $new_ranges[$working_from] = $working_to;
+ $working_from = $this_from;
+ $working_to = $this_to;
}
}
- $newranges[$from1] = $to1;
+ // Save the remaining working range.
+ $new_ranges[$working_from] = $working_to;
- // Fetch text
+ // Fetch text within the combined ranges we found.
$out = array();
- foreach ($newranges as $from => $to) {
+ foreach ($new_ranges as $from => $to) {
$out[] = substr($text, $from, $to - $from);
}
- // Let translators have the ... separator text as one chunk.
+ // Combine the text chunks with "..." separators. The "..." needs to be
+ // translated. Let translators have the ... separator text as one chunk.
$dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
-
- $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
+ $text = (isset($new_ranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . (($max_end < strlen($text) - 1) ? $dots[2] : '');
$text = check_plain($text);
- // Slash-escape quotes in keys found in a derived form and merge with original keys.
- array_walk($foundkeys, '_search_excerpt_replace');
- $keys = array_merge($keys, $foundkeys);
-
- // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
- $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
+ // Highlight keywords. Must be done at once to prevent conflicts ('strong'
+ // and '<strong>').
+ $text = trim(preg_replace('/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
return $text;
}
@@ -772,83 +773,81 @@ function search_excerpt($keys, $text, $langcode = NULL) {
*/
/**
- * Helper function for array_walk() in search_excerpt().
- */
-function _search_excerpt_replace(&$text) {
- $text = preg_quote($text, '/');
-}
-
-/**
- * Finds words in the original text that matched via search_simplify().
- *
- * This is called in search_excerpt() if an exact match is not found in the
- * text, so that we can find the derived form that matches.
+ * Finds an appropriate keyword in text.
*
* @param $key
* The keyword to find.
* @param $text
* The text to search for the keyword.
- * @param $offset
- * Offset position in $text to start searching at.
* @param $boundary
- * Text to include in a regular expression that will match a word boundary.
+ * Regular expression for boundary characters between words.
+ * @param $langcode
+ * Language code.
*
* @return
- * FALSE if no match is found. If a match is found, return an associative
- * array with element 'where' giving the position of the match, and element
- * 'keyword' giving the actual word found in the text at that position.
+ * A segment of $text that is between word boundary characters that either
+ * matches $key directly, or matches $key when both this text segment and
+ * $key are processed by search_simplify(). If a matching text segment is
+ * not located, NULL is returned.
*/
-function search_simplify_excerpt_match($key, $text, $offset, $boundary, $langcode = NULL) {
- $pos = NULL;
- $simplified_key = search_simplify($key, $langcode);
- $simplified_text = search_simplify($text, $langcode);
-
- // Return immediately if simplified key or text are empty.
- if (!$simplified_key || !$simplified_text) {
- return FALSE;
+function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
+ // See if $key appears as-is. When testing, make sure $text starts/ends with
+ // a space, because we require $key to be surrounded by word boundary
+ // characters.
+ $temp = trim($key);
+ if ($temp == '') {
+ return NULL;
+ }
+ if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) {
+ return $key;
}
- // Check if we have a match after simplification in the text.
- if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) {
- return FALSE;
+ // Run both text and key through search_simplify.
+ $simplified_key = trim(search_simplify($key, $langcode));
+ $simplified_text = trim(search_simplify($text, $langcode));
+ if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
+ // The simplfied keyword and text do not match at all, or are empty.
+ return NULL;
}
- // If we get here, we have a match. Now find the exact location of the match
- // and the original text that matched. Start by splitting up the text by all
- // potential starting points of the matching text and iterating through them.
- $split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter');
- foreach ($split as $value) {
- // Skip starting points before the offset.
- if ($value[1] < $offset) {
- continue;
+ // Split $text into words, keeping track of where the word boundaries are.
+ $words = preg_split('/' . $boundary . '/iu', $text, NULL, PREG_SPLIT_OFFSET_CAPTURE);
+ // Add an entry pointing to the end of the string, for the loop below.
+ $words[] = array('', strlen($text));
+ $num_words = count($words);
+
+ // Find the smallest segment of complete words in $text that we can simplify
+ // to match $simplified_key.
+ $start_position = 0;
+ $word_end = 0;
+ for ($word_index = 0; $word_index < $num_words; $word_index++) {
+ // See if we can move the starting position out from our previously-saved
+ // best position to here and still have a match.
+ $trial_position = $words[$word_index][1];
+ if ($trial_position < strlen($text)) {
+ $candidate = substr($text, $trial_position);
+ $test_text = trim(search_simplify($candidate, $langcode));
+ if (strpos($test_text, $simplified_key) !== FALSE) {
+ $start_position = $trial_position;
+ $word_end = $trial_position + strlen($words[$word_index][0]);
+ continue;
+ }
}
- // Check a window of 80 characters after the starting point for a match,
- // based on the size of the excerpt window.
- $window = substr($text, $value[1], 80);
- $simplified_window = search_simplify($window);
- if (strpos($simplified_window, $simplified_key) === 0) {
- // We have a match in this window. Store the position of the match.
- $pos = $value[1];
- // Iterate through the text in the window until we find the full original
- // matching text.
- $length = strlen($window);
- for ($i = 1; $i <= $length; $i++) {
- $keyfound = substr($text, $value[1], $i);
- if ($simplified_key == search_simplify($keyfound)) {
- break;
- }
+ // See if we can end at our currently-saved word-ending position and still
+ // match, in which case this is the minimal matching string.
+ if ($word_end > $start_position) {
+ $candidate = substr($text, $start_position, $word_end - $start_position);
+ $test_text = trim(search_simplify($candidate, $langcode));
+ if (strpos($test_text, $simplified_key) !== FALSE) {
+ return $candidate;
}
- break;
}
- }
- return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE;
-}
+ // Save the end position of this word for the next time through this loop.
+ $word_end = $trial_position + strlen($words[$word_index][0]);
+ }
-/**
- * Helper function for array_filter() in search_search_excerpt_match().
- */
-function _search_excerpt_match_filter($var) {
- return strlen(trim($var[0]));
+ // If we get here, we couldn't find a match.
+ return NULL;
}
diff --git a/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module b/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module
index 3b52753..cb2a27d 100644
--- a/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module
+++ b/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module
@@ -15,14 +15,25 @@ function search_langcode_test_search_preprocess($text, $langcode = NULL) {
if ($text == 'we are testing') {
$text .= ' test tested';
}
- // Prints the langcode for testPreprocessLangcode().
+ // Prints the langcode for testPreprocessLangcode() and adds some
+ // extra text.
else {
drupal_set_message('Langcode Preprocess Test: ' . $langcode);
+ $text .= 'Additional text';
}
}
// Prints the langcode for testPreprocessLangcode().
elseif (isset($langcode)) {
drupal_set_message('Langcode Preprocess Test: ' . $langcode);
+
+ // Preprocessing for the excerpt test.
+ if ($langcode == 'ex') {
+ $text = str_replace('finding', 'find', $text);
+ $text = str_replace('finds', 'find', $text);
+ $text = str_replace('dic', ' dependency injection container', $text);
+ $text = str_replace('hypertext markup language', 'html', $text);
+ }
}
+
return $text;
}