diff --git a/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php b/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php index 8deeef828c8121c87c94014a0f892e89232b3116..ff7eaeb8b0d6ef35fd92f8176eb7cd33f849d7c8 100644 --- a/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php +++ b/core/modules/search/lib/Drupal/search/Tests/SearchExcerptTest.php @@ -19,7 +19,7 @@ class SearchExcerptTest extends WebTestBase { * * @var array */ - public static $modules = array('search'); + public static $modules = array('search', 'search_langcode_test'); public static function getInfo() { return array( @@ -48,10 +48,19 @@ function testSearchExcerpt() { $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Entire string is returned when keyword is not found in short string'); $result = preg_replace('| +|', ' ', search_excerpt('fox', $text)); - $this->assertEqual($result, 'The quick brown fox & jumps over the lazy dog ...', 'Found keyword is highlighted'); + $this->assertEqual($result, 'The quick brown fox & jumps over the lazy dog', 'Found keyword is highlighted'); + + $expected = 'The quick brown fox & jumps over the lazy dog'; + $result = preg_replace('| +|', ' ', search_excerpt('The', $text)); + $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at beginning of short string'); + + $expected = 'The quick brown fox & jumps over the lazy dog'; + $result = preg_replace('| +|', ' ', search_excerpt('dog', $text)); + $this->assertEqual(preg_replace('| +|', ' ', $result), $expected, 'Keyword is highlighted at end of short string'); $longtext = str_repeat($text . ' ', 10); $result = preg_replace('| +|', ' ', search_excerpt('nothing', $longtext)); + $expected = 'The quick brown fox & jumps over the lazy dog'; $this->assertTrue(strpos($result, $expected) === 0, 'When keyword is not found in long string, return value starts as expected'); $entities = str_repeat('készítése ', 20); @@ -101,10 +110,10 @@ function testSearchExcerptSimplified() { // Test phrases with characters which are being truncated. $result = preg_replace('| +|', ' ', search_excerpt('"ipsum _"', $text)); - $this->assertTrue(strpos($result, 'ipsum ') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.'); + $this->assertTrue(strpos($result, 'ipsum') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part containing "_" is ignored.'); $result = preg_replace('| +|', ' ', search_excerpt('"ipsum 0000"', $text)); - $this->assertTrue(strpos($result, 'ipsum ') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.'); + $this->assertTrue(strpos($result, 'ipsum') !== FALSE, 'Only valid part of the phrase is highlighted and invalid part "0000" is ignored.'); // Test combination of the valid keyword and keyword containing only // characters which are being truncated during simplification. @@ -113,5 +122,43 @@ function testSearchExcerptSimplified() { $result = preg_replace('| +|', ' ', search_excerpt('ipsum 0000', $text)); $this->assertTrue(strpos($result, 'ipsum') !== FALSE, 'Only valid keyword is highlighted and invalid keyword "0000" is ignored.'); + + // Test using the hook_search_preprocess() from the test module. + // The hook replaces "finding" or "finds" with "find". + // So, if we search for "find" or "finds" or "finding", we should + // highlight "finding". + $text = "this tests finding a string"; + $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex')); + $this->assertTrue(strpos($result, 'finding') !== FALSE, 'Search excerpt works with preprocess hook, search for finds'); + $result = preg_replace('| +|', ' ', search_excerpt('find', $text, 'ex')); + $this->assertTrue(strpos($result, 'finding') !== FALSE, 'Search excerpt works with preprocess hook, search for find'); + + // Just to be sure, test with the replacement at the beginning and end. + $text = "finding at the beginning"; + $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex')); + $this->assertTrue(strpos($result, 'finding') !== FALSE, 'Search excerpt works with preprocess hook, text at start'); + + $text = "at the end finding"; + $result = preg_replace('| +|', ' ', search_excerpt('finds', $text, 'ex')); + $this->assertTrue(strpos($result, 'finding') !== FALSE, 'Search excerpt works with preprocess hook, text at end'); + + // Testing with a one-to-many replacement: the test module replaces DIC + // with Dependency Injection Container. + $text = "something about the DIC is happening"; + $result = preg_replace('| +|', ' ', search_excerpt('Dependency', $text, 'ex')); + $this->assertTrue(strpos($result, 'DIC') !== FALSE, 'Search excerpt works with preprocess hook, acronym first word'); + + $result = preg_replace('| +|', ' ', search_excerpt('Injection', $text, 'ex')); + $this->assertTrue(strpos($result, 'DIC') !== FALSE, 'Search excerpt works with preprocess hook, acronym second word'); + + $result = preg_replace('| +|', ' ', search_excerpt('Container', $text, 'ex')); + $this->assertTrue(strpos($result, 'DIC') !== FALSE, 'Search excerpt works with preprocess hook, acronym third word'); + + // Testing with a many-to-one replacement: the test module replaces + // hypertext markup language with HTML. + $text = "we always use hypertext markup language to describe things"; + $result = preg_replace('| +|', ' ', search_excerpt('html', $text, 'ex')); + $this->assertTrue(strpos($result, 'hypertext markup language') !== FALSE, 'Search excerpt works with preprocess hook, acronym many to one'); + } } diff --git a/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php b/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php index 6ae7e8845ee3d89f4748bbd4508583994bd62517..13f5a8d8d9aa02270375bf258bae4d45deebd132 100644 --- a/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php +++ b/core/modules/search/lib/Drupal/search/Tests/SearchPreprocessLangcodeTest.php @@ -53,11 +53,13 @@ function testPreprocessLangcode() { // function manually is needed to finish the indexing process. search_update_totals(); - // Search for the title of the node with a POST query. - $edit = array('or' => $node->label()); + // Search for the additional text that is added by the preprocess + // function. If you search for text that is in the node, preprocess is + // not invoked on the node during the search excerpt generation. + $edit = array('or' => 'Additional text'); $this->drupalPostForm('search/node', $edit, t('Advanced search')); - // Checks if the langcode has been passed by hook_search_preprocess(). + // Checks if the langcode message has been set by hook_search_preprocess(). $this->assertText('Langcode Preprocess Test: en'); } diff --git a/core/modules/search/search.module b/core/modules/search/search.module index 4afd6dccb611fdedf39bd4db2c4d53af2369231f..ca160bfcd2448021e4dd150addcd70b58be9bc1a 100644 --- a/core/modules/search/search.module +++ b/core/modules/search/search.module @@ -621,24 +621,23 @@ function search_mark_for_reindex($type, $sid) { */ /** - * Returns snippets from a piece of text, with certain keywords highlighted. + * Returns snippets from a piece of text, with search keywords highlighted. * * Used for formatting search results. * - * @param $keys + * @param string $keys * A string containing a search query. - * - * @param $text + * @param string $text * The text to extract fragments from. * - * @return + * @return string * A string containing HTML for the excerpt. */ function search_excerpt($keys, $text, $langcode = NULL) { // We highlight around non-indexable or CJK characters. $boundary = '(?:(?<=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . '])|(?=[' . Unicode::PREG_CLASS_WORD_BOUNDARY . PREG_CLASS_CJK . ']))'; - // Extract positive keywords and phrases + // Extract positive keywords and phrases. preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches); $keys = array_merge($matches[2], $matches[3]); @@ -646,78 +645,77 @@ function search_excerpt($keys, $text, $langcode = NULL) { $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)); $text = decode_entities($text); - // Slash-escape quotes in the search keyword string. - array_walk($keys, '_search_excerpt_replace'); - $workkeys = $keys; + // Make a list of unique keywords that are actually found in the text, + // which could be items in $keys or replacements that are equivalent through + // search_simplify(). + $temp_keys = array(); + foreach ($keys as $key) { + $key = _search_find_match_with_simplify($key, $text, $boundary, $langcode); + if (isset($key)) { + // Quote slashes so they can be used in regular expressions. + $temp_keys[] = preg_quote($key, '/'); + } + } + // Several keywords could have simplified down to the same thing, so pick + // out the unique ones. + $keys = array_unique($temp_keys); - // Extract fragments around keywords. - // First we collect ranges of text around each keyword, starting/ending - // at spaces, trying to get to 256 characters. - // If the sum of all fragments is too short, we look for second occurrences. + // Extract fragments of about 60 characters around keywords, bounded by word + // boundary characters. Try to reach 256 characters, using second occurrences + // if necessary. $ranges = array(); - $included = array(); - $foundkeys = array(); $length = 0; - while ($length < 256 && count($workkeys)) { - foreach ($workkeys as $k => $key) { - if (strlen($key) == 0) { - unset($workkeys[$k]); - unset($keys[$k]); - continue; - } + $look_start = array(); + $remaining_keys = $keys; + + while ($length < 256 && !empty($remaining_keys)) { + $found_keys = array(); + foreach ($remaining_keys as $key) { if ($length >= 256) { break; } - // Remember occurrence of key so we can skip over it if more occurrences - // are desired. - if (!isset($included[$key])) { - $included[$key] = 0; - } - // Locate a keyword (position $p, always >0 because $text starts with a - // space). First try bare keyword, but if that doesn't work, try to find a - // derived form from search_simplify(). - $p = 0; - if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) { - $p = $match[0][1]; - } - else { - $info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary, $langcode); - if ($info['where']) { - $p = $info['where']; - if ($info['keyword']) { - $foundkeys[] = $info['keyword']; - } - } + + // Remember where we last found $key, in case we are coming through a + // second time. + if (!isset($look_start[$key])) { + $look_start[$key] = 0; } - // Now locate a space in front (position $q) and behind it (position $s), - // leaving about 60 characters extra before and after for context. - // Note that a space was added to the front and end of $text above. - if ($p) { - if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) { - $end = substr($text . ' ', $p, 80); - if (($s = strrpos($end, ' ')) !== FALSE) { - // Account for the added spaces. - $q = max($q - 1, 0); - $s = min($s, strlen($end) - 1); - $ranges[$q] = $p + $s; - $length += $p + $s - $q; - $included[$key] = $p + 1; - } - else { - unset($workkeys[$k]); + + // See if we can find $key after where we found it the last time. Since + // we are requiring a match on a word boundary, make sure $text starts + // and ends with a space. + $matches = array(); + if (preg_match('/' . $boundary . $key . $boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) { + $found_position = $matches[0][1]; + $look_start[$key] = $found_position + 1; + // Keep track of which keys we found this time, in case we need to + // pass through again to find more text. + $found_keys[] = $key; + + // Locate a space before and after this match, leaving about 60 + // characters of context on each end. + $before = strpos(' ' . $text, ' ', max(0, $found_position - 61)); + if ($before !== FALSE && $before <= $found_position) { + $after = strrpos(' ' . $text . ' ', ' ', min($found_position + 61, strlen($text) + 1)); + if ($after !== FALSE && $after > $found_position) { + // Account for the spaces we added. + $before = max($before - 1, 0); + $after = min($after - 1, strlen($text)); + if ($before < $after) { + // Save this range. + $ranges[$before] = $after; + $length += $after - $before; + } } } - else { - unset($workkeys[$k]); - } - } - else { - unset($workkeys[$k]); } } + // Next time through this loop, only look for keys we found this time, + // if any. + $remaining_keys = $found_keys; } - if (count($ranges) == 0) { + if (empty($ranges)) { // We didn't find any keyword matches, so just return the first part of the // text. We also need to re-encode any HTML special characters that we // entity-decoded above. @@ -727,43 +725,46 @@ function search_excerpt($keys, $text, $langcode = NULL) { // Sort the text ranges by starting position. ksort($ranges); - // Now we collapse overlapping text ranges into one. The sorting makes it O(n). - $newranges = array(); - foreach ($ranges as $from2 => $to2) { - if (!isset($from1)) { - $from1 = $from2; - $to1 = $to2; + // Collapse overlapping text ranges into one. The sorting makes it O(n). + $new_ranges = array(); + $max_end = 0; + foreach ($ranges as $this_from => $this_to) { + $max_end = max($max_end, $this_to); + if (!isset($working_from)) { + // This is the first time through this loop: initialize. + $working_from = $this_from; + $working_to = $this_to; continue; } - if ($from2 <= $to1) { - $to1 = max($to1, $to2); + if ($this_from <= $working_to) { + // The ranges overlap: combine them. + $working_to = max($working_to, $this_to); } else { - $newranges[$from1] = $to1; - $from1 = $from2; - $to1 = $to2; + // The ranges do not overlap: save the working range and start a new one. + $new_ranges[$working_from] = $working_to; + $working_from = $this_from; + $working_to = $this_to; } } - $newranges[$from1] = $to1; + // Save the remaining working range. + $new_ranges[$working_from] = $working_to; - // Fetch text + // Fetch text within the combined ranges we found. $out = array(); - foreach ($newranges as $from => $to) { + foreach ($new_ranges as $from => $to) { $out[] = substr($text, $from, $to - $from); } - // Let translators have the ... separator text as one chunk. + // Combine the text chunks with "..." separators. The "..." needs to be + // translated. Let translators have the ... separator text as one chunk. $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...')); - - $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2]; + $text = (isset($new_ranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . (($max_end < strlen($text) - 1) ? $dots[2] : ''); $text = check_plain($text); - // Slash-escape quotes in keys found in a derived form and merge with original keys. - array_walk($foundkeys, '_search_excerpt_replace'); - $keys = array_merge($keys, $foundkeys); - - // Highlight keywords. Must be done at once to prevent conflicts ('strong' and ''). - $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '\0', $text); + // Highlight keywords. Must be done at once to prevent conflicts ('strong' + // and ''). + $text = trim(preg_replace('/' . $boundary . '(?:' . implode('|', $keys) . ')' . $boundary . '/iu', '\0', ' ' . $text . ' ')); return $text; } @@ -772,83 +773,81 @@ function search_excerpt($keys, $text, $langcode = NULL) { */ /** - * Helper function for array_walk() in search_excerpt(). - */ -function _search_excerpt_replace(&$text) { - $text = preg_quote($text, '/'); -} - -/** - * Finds words in the original text that matched via search_simplify(). - * - * This is called in search_excerpt() if an exact match is not found in the - * text, so that we can find the derived form that matches. + * Finds an appropriate keyword in text. * * @param $key * The keyword to find. * @param $text * The text to search for the keyword. - * @param $offset - * Offset position in $text to start searching at. * @param $boundary - * Text to include in a regular expression that will match a word boundary. + * Regular expression for boundary characters between words. + * @param $langcode + * Language code. * * @return - * FALSE if no match is found. If a match is found, return an associative - * array with element 'where' giving the position of the match, and element - * 'keyword' giving the actual word found in the text at that position. + * A segment of $text that is between word boundary characters that either + * matches $key directly, or matches $key when both this text segment and + * $key are processed by search_simplify(). If a matching text segment is + * not located, NULL is returned. */ -function search_simplify_excerpt_match($key, $text, $offset, $boundary, $langcode = NULL) { - $pos = NULL; - $simplified_key = search_simplify($key, $langcode); - $simplified_text = search_simplify($text, $langcode); - - // Return immediately if simplified key or text are empty. - if (!$simplified_key || !$simplified_text) { - return FALSE; +function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) { + // See if $key appears as-is. When testing, make sure $text starts/ends with + // a space, because we require $key to be surrounded by word boundary + // characters. + $temp = trim($key); + if ($temp == '') { + return NULL; + } + if (preg_match('/' . $boundary . preg_quote($temp, '/') . $boundary . '/iu', ' ' . $text . ' ')) { + return $key; } - // Check if we have a match after simplification in the text. - if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) { - return FALSE; + // Run both text and key through search_simplify. + $simplified_key = trim(search_simplify($key, $langcode)); + $simplified_text = trim(search_simplify($text, $langcode)); + if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) { + // The simplfied keyword and text do not match at all, or are empty. + return NULL; } - // If we get here, we have a match. Now find the exact location of the match - // and the original text that matched. Start by splitting up the text by all - // potential starting points of the matching text and iterating through them. - $split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter'); - foreach ($split as $value) { - // Skip starting points before the offset. - if ($value[1] < $offset) { - continue; + // Split $text into words, keeping track of where the word boundaries are. + $words = preg_split('/' . $boundary . '/iu', $text, NULL, PREG_SPLIT_OFFSET_CAPTURE); + // Add an entry pointing to the end of the string, for the loop below. + $words[] = array('', strlen($text)); + $num_words = count($words); + + // Find the smallest segment of complete words in $text that we can simplify + // to match $simplified_key. + $start_position = 0; + $word_end = 0; + for ($word_index = 0; $word_index < $num_words; $word_index++) { + // See if we can move the starting position out from our previously-saved + // best position to here and still have a match. + $trial_position = $words[$word_index][1]; + if ($trial_position < strlen($text)) { + $candidate = substr($text, $trial_position); + $test_text = trim(search_simplify($candidate, $langcode)); + if (strpos($test_text, $simplified_key) !== FALSE) { + $start_position = $trial_position; + $word_end = $trial_position + strlen($words[$word_index][0]); + continue; + } } - // Check a window of 80 characters after the starting point for a match, - // based on the size of the excerpt window. - $window = substr($text, $value[1], 80); - $simplified_window = search_simplify($window); - if (strpos($simplified_window, $simplified_key) === 0) { - // We have a match in this window. Store the position of the match. - $pos = $value[1]; - // Iterate through the text in the window until we find the full original - // matching text. - $length = strlen($window); - for ($i = 1; $i <= $length; $i++) { - $keyfound = substr($text, $value[1], $i); - if ($simplified_key == search_simplify($keyfound)) { - break; - } + // See if we can end at our currently-saved word-ending position and still + // match, in which case this is the minimal matching string. + if ($word_end > $start_position) { + $candidate = substr($text, $start_position, $word_end - $start_position); + $test_text = trim(search_simplify($candidate, $langcode)); + if (strpos($test_text, $simplified_key) !== FALSE) { + return $candidate; } - break; } - } - return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE; -} + // Save the end position of this word for the next time through this loop. + $word_end = $trial_position + strlen($words[$word_index][0]); + } -/** - * Helper function for array_filter() in search_search_excerpt_match(). - */ -function _search_excerpt_match_filter($var) { - return strlen(trim($var[0])); + // If we get here, we couldn't find a match. + return NULL; } diff --git a/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module b/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module index 3b527535e093aaf8ad48a1339fb8dac2288e3812..cb2a27d34bbb9735995ea59c1589880d223400b8 100644 --- a/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module +++ b/core/modules/search/tests/modules/search_langcode_test/search_langcode_test.module @@ -15,14 +15,25 @@ function search_langcode_test_search_preprocess($text, $langcode = NULL) { if ($text == 'we are testing') { $text .= ' test tested'; } - // Prints the langcode for testPreprocessLangcode(). + // Prints the langcode for testPreprocessLangcode() and adds some + // extra text. else { drupal_set_message('Langcode Preprocess Test: ' . $langcode); + $text .= 'Additional text'; } } // Prints the langcode for testPreprocessLangcode(). elseif (isset($langcode)) { drupal_set_message('Langcode Preprocess Test: ' . $langcode); + + // Preprocessing for the excerpt test. + if ($langcode == 'ex') { + $text = str_replace('finding', 'find', $text); + $text = str_replace('finds', 'find', $text); + $text = str_replace('dic', ' dependency injection container', $text); + $text = str_replace('hypertext markup language', 'html', $text); + } } + return $text; }