Commit 801a0987 authored by webchick

#916086 by mcarbone, jhodgdon: Fixed search_excerpt() doesn't highlight words...

#916086 by mcarbone, jhodgdon: Fixed search_excerpt() doesn't highlight words that are matched via search_simplify()
parent 039f3af6
......@@ -1142,12 +1142,13 @@ function search_excerpt($keys, $text) {
array_walk($keys, '_search_excerpt_replace');
$workkeys = $keys;
// Extract a fragment per keyword for at most 4 keywords.
// Extract fragments around keywords.
// First we collect ranges of text around each keyword, starting/ending
// at spaces.
// at spaces, trying to get to 256 characters.
// If the sum of all fragments is too short, we look for second occurrences.
$ranges = array();
$included = array();
$foundkeys = array();
$length = 0;
while ($length < 256 && count($workkeys)) {
foreach ($workkeys as $k => $key) {
......@@ -1164,10 +1165,26 @@ function search_excerpt($keys, $text) {
if (!isset($included[$key])) {
$included[$key] = 0;
}
// Locate a keyword (position $p), then locate a space in front (position
// $q) and behind it (position $s)
// Locate a keyword (position $p, always >0 because $text starts with a
// space). First try bare keyword, but if that doesn't work, try to find a
// derived form from search_simplify().
$p = 0;
if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
$p = $match[0][1];
}
else {
$info = search_simplify_excerpt_match($key, $text, $included[$key], $boundary);
if ($info['where']) {
$p = $info['where'];
if ($info['keyword']) {
$foundkeys[] = $info['keyword'];
}
}
}
// Now locate a space in front (position $q) and behind it (position $s),
// leaving about 60 characters extra before and after for context.
// Note that a space was added to the front and end of $text above.
if ($p) {
if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) {
$end = substr($text . ' ', $p, 80);
if (($s = strrpos($end, ' ')) !== FALSE) {
......@@ -1233,6 +1250,10 @@ function search_excerpt($keys, $text) {
$text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
$text = check_plain($text);
// Slash-escape quotes in keys found in a derived form and merge with original keys.
array_walk($foundkeys, '_search_excerpt_replace');
$keys = array_merge($keys, $foundkeys);
// Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
$text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
return $text;
......@@ -1249,6 +1270,76 @@ function _search_excerpt_replace(&$text) {
$text = preg_quote($text, '/');
}
/**
 * Find words in the original text that matched via search_simplify().
 *
 * This is called in search_excerpt() if an exact match is not found in the
 * text, so that we can find the derived form that matches.
 *
 * @param $key
 *   The keyword to find.
 * @param $text
 *   The text to search for the keyword.
 * @param $offset
 *   Offset position in $text to start searching at.
 * @param $boundary
 *   Text to include in a regular expression that will match a word boundary.
 *
 * @return
 *   FALSE if no match is found, or if the keyword or the text is empty after
 *   simplification. If a match is found, return an associative array with
 *   element 'where' giving the position of the match, and element 'keyword'
 *   giving the actual word found in the text at that position.
 */
function search_simplify_excerpt_match($key, $text, $offset, $boundary) {
  $pos = NULL;
  $keyfound = NULL;
  $simplified_key = search_simplify($key);
  $simplified_text = search_simplify($text);

  // Return immediately if the simplified key or text is empty. An empty key
  // would reduce the pattern below to two adjacent boundaries and would hand
  // strpos() an empty needle (a warning before PHP 8, a match at position 0
  // from PHP 8 on), producing a bogus match.
  if ((string) $simplified_key === '' || (string) $simplified_text === '') {
    return FALSE;
  }

  // Check if we have a match after simplification in the text.
  // NOTE(review): $offset is a position in $text but is applied here to the
  // simplified text, whose length may differ -- confirm this is intended.
  if (!preg_match('/' . $boundary . $simplified_key . $boundary . '/iu', $simplified_text, $match, PREG_OFFSET_CAPTURE, $offset)) {
    return FALSE;
  }

  // If we get here, we have a match. Now find the exact location of the match
  // and the original text that matched. Start by splitting up the text by all
  // potential starting points of the matching text and iterating through them.
  $split = array_filter(preg_split('/' . $boundary . '/iu', $text, -1, PREG_SPLIT_OFFSET_CAPTURE), '_search_excerpt_match_filter');
  foreach ($split as $value) {
    // Skip starting points before the offset.
    if ($value[1] < $offset) {
      continue;
    }

    // Check a window of 80 characters after the starting point for a match,
    // based on the size of the excerpt window.
    $window = substr($text, $value[1], 80);
    $simplified_window = search_simplify($window);
    if (strpos($simplified_window, $simplified_key) === 0) {
      // We have a match in this window. Store the position of the match.
      $pos = $value[1];
      // Grow the candidate one byte at a time until its simplified form
      // equals the simplified key; that candidate is the original matching
      // text. If no prefix matches, the whole window is kept.
      $length = strlen($window);
      for ($i = 1; $i <= $length; $i++) {
        $keyfound = substr($text, $value[1], $i);
        if ($simplified_key == search_simplify($keyfound)) {
          break;
        }
      }
      break;
    }
  }

  return $pos ? array('where' => $pos, 'keyword' => $keyfound) : FALSE;
}
/**
 * Helper function for array_filter() in search_simplify_excerpt_match().
 *
 * @param $var
 *   An array whose element 0 is the text piece to examine.
 *
 * @return
 *   The byte length of the piece after trimming surrounding whitespace. A
 *   result of 0 makes array_filter() discard the element.
 */
function _search_excerpt_match_filter($var) {
  $piece = trim($var[0]);
  return strlen($piece);
}
/**
* Implements hook_forms().
*/
......
......@@ -1582,6 +1582,40 @@ class SearchExcerptTestCase extends DrupalUnitTestCase {
$this->assertFalse(strpos($result, '&'), 'Entities are not present in excerpt');
$this->assertTrue(strpos($result, 'í') > 0, 'Entities are converted in excerpt');
}
/**
 * Tests search_excerpt() with search keywords matching simplified words.
 *
 * Each case passes keywords to search_excerpt() and checks that the excerpt
 * contains the original, unsimplified word(s) from the text wrapped in
 * <strong> tags, including keywords that only match the text after both have
 * gone through search_simplify().
 */
function testSearchExcerptSimplified() {
  $lorem1 = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam vitae arcu at leo cursus laoreet. Curabitur dui tortor, adipiscing malesuada tempor in, bibendum ac diam. Cras non tellus a libero pellentesque condimentum. What is a Drupalism? Suspendisse ac lacus libero. Ut non est vel nisl faucibus interdum nec sed leo. Pellentesque sem risus, vulputate eu semper eget, auctor in libero.';
  $lorem2 = 'Ut fermentum est vitae metus convallis scelerisque. Phasellus pellentesque rhoncus tellus, eu dignissim purus posuere id. Quisque eu fringilla ligula. Morbi ullamcorper, lorem et mattis egestas, tortor neque pretium velit, eget eleifend odio turpis eu purus. Donec vitae metus quis leo pretium tincidunt a pulvinar sem. Morbi adipiscing laoreet mauris vel placerat. Nullam elementum, nisl sit amet scelerisque malesuada, dolor nunc hendrerit quam, eu ultrices erat est in orci.';

  // Make some text with some keywords that will get simplified.
  $text = $lorem1 . ' Number: 123456.7890 Hyphenated: one-two abc,def ' . $lorem2;

  // Each case is: search keys, fragment expected in the excerpt, message.
  $cases = array(
    array('123456.7890', 'Number: <strong>123456.7890</strong>', 'Numeric keyword is highlighted with exact match'),
    array('1234567890', 'Number: <strong>123456.7890</strong>', 'Numeric keyword is highlighted with simplified match'),
    array('Number 1234567890', '<strong>Number</strong>: <strong>123456.7890</strong>', 'Punctuated and numeric keyword is highlighted with simplified match'),
    array('"Number 1234567890"', '<strong>Number: 123456.7890</strong>', 'Phrase with punctuated and numeric keyword is highlighted with simplified match'),
    array('"Hyphenated onetwo"', '<strong>Hyphenated: one-two</strong>', 'Phrase with punctuated and hyphenated keyword is highlighted with simplified match'),
    array('"abc def"', '<strong>abc,def</strong>', 'Phrase with keyword simplified into two separate words is highlighted with simplified match'),
  );

  foreach ($cases as $case) {
    list($keys, $expected, $message) = $case;
    // Note: The search_excerpt() function adds some extra spaces -- not
    // important for HTML formatting. Remove these for comparison.
    $excerpt = search_excerpt($keys, $text);
    $result = preg_replace('| +|', ' ', $excerpt);
    $this->assertTrue(strpos($result, $expected) !== FALSE, $message);
  }
}
}
/**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment