Commit f4a9d47a authored by alexpott
Browse files

Issue #2459325 by jhodgdon: Document that language is not set on search keyword preprocessing

parent 95305c46
......@@ -14,7 +14,8 @@
* Preprocess text for search.
*
* This hook is called to preprocess both the text added to the search index
* and the keywords users have submitted for searching.
* and the keywords users have submitted for searching. The same processing
* needs to be applied to both so that searches will find matches.
*
* Possible uses:
* - Adding spaces between words of Chinese or Japanese text.
......@@ -22,14 +23,23 @@
* instance, walk, walked, walking, and walks in searching.
* - Expanding abbreviations and acronyms that occur in text.
*
* @param $text
* @param string $text
* The text to preprocess. This is a single piece of plain text extracted
* from between two HTML tags or from the search query. It will not contain
* any HTML entities or HTML tags.
* @param $langcode
* The language code of the entity that has been found.
* @param string|null $langcode
* The language code for the language the text is in, if known. When this hook
* is invoked during search indexing, the language will most likely be known
* and passed in. This is left up to the search plugin;
* \Drupal\node\Plugin\Search\NodeSearch does pass in the node
* language. However, when this hook is invoked during searching, in order to
* let a module apply the same preprocessing to the search keywords and
* indexed text so they will match, $langcode will be NULL. A hook
* implementation can call the getCurrentLanguage() method on the
* 'language_manager' service to determine the current language and act
* accordingly.
*
* @return
* @return string
* The text after preprocessing. Note that if your module decides not to
* alter the text, it should return the original text. Also, after
* preprocessing, words in the text should be separated by a space.
......@@ -37,9 +47,14 @@
* @ingroup search
*/
function hook_search_preprocess($text, $langcode = NULL) {
// If the language is not set, get it from the language manager.
if (!isset($langcode)) {
$langcode = \Drupal::languageManager()->getCurrentLanguage()->getId();
}
// If the langcode is set to 'en' then add variations of the word "testing"
// which can also be found during English language searches.
if (isset($langcode) && $langcode == 'en') {
if ($langcode == 'en') {
// Add the alternate verb forms for the word "testing".
if ($text == 'we are testing') {
$text .= ' test tested';
......
......@@ -224,13 +224,25 @@ function search_update_totals() {
}
/**
* Simplifies a string according to indexing rules.
* Simplifies and preprocesses text for searching.
*
* @param $text
* Processing steps:
* - Entities are decoded.
* - Text is lower-cased and diacritics (accents) are removed.
* - hook_search_preprocess() is invoked.
* - CJK (Chinese, Japanese, Korean) characters are processed, depending on
* the search settings.
* - Punctuation is processed (removed or replaced with spaces, depending on
* where it is; see code for details).
* - Words are truncated to 50 characters maximum.
*
* @param string $text
* Text to simplify.
* @param string|null $langcode
* Language code for the language of $text, if known.
*
* @return
* Simplified text.
* @return string
* Simplified and processed text.
*
* @see hook_search_preprocess()
*/
......@@ -292,13 +304,13 @@ function search_simplify($text, $langcode = NULL) {
* is equal to the 'minimum_word_size' variable. This tokenizing is only done
* if the 'overlap_cjk' variable is TRUE.
*
* @param $matches
* @param array $matches
* This function is a callback for preg_replace_callback(), which is called
* from search_simplify(). So, $matches is an array of regular expression
* matches, which means that $matches[0] contains the matched text -- a
* string of CJK characters to tokenize.
*
* @return
* @return string
* Tokenized text, starting and ending with a space character.
*/
function search_expand_cjk($matches) {
......@@ -328,7 +340,17 @@ function search_expand_cjk($matches) {
}
/**
* Simplifies and splits a string into tokens for indexing.
* Simplifies and splits a string into words for indexing.
*
* @param string $text
* Text to process.
* @param string|null $langcode
* Language code for the language of $text, if known.
*
* @return array
* Array of words in the simplified, preprocessed text.
*
* @see search_simplify()
*/
function search_index_split($text, $langcode = NULL) {
$last = &drupal_static(__FUNCTION__);
......@@ -359,7 +381,12 @@ function _search_index_truncate(&$text) {
}
/**
* Invokes hook_search_preprocess() in modules.
* Invokes hook_search_preprocess() to simplify text.
*
* @param string $text
* Text to preprocess, passed by reference and altered in place.
* @param string|null $langcode
* Language code for the language of $text, if known.
*/
function search_invoke_preprocess(&$text, $langcode = NULL) {
foreach (\Drupal::moduleHandler()->getImplementations('search_preprocess') as $module) {
......@@ -370,14 +397,14 @@ function search_invoke_preprocess(&$text, $langcode = NULL) {
/**
* Updates the full-text search index for a particular item.
*
* @param $type
* @param string $type
* The plugin ID or other machine-readable type of this item,
* which should be less than 64 bytes.
* @param $sid
* @param int $sid
* An ID number identifying this particular item (e.g., node ID).
* @param $langcode
* Language code for text being indexed.
* @param $text
* @param string $langcode
* Language code for the language of the text being indexed.
* @param string $text
* The content of this item. Must be a piece of HTML or plain text.
*
* @ingroup search
......@@ -585,6 +612,8 @@ function search_mark_for_reindex($type = NULL, $sid = NULL, $langcode = NULL) {
* A string containing a search query.
* @param string $text
* The text to extract fragments from.
* @param string|null $langcode
* Language code for the language of $text, if known.
*
* @return string
* A string containing HTML for the excerpt.
......@@ -736,14 +765,14 @@ function search_excerpt($keys, $text, $langcode = NULL) {
/**
* Finds an appropriate keyword in text.
*
* @param $key
* @param string $key
* The keyword to find.
* @param $text
* @param string $text
* The text to search for the keyword.
* @param $boundary
* @param string $boundary
* Regular expression for boundary characters between words.
* @param $langcode
* Language code.
* @param string|null $langcode
* Language code for the language of $text, if known.
*
* @return
* A segment of $text that is between word boundary characters that either
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment