Unverified Commit 640ea36a authored by alexpott's avatar alexpott

Issue #3075703 by kim.pepper, andypost, ravi.shankar, daffie, pavnish, jungle,...

Issue #3075703 by kim.pepper, andypost, ravi.shankar, daffie, pavnish, jungle, alexpott, jhodgdon: Move search text processing to a service
parent e0d57869
......@@ -10,40 +10,28 @@
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Routing\RouteMatchInterface;
use Drupal\Core\Url;
use Drupal\search\SearchTextProcessorInterface;
/**
* Matches all 'N' Unicode character classes (numbers).
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use
* \Drupal\search\SearchTextProcessorInterface::PREG_CLASS_NUMBERS instead.
*
* @see https://www.drupal.org/node/3078162
*/
define('PREG_CLASS_NUMBERS',
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}' .
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}' .
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}' .
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-' .
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}' .
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}' .
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}' .
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-' .
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
define('PREG_CLASS_NUMBERS', SearchTextProcessorInterface::PREG_CLASS_NUMBERS);
/**
* Matches all 'P' Unicode character classes (punctuation).
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use
* \Drupal\search\SearchTextProcessorInterface::PREG_CLASS_PUNCTUATION
* instead.
*
* @see https://www.drupal.org/node/3078162
*/
define('PREG_CLASS_PUNCTUATION',
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}' .
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}' .
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}' .
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}' .
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}' .
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}' .
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}' .
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}' .
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-' .
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}' .
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}' .
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}' .
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}' .
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-' .
'\x{ff65}');
define('PREG_CLASS_PUNCTUATION', SearchTextProcessorInterface::PREG_CLASS_PUNCTUATION);
/**
* Matches CJK (Chinese, Japanese, Korean) letter-like characters.
......@@ -58,13 +46,12 @@
* considered symbols. (See
* http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
*
* @see search_expand_cjk()
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use
* \Drupal\search\SearchTextProcessorInterface::PREG_CLASS_CJK instead.
*
* @see https://www.drupal.org/node/3078162
*/
define('PREG_CLASS_CJK', '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
'\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
'\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
'\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
'\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}');
define('PREG_CLASS_CJK', SearchTextProcessorInterface::PREG_CLASS_CJK);
/**
* Implements hook_help().
......@@ -156,53 +143,15 @@ function search_cron() {
* @return string
* Simplified and processed text.
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use
* \Drupal\search\SearchTextProcessorInterface::analyze() instead.
*
* @see https://www.drupal.org/node/3078162
* @see hook_search_preprocess()
*/
function search_simplify($text, $langcode = NULL) {
// Decode entities to UTF-8
$text = Html::decodeEntities($text);
// Lowercase
$text = mb_strtolower($text);
// Remove diacritics.
$text = \Drupal::service('transliteration')->removeDiacritics($text);
// Call an external processor for word handling.
search_invoke_preprocess($text, $langcode);
// Simple CJK handling
if (\Drupal::config('search.settings')->get('index.overlap_cjk')) {
$text = preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
}
// To improve searching for numerical data such as dates, IP addresses
// or version numbers, we consider a group of numerical characters
// separated only by punctuation characters to be one piece.
// This also means that searching for e.g. '20/03/1984' also returns
// results with '20-03-1984' in them.
// Readable regexp: ([number]+)[punctuation]+(?=[number])
$text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
// Multiple dot and dash groups are word boundaries and replaced with space.
// No need to use the unicode modifier here because 0-127 ASCII characters
// can't match higher UTF-8 characters as the leftmost bit of those are 1.
$text = preg_replace('/[.-]{2,}/', ' ', $text);
// The dot, underscore and dash are simply removed. This allows meaningful
// search behavior with acronyms and URLs. See unicode note directly above.
$text = preg_replace('/[._-]+/', '', $text);
// With the exception of the rules above, we consider all punctuation,
// marks, spacers, etc, to be a word boundary.
$text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);
// Truncate everything to 50 characters.
$words = explode(' ', $text);
array_walk($words, '_search_index_truncate');
$text = implode(' ', $words);
return $text;
@trigger_error('search_simplify() is deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use \Drupal\search\SearchTextProcessorInterface::analyze() instead. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
return \Drupal::service('search.text_processor')->analyze($text, $langcode);
}
/**
......@@ -224,8 +173,14 @@ function search_simplify($text, $langcode = NULL) {
*
* @return string
* Tokenized text, starting and ending with a space character.
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use a
* custom implementation of SearchTextProcessorInterface instead.
*
* @see https://www.drupal.org/node/3078162
*/
function search_expand_cjk($matches) {
@trigger_error('search_expand_cjk() is deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use a custom implementation of SearchTextProcessorInterface instead. instead. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
$min = \Drupal::config('search.settings')->get('index.minimum_word_size');
$str = $matches[0];
$length = mb_strlen($str);
......@@ -262,46 +217,15 @@ function search_expand_cjk($matches) {
* @return array
* Array of words in the simplified, preprocessed text.
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use
* \Drupal\search\SearchTextProcessorInterface::process() instead.
*
* @see https://www.drupal.org/node/3078162
* @see search_simplify()
*/
function search_index_split($text, $langcode = NULL) {
$last = &drupal_static(__FUNCTION__);
$lastsplit = &drupal_static(__FUNCTION__ . ':lastsplit');
if ($last == $text) {
return $lastsplit;
}
// Process words
$text = search_simplify($text, $langcode);
$words = explode(' ', $text);
// Save last keyword result
$last = $text;
$lastsplit = $words;
return $words;
}
/**
* Helper function for array_walk in search_index_split.
*/
function _search_index_truncate(&$text) {
// Use a static array to avoid re-truncating text we've done before.
// The same words may often be passed in during excerpt generation.
static $truncated = [];
if (isset($truncated[$text])) {
$text = $truncated[$text];
return;
}
// If we didn't find it in the static array, perform the operation.
$original = $text;
if (is_numeric($text)) {
$text = ltrim($text, '0');
}
$text = Unicode::truncate($text, 50);
// Save it for the next time.
$truncated[$original] = $text;
@trigger_error('search_index_split() is deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use \Drupal\search\SearchTextProcessorInterface::process() instead. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
return \Drupal::service('search.text_processor')->process($text, $langcode);
}
/**
......@@ -311,8 +235,14 @@ function _search_index_truncate(&$text) {
* Text to preprocess, passed by reference and altered in place.
* @param string|null $langcode
* Language code for the language of $text, if known.
*
* @deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use a
* custom implementation of SearchTextProcessorInterface instead.
*
* @see https://www.drupal.org/node/3078162
*/
function search_invoke_preprocess(&$text, $langcode = NULL) {
@trigger_error('search_invoke_preprocess() is deprecated in drupal:9.1.0 and is removed from drupal:10.0.0. Use a custom implementation of SearchTextProcessorInterface instead. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
foreach (\Drupal::moduleHandler()->getImplementations('search_preprocess') as $module) {
$text = \Drupal::moduleHandler()->invoke($module, 'search_preprocess', [$text, $langcode]);
}
......@@ -385,7 +315,7 @@ function search_excerpt($keys, $text, $langcode = NULL) {
// Make a list of unique keywords that are actually found in the text,
// which could be items in $keys or replacements that are equivalent through
// search_simplify().
// \Drupal\search\SearchTextProcessorInterface::analyze().
$temp_keys = [];
foreach ($keys as $key) {
$key = _search_find_match_with_simplify($key, $text, $boundary_character, $langcode);
......@@ -535,8 +465,9 @@ function search_excerpt($keys, $text, $langcode = NULL) {
* @return string|null
* A segment of $text that is between word boundary characters that either
* matches $key directly, or matches $key when both this text segment and
* $key are processed by search_simplify(). If a matching text segment is
* not located, NULL is returned.
* $key are processed by
* \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text
* segment is not located, NULL is returned.
*/
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
$preceded_by_boundary = '(?<=' . $boundary . ')';
......@@ -564,9 +495,11 @@ function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NU
return mb_substr($text, $position, mb_strlen($new_key));
}
// Run both text and key through search_simplify.
$simplified_key = trim(search_simplify($key, $langcode));
$simplified_text = trim(search_simplify($text, $langcode));
// Run both text and key through text processor.
/** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
$text_processor = \Drupal::service('search.text_processor');
$simplified_key = trim($text_processor->analyze($key, $langcode));
$simplified_text = trim($text_processor->analyze($text, $langcode));
if ($simplified_key == '' || $simplified_text == '' || strpos($simplified_text, $simplified_key) === FALSE) {
// The simplified keyword and text do not match at all, or are empty.
return NULL;
......@@ -579,7 +512,7 @@ function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NU
// Using a binary search, find the earliest possible ending position in
// $text where it will still match the keyword after applying
// search_simplify().
// \Drupal\search\SearchTextProcessorInterface::analyze().
$start_index = 0;
$start_pos = $words[$start_index][1];
$min_end_index = 1;
......@@ -591,7 +524,7 @@ function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NU
$proposed_end_pos = $words[$proposed_end_index][1];
// Since the split was done with preg_split(), the positions are byte counts
// not character counts, so use substr() not mb_substr() here.
$trial_text = trim(search_simplify(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
$trial_text = trim($text_processor->analyze(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
if (strpos($trial_text, $simplified_key) !== FALSE) {
// The proposed endpoint is fine, text still matches.
$max_end_index = $proposed_end_index;
......@@ -605,7 +538,8 @@ function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NU
// Now do the same for the starting position: using a binary search, find the
// latest possible starting position in $text where it will still match the
// keyword after applying search_simplify().
// keyword after applying
// \Drupal\search\SearchTextProcessorInterface::analyze().
$end_index = $min_end_index;
$end_pos = $words[$end_index][1];
$min_start_index = 0;
......@@ -617,7 +551,7 @@ function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NU
$proposed_start_pos = $words[$proposed_start_index][1];
// Since the split was done with preg_split(), the positions are byte counts
// not character counts, so use substr() not mb_substr() here.
$trial_text = trim(search_simplify(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
$trial_text = trim($text_processor->analyze(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
if (strpos($trial_text, $simplified_key) !== FALSE) {
// The proposed start point is fine, text still matches.
$min_start_index = $proposed_start_index;
......
......@@ -9,4 +9,8 @@ services:
search.index:
class: Drupal\search\SearchIndex
arguments: ['@config.factory', '@database','@database.replica', '@cache_tags.invalidator']
arguments: ['@config.factory', '@database','@database.replica', '@cache_tags.invalidator', '@search.text_processor']
search.text_processor:
class: Drupal\search\SearchTextProcessor
arguments: ['@transliteration', '@config.factory', '@module_handler']
......@@ -41,6 +41,13 @@ class SearchIndex implements SearchIndexInterface {
*/
protected $cacheTagsInvalidator;
/**
* The text processor.
*
* @var \Drupal\search\SearchTextProcessorInterface
*/
protected $textProcessor;
/**
* SearchIndex constructor.
*
......@@ -52,12 +59,19 @@ class SearchIndex implements SearchIndexInterface {
* The database replica connection.
* @param \Drupal\Core\Cache\CacheTagsInvalidatorInterface $cache_tags_invalidator
* The cache tags invalidator.
* @param \Drupal\search\SearchTextProcessorInterface $text_processor
* The text processor.
*/
public function __construct(ConfigFactoryInterface $config_factory, Connection $connection, Connection $replica, CacheTagsInvalidatorInterface $cache_tags_invalidator) {
public function __construct(ConfigFactoryInterface $config_factory, Connection $connection, Connection $replica, CacheTagsInvalidatorInterface $cache_tags_invalidator, ?SearchTextProcessorInterface $text_processor = NULL) {
  $this->configFactory = $config_factory;
  $this->connection = $connection;
  $this->replica = $replica;
  $this->cacheTagsInvalidator = $cache_tags_invalidator;
  // The text processor is optional only for backwards compatibility. The
  // parameter is declared explicitly nullable because relying on a NULL
  // default to make a typed parameter nullable is deprecated in PHP 8.4.
  if ($text_processor === NULL) {
    // Callers that omit the argument get a deprecation notice and fall back
    // to the container's 'search.text_processor' service.
    @trigger_error('Calling ' . __METHOD__ . ' without $text_processor argument is deprecated in drupal:9.1.0 and will be required in drupal:10.0.0. See https://www.drupal.org/node/3078162', E_USER_DEPRECATED);
    $text_processor = \Drupal::service('search.text_processor');
  }
  $this->textProcessor = $text_processor;
}
/**
......@@ -140,7 +154,7 @@ public function index($type, $sid, $langcode, $text, $update_weights = TRUE) {
// Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty
// values.
if ($value != '') {
$words = search_index_split($value, $langcode);
$words = $this->textProcessor->process($value, $langcode);
foreach ($words as $word) {
// Add word to accumulator.
$accum .= $word . ' ';
......
......@@ -233,6 +233,8 @@ protected function parseSearchExpression() {
// Classify tokens.
$in_or = FALSE;
$limit_combinations = \Drupal::config('search.settings')->get('and_or_limit');
/** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
$text_processor = \Drupal::service('search.text_processor');
// The first search expression does not count as AND.
$and_count = -1;
$or_count = 0;
......@@ -255,7 +257,7 @@ protected function parseSearchExpression() {
// Simplify keyword according to indexing rules and external
// preprocessors. Use same process as during search indexing, so it
// will match search index.
$words = search_simplify($match[2]);
$words = $text_processor->analyze($match[2]);
// Re-explode in case simplification added more words, except when
// matching a phrase.
$words = $phrase ? [$words] : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
......
<?php
namespace Drupal\search;
use Drupal\Component\Transliteration\TransliterationInterface;
use Drupal\Component\Utility\Html;
use Drupal\Component\Utility\Unicode;
use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
/**
* Processes search text for indexing.
*/
class SearchTextProcessor implements SearchTextProcessorInterface {

  /**
   * Transliteration service, used to strip diacritics from text.
   *
   * @var \Drupal\Component\Transliteration\TransliterationInterface
   */
  protected $transliteration;

  /**
   * Config factory, used to read 'search.settings'.
   *
   * @var \Drupal\Core\Config\ConfigFactoryInterface
   */
  protected $configFactory;

  /**
   * Module handler, used to invoke hook_search_preprocess().
   *
   * @var \Drupal\Core\Extension\ModuleHandlerInterface
   */
  protected $moduleHandler;

  /**
   * Constructs a SearchTextProcessor.
   *
   * @param \Drupal\Component\Transliteration\TransliterationInterface $transliteration
   *   The transliteration service.
   * @param \Drupal\Core\Config\ConfigFactoryInterface $config_factory
   *   The config factory.
   * @param \Drupal\Core\Extension\ModuleHandlerInterface $module_handler
   *   The module handler.
   */
  public function __construct(TransliterationInterface $transliteration, ConfigFactoryInterface $config_factory, ModuleHandlerInterface $module_handler) {
    $this->moduleHandler = $module_handler;
    $this->configFactory = $config_factory;
    $this->transliteration = $transliteration;
  }

  /**
   * {@inheritdoc}
   */
  public function process(string $text, ?string $langcode = NULL): array {
    // The analyzed text is space-delimited; each piece is one word.
    return explode(' ', $this->analyze($text, $langcode));
  }

  /**
   * {@inheritdoc}
   */
  public function analyze(string $text, ?string $langcode = NULL): string {
    // Normalize the raw input: decode HTML entities to UTF-8, lowercase the
    // result, then remove diacritics.
    $text = Html::decodeEntities($text);
    $text = $this->transliteration->removeDiacritics(mb_strtolower($text));

    // Let modules implementing hook_search_preprocess() handle words first.
    $this->invokePreprocess($text, $langcode);

    // Optionally break runs of CJK characters into overlapping tokens.
    $settings = $this->configFactory->get('search.settings');
    if ($settings->get('index.overlap_cjk')) {
      $cjk_run = '/[' . self::PREG_CLASS_CJK . ']+/u';
      $text = preg_replace_callback($cjk_run, [$this, 'expandCjk'], $text);
    }

    // Numbers separated only by punctuation (dates, IP addresses, version
    // numbers) are collapsed into a single piece, so that a search for
    // '20/03/1984' also finds '20-03-1984'.
    // Readable regexp: ([number]+)[punctuation]+(?=[number])
    $number = '[' . self::PREG_CLASS_NUMBERS . ']';
    $punctuation = '[' . self::PREG_CLASS_PUNCTUATION . ']';
    $text = preg_replace('/(' . $number . '+)' . $punctuation . '+(?=' . $number . ')/u', '\1', $text);

    // Two or more consecutive dots/dashes act as a word boundary. The unicode
    // modifier is unnecessary: these are ASCII characters, which cannot match
    // part of a multi-byte UTF-8 sequence.
    $text = preg_replace('/[.-]{2,}/', ' ', $text);

    // Any remaining dot, underscore or dash is dropped outright, which gives
    // sensible matching for acronyms and URLs.
    $text = preg_replace('/[._-]+/', '', $text);

    // Everything else that counts as a word boundary (punctuation, marks,
    // spacers, ...) becomes a single space.
    $text = preg_replace('/[' . Unicode::PREG_CLASS_WORD_BOUNDARY . ']+/u', ' ', $text);

    // Truncate each word individually, then glue the words back together.
    $words = explode(' ', $text);
    foreach ($words as &$word) {
      $this->truncate($word);
    }
    // Break the reference left behind by the loop.
    unset($word);

    return implode(' ', $words);
  }

  /**
   * Invokes hook_search_preprocess() to simplify text.
   *
   * Each implementing module receives the text as returned by the previous
   * one, and its return value replaces the text.
   *
   * @param string $text
   *   Text to preprocess, passed by reference and altered in place.
   * @param string|null $langcode
   *   Language code for the language of $text, if known.
   */
  protected function invokePreprocess(string &$text, ?string $langcode = NULL): void {
    $handler = $this->moduleHandler;
    foreach ($handler->getImplementations('search_preprocess') as $module_name) {
      $text = $handler->invoke($module_name, 'search_preprocess', [$text, $langcode]);
    }
  }

  /**
   * Splits CJK (Chinese, Japanese, Korean) text into tokens.
   *
   * The Search module matches exact words, i.e. character sequences delimited
   * by spaces or punctuation. CJK text is written without such delimiters, so
   * to make it searchable it is cut into consecutive, overlapping tokens whose
   * length equals the 'index.minimum_word_size' setting. ::analyze() only
   * calls this when the 'index.overlap_cjk' setting is enabled.
   *
   * @param array $matches
   *   Regular expression matches as supplied by preg_replace_callback() from
   *   self::analyze(); $matches[0] is the run of CJK characters to tokenize.
   *
   * @return string
   *   Tokenized text, starting and ending with a space character.
   */
  protected function expandCjk(array $matches): string {
    $token_size = $this->configFactory->get('search.settings')->get('index.minimum_word_size');
    $remaining = $matches[0];
    $char_count = mb_strlen($remaining);

    // Text no longer than one token is returned whole, padded with spaces.
    if ($char_count <= $token_size) {
      return ' ' . $remaining . ' ';
    }

    $output = ' ';
    // Sliding window (FIFO) holding the characters of the current token.
    $window = [];
    for ($index = 0; $index < $char_count; $index++) {
      // Peel the leading character off the string and push it on the window.
      // The character is removed by byte length, since it may be multi-byte.
      $char = mb_substr($remaining, 0, 1);
      $remaining = substr($remaining, strlen($char));
      $window[] = $char;

      // Keep filling until the window holds a full token.
      if ($index < $token_size - 1) {
        continue;
      }

      // Emit the token, then slide the window forward by one character.
      $output .= implode('', $window) . ' ';
      array_shift($window);
    }

    return $output;
  }

  /**
   * Truncates a single word in place; helper for ::analyze().
   *
   * Numeric words first lose their leading zeros. Words longer than 50
   * characters are then cut down to their first 50 characters.
   *
   * @param string $text
   *   The text to be truncated.
   */
  protected function truncate(string &$text): void {
    if (is_numeric($text)) {
      $text = ltrim($text, '0');
    }
    if (mb_strlen($text) > 50) {
      $text = mb_substr($text, 0, 50);
    }
  }

}
<?php
namespace Drupal\search;
/**
* Processes search text for indexing.
*/
interface SearchTextProcessorInterface {
/**
* Matches all 'N' Unicode character classes (numbers).
*/
const PREG_CLASS_NUMBERS =
'\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}' .
'\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}' .
'\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}' .
'\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-' .
'\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}' .
'\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}' .
'\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}' .
'\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-' .
'\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}';
/**
* Matches all 'P' Unicode character classes (punctuation).
*/
const PREG_CLASS_PUNCTUATION =
'\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}' .
'\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}' .
'\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}' .
'\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}' .
'\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}' .
'\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}' .
'\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}' .
'\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}' .
'\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-' .
'\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}' .
'\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}' .
'\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}' .
'\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}' .
'\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-' .
'\x{ff65}';
/**
* Matches CJK (Chinese, Japanese, Korean) letter-like characters.
*
* This list is derived from the "East Asian Scripts" section of
* http://www.unicode.org/charts/index.html, as well as a comment on
* http://unicode.org/reports/tr11/tr11-11.html listing some character
* ranges that are reserved for additional CJK ideographs.
*
* The character ranges do not include numbers, punctuation, or symbols, since
* these are handled separately in search. Note that radicals and strokes are
* considered symbols. (See
* http://www.unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt)
*
* @see \Drupal\search\SearchTextProcessor::expandCjk()
*/
const PREG_CLASS_CJK =
'\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
'\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
'\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
'\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
'\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}';
/**
* Processes text into words for indexing.