Commit c380576b authored by drunken monkey

Issue #2274279 by drunken monkey: Rework the conversion from "fulltext" to tokenized_text

parent 5bce36c7
......@@ -12,6 +12,7 @@ use Drupal\Core\Datetime\DateFormatterInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\KeyValueStore\KeyValueStoreInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextToken;
use Psr\Log\LoggerInterface;
use Drupal\Core\Logger\RfcLogLevel;
use Drupal\Core\Render\Element;
......@@ -1138,7 +1139,7 @@ class Database extends BackendPluginBase {
foreach ($field->getValues() as $field_value) {
$converted_value = $this->convert($field_value, $type, $field->getOriginalType(), $index);
// Don't add NULL values to the return array. Also, adding an empty
// Don't add NULL values to the array of values. Also, adding an empty
// array is, of course, a waste of time.
if (isset($converted_value) && $converted_value !== array()) {
$values = array_merge($values, is_array($converted_value) ? $converted_value : array($converted_value));
......@@ -1159,7 +1160,7 @@ class Database extends BackendPluginBase {
$db_info['field_tables'][$field_id]['multi-valued'] = TRUE;
}
if (Utility::isTextType($type, array('text', 'tokenized_text'))) {
if (Utility::isTextType($type)) {
// Remember the text table the first time we encounter it.
if (!isset($text_table)) {
$text_table = $table;
......@@ -1167,9 +1168,10 @@ class Database extends BackendPluginBase {
$unique_tokens = array();
$denormalized_value = '';
/** @var \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface $token */
foreach ($values as $token) {
$word = $token['value'];
$score = $token['score'];
$word = $token->getText();
$score = $token->getBoost();
// Store the first 30 characters of the string as the denormalized
// value.
......@@ -1310,68 +1312,54 @@ class Database extends BackendPluginBase {
protected function convert($value, $type, $original_type, IndexInterface $index) {
if (!isset($value)) {
// For text fields, we have to return an array even if the value is NULL.
return Utility::isTextType($type, array('text', 'tokenized_text')) ? array() : NULL;
return Utility::isTextType($type) ? array() : NULL;
}
switch ($type) {
case 'text':
// For dates, splitting the timestamp makes no sense.
if ($original_type == 'date') {
$value = $this->getDateFormatter()->format($value, 'custom', 'Y y F M n m j d l D');
}
$ret = array();
foreach (preg_split('/[^\p{L}\p{N}]+/u', $value, -1, PREG_SPLIT_NO_EMPTY) as $v) {
if ($v) {
if (Unicode::strlen($v) > 50) {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing: %word.<br />Database search servers currently cannot index such words correctly – the word was therefore trimmed to the allowed length. Ensure you are using a tokenizer preprocessor.', array('%word' => $v));
$v = Unicode::substr($v, 0, 50);
/** @var \Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface $value */
$tokens = $value->getTokens();
if ($tokens === NULL) {
$tokens = array();
$text = $value->getText();
// For dates, splitting the timestamp makes no sense.
if ($original_type == 'date') {
$text = $this->getDateFormatter()
->format($text, 'custom', 'Y y F M n m j d l D');
}
foreach (static::splitIntoWords($text) as $word) {
if ($word) {
if (Unicode::strlen($word) > 50) {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing: %word.<br />Since database search servers currently cannot index words of more than 50 characters, the word was truncated for indexing. If this should not be a single word, please make sure the "Tokenizer" processor is enabled and configured correctly for index %index.', array('%word' => $word, '%index' => $index->label()));
$word = Unicode::substr($word, 0, 50);
}
$tokens[] = new TextToken($word);
}
$ret[] = array(
'value' => $v,
'score' => 1,
);
}
}
// This used to fall through the tokenized case.
return $ret;
case 'tokenized_text':
while (TRUE) {
foreach ($value as $i => $v) {
// Check for over-long tokens.
$score = $v['score'];
$v = $v['value'];
if (Unicode::strlen($v) > 50) {
$words = preg_split('/[^\p{L}\p{N}]+/u', $v, -1, PREG_SPLIT_NO_EMPTY);
if (count($words) > 1 && max(array_map('Drupal\Component\Utility\Unicode::strlen', $words)) <= 50) {
// Overlong token is due to bad tokenizing.
// Check for "Tokenizer" preprocessor on index.
if (empty($index->getProcessors()['tokenizer'])) {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing, due to bad tokenizing. It is recommended to enable the "Tokenizer" preprocessor for indexes using database servers. Otherwise, the backend class has to use its own, fixed tokenizing.');
}
else {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing, due to bad tokenizing. Please check your settings for the "Tokenizer" preprocessor to ensure that data is tokenized correctly.');
}
}
$tokens = array();
foreach ($words as $word) {
if (Unicode::strlen($word) > 50) {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing: %word.<br />Database search servers currently cannot index such words correctly – the word was therefore trimmed to the allowed length.', array('%word' => $word));
$word = Unicode::substr($word, 0, 50);
else {
while (TRUE) {
foreach ($tokens as $i => $token) {
// Check for over-long tokens.
$score = $token->getBoost();
$word = $token->getText();
if (Unicode::strlen($word) > 50) {
$new_tokens = array();
foreach (static::splitIntoWords($word) as $word) {
if (Unicode::strlen($word) > 50) {
$this->getLogger()->warning('An overlong word (more than 50 characters) was encountered while indexing: %word.<br />Since database search servers currently cannot index words of more than 50 characters, the word was truncated for indexing. If this should not be a single word, please make sure the "Tokenizer" processor is enabled and configured correctly for index %index.', array('%word' => $word, '%index' => $index->label()));
$word = Unicode::substr($word, 0, 50);
}
$new_tokens[] = new TextToken($word, $score);
}
$tokens[] = array(
'value' => $word,
'score' => $score,
);
array_splice($tokens, $i, 1, $new_tokens);
// Restart the loop looking through all the tokens.
continue 2;
}
array_splice($value, $i, 1, $tokens);
// Restart the loop looking through all the tokens.
continue 2;
}
break;
}
break;
}
return $value;
return $tokens;
case 'string':
case 'uri':
......@@ -1404,6 +1392,21 @@ class Database extends BackendPluginBase {
}
}
/**
 * Splits the given string into words.
 *
 * Word characters as seen by this method are only alphanumerics (Unicode
 * letters and numbers); any run of other characters acts as a separator.
 *
 * @param string $text
 *   The string to split.
 *
 * @return string[]
 *   All groups of alphanumeric characters contained in the string. An empty
 *   array is returned if the string contains no such characters or if it
 *   could not be split (for instance, because it is not valid UTF-8).
 */
protected static function splitIntoWords($text) {
  $words = preg_split('/[^\p{L}\p{N}]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
  // preg_split() returns FALSE on failure (e.g., malformed UTF-8 combined
  // with the "u" modifier). Callers iterate over and count the result, so
  // always hand back an array as documented.
  return $words === FALSE ? array() : $words;
}
/**
* {@inheritdoc}
*/
......@@ -1729,7 +1732,7 @@ class Database extends BackendPluginBase {
$this->ignored[$keys] = 1;
return NULL;
}
$words = preg_split('/[^\p{L}\p{N}]+/u', $processed_keys, -1, PREG_SPLIT_NO_EMPTY);
$words = static::splitIntoWords($processed_keys);
if (count($words) > 1) {
$processed_keys = $this->splitKeys($words);
if ($processed_keys) {
......@@ -2389,7 +2392,7 @@ class Database extends BackendPluginBase {
// Also collect all keywords already contained in the query so we don't
// suggest them.
$keys = preg_split('/[^\p{L}\p{N}]+/u', $user_input, -1, PREG_SPLIT_NO_EMPTY);
$keys = static::splitIntoWords($user_input);
$keys = array_combine($keys, $keys);
if ($incomplete_key) {
$keys[$incomplete_key] = $incomplete_key;
......
......@@ -49,12 +49,12 @@ class SearchApiDataType extends Plugin {
public $default = FALSE;
/**
* The fallback data type for this data type.
* The ID of the fallback data type for this data type.
*
* Needs to be one of the default data types defined in the Search API itself.
*
* @var string
*/
public $fallback_type = 'text';
public $fallback_type = 'string';
}
......@@ -146,10 +146,6 @@ interface BackendSpecificInterface {
* The search index for which items should be indexed.
* @param \Drupal\search_api\Item\ItemInterface[] $items
* An array of items to be indexed, keyed by their item IDs.
* The value of fields with the "tokenized_text" type is an array of tokens.
* Each token is an array containing the following keys:
* - value: The word that the token represents.
* - score: A score for the importance of that word.
*
* @return string[]
* The IDs of all items that were successfully indexed.
......
......@@ -98,7 +98,7 @@ abstract class DataTypePluginBase extends PluginBase implements DataTypeInterfac
* {@inheritdoc}
*/
public function getFallbackType() {
return !empty($this->pluginDefinition['fallback_type']) ? $this->pluginDefinition['fallback_type'] : 'text';
return !empty($this->pluginDefinition['fallback_type']) ? $this->pluginDefinition['fallback_type'] : 'string';
}
/**
......
......@@ -3,10 +3,20 @@
namespace Drupal\search_api\Plugin\search_api\data_type;
use Drupal\search_api\DataType\DataTypePluginBase;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValue;
/**
* Provides a full text data type.
*
* This data type uses objects of type
* \Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface for
* its values.
*
* The same is expected of all data types that specify this type as their
* fallback.
*
* @see \Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface
*
* @SearchApiDataType(
* id = "text",
* label = @Translation("Fulltext"),
......@@ -16,4 +26,11 @@ use Drupal\search_api\DataType\DataTypePluginBase;
*/
class TextDataType extends DataTypePluginBase {
/**
 * {@inheritdoc}
 */
public function getValue($value) {
  // Fulltext values are wrapped in a text value object, built from the
  // string form of the given value.
  $text = (string) $value;
  return new TextValue($text);
}
}
<?php
namespace Drupal\search_api\Plugin\search_api\data_type\value;
/**
 * Represents a single text token contained in a fulltext field's value.
 *
 * A token pairs a piece of text with a boost value.
 *
 * @see \Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface
 */
class TextToken implements TextTokenInterface {

  /**
   * The actual text value of this token.
   *
   * @var string
   */
  protected $text;

  /**
   * The boost value for this token.
   *
   * @var float
   */
  protected $boost = 1.0;

  /**
   * Constructs a TextToken object.
   *
   * @param string $text
   *   The text value of the token.
   * @param float $boost
   *   (optional) The boost for the token. Defaults to 1.0.
   */
  public function __construct($text, $boost = 1.0) {
    $this->text = $text;
    $this->boost = $boost;
  }

  /**
   * {@inheritdoc}
   */
  public function getText() {
    return $this->text;
  }

  /**
   * {@inheritdoc}
   */
  public function setText($text) {
    $this->text = $text;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function getBoost() {
    return $this->boost;
  }

  /**
   * {@inheritdoc}
   */
  public function setBoost($boost) {
    $this->boost = $boost;
    return $this;
  }

  /**
   * Implements the magic __toString() method.
   *
   * @return string
   *   The text of this token, cast to a string.
   */
  public function __toString() {
    // __toString() has to return a string, but neither the constructor nor
    // setText() validates the stored text. Cast so that converting a token
    // with a NULL or otherwise non-string text does not raise an error.
    return (string) $this->getText();
  }

}
<?php
namespace Drupal\search_api\Plugin\search_api\data_type\value;
/**
 * Provides an interface for text tokens.
 *
 * A text token consists of a text value and a boost for that text.
 */
interface TextTokenInterface {
/**
 * Retrieves the text value of this token.
 *
 * @return string
 *   The text value of this token.
 */
public function getText();
/**
 * Sets the text value of this token.
 *
 * @param string $text
 *   The new text value of this token.
 *
 * @return $this
 */
public function setText($text);
/**
 * Retrieves the boost for this token.
 *
 * @return float
 *   The boost for this token.
 */
public function getBoost();
/**
 * Sets the boost for this token.
 *
 * @param float $boost
 *   The new boost for this token.
 *
 * @return $this
 */
public function setBoost($boost);
}
<?php
namespace Drupal\search_api\Plugin\search_api\data_type\value;
/**
 * Represents a single value of a fulltext field.
 *
 * Besides the current and the original text, a value can carry the tokens it
 * was split into and an arbitrary set of properties.
 */
class TextValue implements TextValueInterface {

  /**
   * The current text value.
   *
   * @var string
   */
  protected $text;

  /**
   * The original text value.
   *
   * @var string
   */
  protected $originalText;

  /**
   * The tokens created for this text value (if any).
   *
   * @var \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]|null
   */
  protected $tokens;

  /**
   * An array of properties for this text value.
   *
   * @var array
   */
  protected $properties = array();

  /**
   * Constructs a TextValue object.
   *
   * @param string $text
   *   The original text value. It is also used as the initial current text.
   */
  public function __construct($text) {
    $this->text = $this->originalText = $text;
  }

  /**
   * {@inheritdoc}
   */
  public function getText() {
    return $this->text;
  }

  /**
   * {@inheritdoc}
   */
  public function toText() {
    $tokens = $this->getTokens();
    // Once tokens have been set, they (and not the stored text) determine
    // the effective text of this value.
    if ($tokens !== NULL) {
      $to_string = function (TextTokenInterface $token) {
        return $token->getText();
      };
      return implode(' ', array_map($to_string, $tokens));
    }
    return $this->getText();
  }

  /**
   * {@inheritdoc}
   */
  public function setText($text) {
    $this->text = $text;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function getOriginalText() {
    return $this->originalText;
  }

  /**
   * {@inheritdoc}
   */
  public function setOriginalText($originalText) {
    $this->originalText = $originalText;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function getTokens() {
    return $this->tokens;
  }

  /**
   * {@inheritdoc}
   */
  public function setTokens(array $tokens = NULL) {
    $this->tokens = $tokens;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function getProperties() {
    return $this->properties;
  }

  /**
   * {@inheritdoc}
   */
  public function getProperty($name, $default = NULL) {
    // array_key_exists() (rather than isset()) so that a property explicitly
    // set to NULL is returned instead of the default.
    if (array_key_exists($name, $this->properties)) {
      return $this->properties[$name];
    }
    return $default;
  }

  /**
   * {@inheritdoc}
   */
  public function setProperties($properties) {
    $this->properties = $properties;
    return $this;
  }

  /**
   * {@inheritdoc}
   */
  public function setProperty($name, $value = TRUE) {
    $this->properties[$name] = $value;
    return $this;
  }

  /**
   * Implements the magic __toString() method.
   *
   * @return string
   *   The effective text of this value, cast to a string.
   */
  public function __toString() {
    // Without tokens, toText() returns the stored text unchanged, and
    // setText() does not validate what it is given. Cast so that string
    // conversion still works when the stored text is NULL or non-string.
    return (string) $this->toText();
  }

}
<?php
namespace Drupal\search_api\Plugin\search_api\data_type\value;
/**
 * Provides an interface for fulltext field values.
 *
 * A fulltext value holds a current and an original text, optionally the
 * tokens the text was split into, and a set of named properties.
 */
interface TextValueInterface {
/**
 * Retrieves the currently stored text value.
 *
 * @return string
 *   The currently stored text value.
 *
 * @see \Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface::toText()
 */
public function getText();
/**
 * Retrieves the current effective text value.
 *
 * This will be a concatenation of all tokens' text values, if tokens have
 * been set, or the currently stored text value otherwise.
 *
 * @return string
 *   The effective text value for this object.
 */
public function toText();
/**
 * Sets the currently stored text value.
 *
 * @param string $text
 *   The new text value.
 *
 * @return $this
 */
public function setText($text);
/**
 * Retrieves the original text value.
 *
 * @return string
 *   The original text value.
 */
public function getOriginalText();
/**
 * Sets the original text value.
 *
 * @param string $originalText
 *   The new original text value.
 *
 * @return $this
 */
public function setOriginalText($originalText);
/**
 * Retrieves the text tokens this text value was split into, if any.
 *
 * @return \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]|null
 *   The text tokens this text value was split into, or NULL if the value has
 *   not been tokenized in any way yet.
 */
public function getTokens();
/**
 * Sets the text tokens for the text value.
 *
 * @param \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]|null $tokens
 *   The new text tokens, or NULL to remove them.
 *
 * @return $this
 */
public function setTokens(array $tokens = NULL);
/**
 * Retrieves the properties set for this text value.
 *
 * @return array
 *   An associative array of properties. Known properties include:
 *   - lowercase: Whether the value has been lowercased (type: bool)
 *   - tokenized: Whether the value has been tokenized into individual words
 *     (type: bool)
 *   - strip_html: Whether HTML has been stripped from this value (type: bool)
 */
public function getProperties();
/**
 * Retrieves a specific property of this text value.
 *
 * @param string $name
 *   The property's name.
 * @param mixed $default
 *   (optional) The default to return if the property wasn't set yet.
 *
 * @return mixed
 *   Either the property's value, or the given $default if it wasn't set yet.
 */
public function getProperty($name, $default = NULL);
/**
 * Sets the properties of this text value.
 *
 * @param array $properties
 *   An associative array of properties.
 *
 * @return $this
 */
public function setProperties($properties);
/**
 * Sets a specific property of this text value.
 *
 * @param string $name
 *   The property's name.
 * @param mixed $value
 *   (optional) The value to set for the property. Defaults to TRUE.
 *
 * @return $this
 */
public function setProperty($name, $value = TRUE);
}
......@@ -5,6 +5,8 @@ namespace Drupal\search_api\Plugin\search_api\processor;
use Drupal\Component\Utility\Html;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Url;
use Drupal\search_api\Item\FieldInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
use Drupal\search_api\Utility;
use Symfony\Component\Yaml\Dumper;
......@@ -132,11 +134,24 @@ class HtmlFilter extends FieldsProcessorPluginBase {
/**
* {@inheritdoc}
*/
protected function processFieldValue(&$value, &$type) {
protected function processField(FieldInterface $field) {
parent::processField($field);
foreach ($field->getValues() as $value) {
if ($value instanceof TextValueInterface) {
$value->setProperty('strip_html');
}
}
}
/**
* {@inheritdoc}
*/
protected function processFieldValue(&$value, $type) {
// Remove invisible content.
$text = preg_replace('@<(applet|audio|canvas|command|embed|iframe|map|menu|noembed|noframes|noscript|script|style|svg|video)[^>]*>.*</\1>@siU', ' ', $value);
// Let removed tags still delimit words.
$is_text_type = Utility::isTextType($type, array('text', 'tokenized_text'));
$is_text_type = Utility::isTextType($type);
if ($is_text_type) {
$text = str_replace(array('<', '>'), array(' <', '> '), $text);
if ($this->configuration['title']) {
......@@ -149,7 +164,6 @@ class HtmlFilter extends FieldsProcessorPluginBase {
if ($this->configuration['tags'] && $is_text_type) {
$text = strip_tags($text, '<' . implode('><', array_keys($this->configuration['tags'])) . '>');
$value = $this->parseHtml($text);
$type = 'tokenized_text';
}
else {
$text = strip_tags($text);
......@@ -181,7 +195,7 @@ class HtmlFilter extends FieldsProcessorPluginBase {
* @param float $boost
* (optional) The currently active boost value. Internal use only.
*
* @return array
* @return \Drupal\search_api\Plugin\search_api\data_type\value\TextTokenInterface[]
* Tokenized text with appropriate scores.
*/
protected function parseHtml(&$text, $active_tag = NULL, $boost = 1.0) {
......
......@@ -3,6 +3,8 @@
namespace Drupal\search_api\Plugin\search_api\processor;
use Drupal\Component\Utility\Unicode;
use Drupal\search_api\Item\FieldInterface;
use Drupal\search_api\Plugin\search_api\data_type\value\TextValueInterface;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
/**
......@@ -20,6 +22,19 @@ use Drupal\search_api\Processor\FieldsProcessorPluginBase;
*/
class IgnoreCase extends FieldsProcessorPluginBase {
/**
* {@inheritdoc}
*/
protected function processField(FieldInterface $field) {