Commit ba500d47 authored by alexpott

Issue #731298 by pjonckiere, jhodgdon: Searches for words with...

Issue #731298 by pjonckiere, jhodgdon: Searches for words with diacritics/accents: word not highlighted in results
parent 583af3dd
......@@ -75,6 +75,37 @@ public function __construct($data_directory = NULL) {
$this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
}
/**
 * {@inheritdoc}
 */
public function removeDiacritics($string) {
  // Split the input into individual UTF-8 characters. preg_split() returns
  // FALSE when the subject is not valid UTF-8; iterating over FALSE would
  // raise a warning and make this method return an empty string, silently
  // discarding the text. Hand such input back untouched instead.
  $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
  if ($characters === FALSE) {
    return $string;
  }

  // Two Unicode ranges (checked per character below) include the accented
  // US-ASCII letters, with a few characters that aren't accented letters
  // mixed in. The excluded characters are the same for every character, so
  // they are defined once, outside the loop.
  $exclusions_range1 = array(0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b);
  $exclusions_range2 = array(0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245);

  $result = '';
  foreach ($characters as $character) {
    $code = self::ordUTF8($character);
    $range1 = $code > 0x00bf && $code < 0x017f;
    $range2 = $code > 0x01cc && $code < 0x0250;

    // Unless a suitable replacement is found, the character is kept as is.
    $replacement = $character;
    if (($range1 && !in_array($code, $exclusions_range1)) || ($range2 && !in_array($code, $exclusions_range2))) {
      // The 'xyz' fallback is deliberately longer than one character, so that
      // a character without a mapping fails the length check below.
      $to_add = $this->lookupReplacement($code, 'xyz');
      // Only accept replacements that are exactly one byte long; anything
      // else (for instance a multi-letter transliteration) leaves the
      // original character in place.
      if (strlen($to_add) === 1) {
        $replacement = $to_add;
      }
    }
    $result .= $replacement;
  }

  return $result;
}
/**
* {@inheritdoc}
*/
......@@ -152,7 +183,8 @@ protected static function ordUTF8($character) {
*
* @return string
* US-ASCII replacement character. If it has a mapping, it is returned;
* otherwise, $unknown_character is returned.
* otherwise, $unknown_character is returned. The replacement can contain
* multiple characters.
*/
protected function replace($code, $langcode, $unknown_character) {
if ($code < 0x80) {
......@@ -168,6 +200,24 @@ protected function replace($code, $langcode, $unknown_character) {
return $this->languageOverrides[$langcode][$code];
}
return $this->lookupReplacement($code, $unknown_character);
}
/**
* Look up the generic replacement for a UTF-8 character code.
*
* @param int $code
* The UTF-8 character code.
* @param string $unknown_character
* (optional) The character to substitute for characters without entries in
* the replacement tables.
*
* @return string
* US-ASCII replacement characters. If it has a mapping, it is returned;
* otherwise, $unknown_character is returned. The replacement can contain
* multiple characters.
*/
protected function lookupReplacement($code, $unknown_character = '?') {
// See if there is a generic mapping for this character.
$bank = $code >> 8;
if (!isset($this->genericMap[$bank])) {
......
......@@ -14,6 +14,22 @@
*/
interface TransliterationInterface {
/**
 * Removes diacritics (accents) from certain letters.
 *
 * This only applies to accented Latin letters, such as a-with-acute-accent,
 * whose Unicode code points lie in the ranges 0x00C0 to 0x017E and 0x01CD to
 * 0x024F. Characters in those ranges that are not accented US-ASCII letters
 * are left untouched, as are letters whose replacement would change the
 * length of the string (for example, a letter that transliterates to two
 * US-ASCII characters).
 *
 * @param string $string
 *   The string holding diacritics.
 *
 * @return string
 *   $string with accented letters replaced by their unaccented equivalents.
 */
public function removeDiacritics($string);
/**
* Transliterates text from Unicode to US-ASCII.
*
......
......@@ -241,6 +241,9 @@ function search_simplify($text, $langcode = NULL) {
// Lowercase
$text = Unicode::strtolower($text);
// Remove diacritics.
$text = \Drupal::service('transliteration')->removeDiacritics($text);
// Call an external processor for word handling.
search_invoke_preprocess($text, $langcode);
......
<?php
/**
* @file
* Contains \Drupal\search\Tests\SearchNodeDiacriticsTest.
*/
namespace Drupal\search\Tests;
/**
 * Tests search functionality with diacritics.
 *
 * @group search
 */
class SearchNodeDiacriticsTest extends SearchTestBase {

  /**
   * A user with permission to use advanced search.
   *
   * @var \Drupal\user\UserInterface
   */
  public $testUser;

  /**
   * {@inheritdoc}
   */
  protected function setUp() {
    parent::setUp();

    node_access_rebuild();

    // Create a test user and log in.
    $this->testUser = $this->drupalCreateUser(array('access content', 'search content', 'use advanced search', 'access user profiles'));
    $this->drupalLogin($this->testUser);
  }

  /**
   * Tests that search returns results with diacritics in the search phrase.
   */
  public function testPhraseSearchPunctuation() {
    $body_text = 'The Enricþment Center is cómmīŦŧęđ to the well BɆĬŇĜ of æll påŔťıçȉpǎǹţș. ';
    $body_text .= 'Also meklēt (see #731298)';
    $this->drupalCreateNode(array('body' => array(array('value' => $body_text))));

    // Update the search index.
    $this->container->get('plugin.manager.search')->createInstance('node_search')->updateIndex();
    search_update_totals();

    // Refresh variables after the treatment.
    $this->refreshVariables();

    // A word containing an accented letter is highlighted whether the search
    // keys are typed with or without the accent.
    $this->assertSearchHighlighting('meklet', array('meklēt'));
    $this->assertSearchHighlighting('meklēt', array('meklēt'));

    // The same holds for several heavily accented words searched at once.
    $accented_words = array('cómmīŦŧęđ', 'BɆĬŇĜ', 'påŔťıçȉpǎǹţș');
    $this->assertSearchHighlighting('cómmīŦŧęđ BɆĬŇĜ påŔťıçȉpǎǹţș', $accented_words);
    $this->assertSearchHighlighting('committed being participants', $accented_words);

    // Words containing 'þ' or 'æ' are highlighted when searched for verbatim,
    // but not when searched for with a US-ASCII spelling.
    $this->assertSearchHighlighting('Enricþment', array('Enricþment'));
    $this->assertSearchHighlighting('Enritchment', array(), array('Enricþment'));
    $this->assertSearchHighlighting('æll', array('æll'));
    $this->assertSearchHighlighting('all', array(), array('æll'));
  }

  /**
   * Submits a node search and checks which words are highlighted.
   *
   * @param string $keys
   *   The search keys to submit on the search/node form.
   * @param string[] $highlighted
   *   Words that must appear wrapped in <strong> tags in the raw output.
   * @param string[] $not_highlighted
   *   (optional) Words that must not appear wrapped in <strong> tags in the
   *   raw output.
   */
  protected function assertSearchHighlighting($keys, array $highlighted, array $not_highlighted = array()) {
    $this->drupalPostForm('search/node', array('keys' => $keys), t('Search'));
    foreach ($highlighted as $word) {
      $this->assertRaw('<strong>' . $word . '</strong>');
    }
    foreach ($not_highlighted as $word) {
      $this->assertNoRaw('<strong>' . $word . '</strong>');
    }
  }

}
......@@ -20,6 +20,58 @@
*/
class PhpTransliterationTest extends UnitTestCase {
/**
 * Tests the PhpTransliteration::removeDiacritics() function.
 *
 * @param string $original
 *   The string to remove diacritics from.
 * @param string $expected
 *   The expected return from PhpTransliteration::removeDiacritics().
 *
 * @dataProvider providerTestPhpTransliterationRemoveDiacritics
 */
public function testRemoveDiacritics($original, $expected) {
  $transliteration = new PhpTransliteration();
  $result = $transliteration->removeDiacritics($original);
  // Use a strict comparison so the result must be the identical string, not
  // merely a loosely equal value.
  $this->assertSame($expected, $result);
}
/**
 * Provides data for self::testRemoveDiacritics().
 *
 * Each row pairs an input string with the expected result. Characters that
 * are expected back unchanged are either not accented US-ASCII letters or
 * have no single-character replacement (for example ligatures and digraphs).
 *
 * @return array
 *   An array of arrays, each containing the parameters for
 *   self::testRemoveDiacritics().
 */
public function providerTestPhpTransliterationRemoveDiacritics() {
  return array(
    // Test all characters in the Unicode range 0x00C0 to 0x017E.
    array('ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'),
    array('ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'),
    array('àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'),
    array('ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'),
    array('ĀāĂăĄąĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'),
    array('ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'),
    array('ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'),
    array('İıĲĳĴĵĶķĸĹĺĻļĽľĿ', 'IiĲĳJjKkĸLlLlLlL'),
    array('ŀŁłŃńŅņŇňŉŊŋŌōŎŏ', 'lLlNnNnNnŉŊŋOoOo'),
    array('ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'),
    array('ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'),
    array('ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'),

    // Test all characters in the Unicode range 0x01CD to 0x024F.
    array('ǍǎǏ', 'AaI'),
    array('ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'),
    array('ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'),
    array('ǰǱǲǳǴǵǶǷǸǹǺǻǼǽǾǿ', 'jǱǲǳGgǶǷNnAaǼǽOo'),
    array('ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'),
    array('ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'),
    array('ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'),
    array('ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'),
    array('ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'),
  );
}
/**
* Tests the PhpTransliteration class.
*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment