Commit 040e6275 authored by catch

Issue #3001997 by Krzysztof Domański, scott_euser, alexpott: Transliteration a...

Issue #3001997 by Krzysztof Domański, scott_euser, alexpott: Transliteration a string containing an unknown character (e.g. 0x80) is not valid

(cherry picked from commit f9e7921b)
parent 597d9915
......@@ -107,6 +107,29 @@ public function removeDiacritics($string) {
public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
$result = '';
$length = 0;
$hash = FALSE;
// Replace question marks with a unique hash if necessary. This is because
// mb_convert_encoding() replaces all invalid characters with a question
// mark.
if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
$hash = hash('sha256', $string);
$string = str_replace('?', $hash, $string);
}
// Ensure the string is valid UTF8 for preg_split(). Unknown characters will
// be replaced by a question mark.
$string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
// Use the provided unknown character instead of a question mark.
if ($unknown_character != '?') {
$string = str_replace('?', $unknown_character, $string);
// Restore original question marks if necessary.
if ($hash !== FALSE) {
$string = str_replace($hash, '?', $string);
}
}
// Split into Unicode characters and transliterate each one.
foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
$code = self::ordUTF8($character);
......
......@@ -142,8 +142,6 @@ public function providerTestPhpTransliteration() {
// Test strings in some other languages.
// Turkish, provided by drupal.org user Kartagis.
['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
// Illegal/unknown unicode.
['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?'],
// Max length.
['de', $two_byte, 'Ae Oe', '?', 5],
];
......@@ -164,6 +162,60 @@ public function testTransliterationWithMaxLength() {
$this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 18), 'Truncating to 18 characters works');
}
/**
 * Checks how transliterate() substitutes the unknown character.
 *
 * @param string $langcode
 *   Language code handed to the transliterator.
 * @param string $original
 *   Input string to transliterate.
 * @param string $expected
 *   String that PhpTransliteration::transliterate() must return.
 * @param string $unknown_character
 *   Value passed as the unknown-character argument of transliterate().
 *   Defaults to '?'.
 * @param int|null $max_length
 *   Value passed as the maximum-length argument of transliterate(). Defaults
 *   to NULL.
 *
 * @dataProvider providerTestTransliterationUnknownCharacter
 */
public function testTransliterationUnknownCharacter($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
  $transliterated = (new PhpTransliteration())->transliterate($original, $langcode, $unknown_character, $max_length);
  $this->assertSame($expected, $transliterated);
}
/**
 * Data provider for self::testTransliterationUnknownCharacter().
 *
 * @return array
 *   A list of argument sets. Each set holds, in order: language code, input
 *   string, expected output and, optionally, the unknown character and the
 *   maximum length.
 */
public function providerTestTransliterationUnknownCharacter() {
  // Byte 0x80 on its own; the cases below expect it to be reported as an
  // unknown character.
  $invalid_byte = chr(0x80);
  // Byte 0xF8 followed by four 0x80 bytes; the cases below expect one
  // replacement per byte.
  $invalid_sequence = chr(0xF8) . str_repeat($invalid_byte, 4);

  $cases = [];

  // Illegal/unknown unicode.
  $cases[] = ['en', $invalid_sequence, '?????'];
  $cases[] = ['en', $invalid_sequence, '-----', '-'];
  $cases[] = ['en', 'Hel' . $invalid_byte . 'o World', 'Hel?o World'];
  $cases[] = ['en', 'Hell' . $invalid_byte . ' World', 'Hell? World'];

  // Non default replacement.
  $cases[] = ['en', $invalid_byte . 'ello World', '_ello World', '_'];

  // Keep the original question marks.
  $cases[] = ['en', chr(0xF8) . '?' . $invalid_byte, '???'];
  $cases[] = ['en', $invalid_byte . 'ello ? World?', '_ello ? World?', '_'];
  $cases[] = ['pl', 'aąeę' . $invalid_byte . 'oółżźz ?', 'aaee?oolzzz ?'];

  // Non-US-ASCII replacement.
  $cases[] = ['en', $invalid_byte . 'ello World?', 'Oello World?', 'Ö'];
  $cases[] = ['pl', $invalid_byte . 'óóść', 'ooosc', 'ó'];

  // Ensure question marks are replaced when max length used.
  $cases[] = ['en', $invalid_byte . 'ello ? World?', '_ello ?', '_', 7];

  return $cases;
}
/**
* Tests inclusion is safe.
*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment