Commit fbec0a1e authored by catch

Issue #1858376 by jhodgdon: Provide tests and documentation for long Unicode characters.

parent 63792c76
......@@ -172,7 +172,8 @@ protected function replace($code, $langcode, $unknown_character) {
* PHPTransliteration::$dataDirectory. These files should set up an array
* variable $overrides with an element whose key is $langcode and whose value
* is an array whose keys are character codes, and whose values are their
* transliterations in this language.
* transliterations in this language. The character codes can be for any valid
* Unicode character, independent of the number of bytes.
*
* @param $langcode
* Code for the language to read.
......@@ -200,7 +201,8 @@ protected function readLanguageOverrides($langcode) {
* hexadecimal notation) in PHPTransliteration::$dataDirectory. These files
* should set up a variable $bank containing an array whose numerical indices
* are the remaining two bytes of the character code, and whose values are the
* transliterations of these characters into US-ASCII.
* transliterations of these characters into US-ASCII. Note that the maximum
* Unicode character that can be encoded in this way is 4 bytes.
*
* @param $bank
* First two bytes of the Unicode character, or 0 for the ASCII range.
......
......@@ -13,6 +13,7 @@
* Enhances PHPTransliteration with an alter hook.
*
* @ingroup transliteration
* @see hook_transliteration_overrides_alter()
*/
class PHPTransliteration extends BaseTransliteration {
......
......@@ -177,11 +177,14 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
* vs. initial capital letter only) is not taken into account, and in
* transliterations of capital letters that result in two or more letters, by
* convention only the first is capitalized in the Drupal transliteration
* result. So, the process has limitations; however, since the reason for
* transliteration is typically to create machine names or file names, this
* should not really be a problem. After transliteration, other transformation
* or validation may be necessary, such as converting spaces to another
* character, removing non-printable characters, lower-casing, etc.
* result. Also, only Unicode characters of 4 bytes or less can be
* transliterated in the base system; language-specific overrides can be made
* for longer Unicode characters. So, the process has limitations; however,
* since the reason for transliteration is typically to create machine names or
* file names, this should not really be a problem. After transliteration,
* other transformation or validation may be necessary, such as converting
* spaces to another character, removing non-printable characters,
* lower-casing, etc.
*
* Here is a code snippet to transliterate some text:
* @code
......@@ -196,13 +199,20 @@ function hook_language_fallback_candidates_alter(array &$fallback_candidates) {
* Drupal Core provides the generic transliteration character tables and
* overrides for a few common languages; modules can implement
* hook_transliteration_overrides_alter() to provide further language-specific
* overrides. Modules can also completely override the transliteration classes
* in \Drupal\Core\CoreBundle.
* overrides (including providing transliteration for Unicode characters that
* are longer than 4 bytes). Modules can also completely override the
* transliteration classes in \Drupal\Core\CoreBundle.
*/
/**
* Provide language-specific overrides for transliteration.
*
* If the overrides you want to provide are standard for your language, consider
* providing a patch for the Drupal Core transliteration system instead of using
* this hook. This hook can be used temporarily until Drupal Core's
* transliteration tables are fixed, or for sites that want to use a
* non-standard transliteration system.
*
* @param array $overrides
* Associative array of language-specific overrides whose keys are integer
* Unicode character codes, and whose values are the transliterations of those
......
......@@ -43,6 +43,13 @@ public function testPHPTransliteration() {
// This is a Canadian Aboriginal character like a triangle. See
// http://www.unicode.org/charts/PDF/U1400.pdf
$four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
// These are two Gothic alphabet letters. See
// http://en.wikipedia.org/wiki/Gothic_alphabet
// They are not in our tables, but should at least give us '?' (unknown).
$five_byte = html_entity_decode('𐌰𐌸', ENT_NOQUOTES, 'UTF-8');
// Five-byte characters do not work in MySQL, so make a printable version.
$five_byte_printable = '𐌰𐌸';
$cases = array(
// Each test case is (language code, input, output).
// Test ASCII in English.
......@@ -55,6 +62,8 @@ public function testPHPTransliteration() {
// directly from the data files.
array('fr', $three_byte, 'c'),
array('fr', $four_byte, 'wii'),
// Test 5-byte characters.
array('en', $five_byte, '??', $five_byte_printable),
// Test a language with no overrides.
array('en', $two_byte, 'A O U A O aouaohello'),
// Test language overrides provided by core.
......@@ -64,9 +73,10 @@ public function testPHPTransliteration() {
array('dk', $random, $random),
array('kg', $three_byte, 'ts'),
// Test the language override hook in the test module, which changes
// the transliteration of Ä to Z.
// the transliteration of Ä to Z and provides for the 5-byte characters.
array('zz', $two_byte, 'Z O U A O aouaohello'),
array('zz', $random, $random),
array('zz', $five_byte, 'ATh', $five_byte_printable),
// Test strings in some other languages.
// Turkish, provided by drupal.org user Kartagis.
array('tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'),
......@@ -78,10 +88,11 @@ public function testPHPTransliteration() {
foreach($cases as $case) {
list($langcode, $original, $expected) = $case;
$printable = (isset($case[3])) ? $case[3] : $original;
$transliterator_class = new PHPTransliteration();
$actual = $transliterator_class->transliterate($original, $langcode);
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in new class instance.', array(
'@original' => $original,
'@original' => $printable,
'@langcode' => $langcode,
'@expected' => $expected,
'@actual' => $actual,
......@@ -89,7 +100,7 @@ public function testPHPTransliteration() {
$actual = $transliterator_service->transliterate($original, $langcode);
$this->assertIdentical($actual, $expected, format_string('@original transliteration to @actual is identical to @expected for language @langcode in service instance.', array(
'@original' => $original,
'@original' => $printable,
'@langcode' => $langcode,
'@expected' => $expected,
'@actual' => $actual,
......
......@@ -12,5 +12,9 @@ function transliterate_test_transliteration_overrides_alter(&$overrides, $langco
if ($langcode == 'zz') {
// The default transliteration of Ä is A, but change it to Z for testing.
$overrides[0xC4] = 'Z';
// Also provide transliterations of two 5-byte characters from
// http://en.wikipedia.org/wiki/Gothic_alphabet.
$overrides[0x10330] = 'A';
$overrides[0x10338] = 'Th';
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment