Commit 60e3f5c1 authored by catch

Issue #567832 by jhodgdon, Damien Tournoud, amateescu: Transliteration in core.

parent a0b6d650
<?php
/**
* @file
* Definition of \Drupal\Component\Transliteration\PhpTransliteration.
*
* Some parts of this code were derived from the MediaWiki project's UtfNormal
* class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
* http://www.mediawiki.org/
*/
namespace Drupal\Component\Transliteration;
/**
* Implements transliteration without using the PECL extensions.
*
* Transliterations are done character-by-character, by looking up non-US-ASCII
* characters in a transliteration database. The database comes from two types
* of files, both of which are searched for in the
* PHPTransliteration::$dataDirectory directory. First, language-specific
* overrides are searched (see PHPTransliteration::readLanguageOverrides() for
* details of these files). If there is no language-specific override for a
* character, the generic transliteration character tables are searched (see
* PHPTransliteration::readGenericData() for details of these files). If
* looking up the character in the generic table results in a NULL value, or an
* illegal character is encountered, then a substitute character is returned.
*
* This class is the registered transliteration class returned from
* drupal_container()->get('transliteration') by default.
*
* @ingroup transliteration
*/
class PHPTransliteration implements TransliterationInterface {

  /**
   * Directory where data for transliteration resides.
   *
   * The constructor sets this (by default) to subdirectory 'data' underneath
   * the directory where the class's PHP file resides.
   *
   * @var string
   */
  protected $dataDirectory;

  /**
   * Associative array of language-specific character transliteration tables.
   *
   * The outermost array keys are language codes. For each language code key,
   * the value is an array whose keys are Unicode character codes, and whose
   * values are the transliterations of those characters to US-ASCII. This is
   * set up as needed in PHPTransliteration::replace() by calling
   * PHPTransliteration::readLanguageOverrides().
   *
   * @var array
   */
  protected $languageOverrides = array();

  /**
   * Non-language-specific transliteration tables.
   *
   * Array whose keys are "bank" numbers (the character code shifted right by
   * eight bits), and whose values are arrays mapping the low eight bits of
   * the character code to a transliteration. This is set up as needed in
   * PHPTransliteration::replace() by calling
   * PHPTransliteration::readGenericData().
   *
   * @var array
   */
  protected $genericMap = array();

  /**
   * Returns this PHPTransliteration object (for the Drupal Container).
   *
   * @return \Drupal\Component\Transliteration\PHPTransliteration
   *   This object.
   */
  public function get() {
    return $this;
  }

  /**
   * Constructs a transliteration object.
   *
   * @param string $data_directory
   *   (optional) The directory where data files reside. If omitted, defaults
   *   to subdirectory 'data' underneath the directory where the class's PHP
   *   file resides.
   */
  public function __construct($data_directory = NULL) {
    // Fall back to the 'data' directory that sits next to this file.
    $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  }

  /**
   * Implements TransliterationInterface::transliterate().
   *
   * If $string is not valid UTF-8 it cannot be split into characters, and an
   * empty string is returned.
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?') {
    // Split into Unicode characters. preg_split() returns FALSE when $string
    // is not valid UTF-8; there is nothing to iterate over in that case, so
    // return the empty result explicitly instead of passing FALSE to foreach.
    $characters = preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY);
    if ($characters === FALSE) {
      return '';
    }

    // Transliterate each character in turn.
    $result = '';
    foreach ($characters as $character) {
      $code = self::ordUTF8($character);
      if ($code === -1) {
        $result .= $unknown_character;
      }
      else {
        $result .= $this->replace($code, $langcode, $unknown_character);
      }
    }
    return $result;
  }

  /**
   * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
   *
   * @param string $character
   *   A single UTF-8 character.
   *
   * @return int
   *   The character code, or -1 if an illegal character is found. An empty
   *   string, an unrecognized lead byte, or a sequence that is shorter than
   *   its lead byte announces all count as illegal.
   */
  protected static function ordUTF8($character) {
    $length = strlen($character);
    if ($length === 0) {
      return -1;
    }

    $first_byte = ord($character[0]);
    if (($first_byte & 0x80) == 0) {
      // Single-byte form: 0xxxxxxx.
      return $first_byte;
    }
    if (($first_byte & 0xe0) == 0xc0 && $length >= 2) {
      // Two-byte form: 110xxxxx 10xxxxxx.
      return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
    }
    if (($first_byte & 0xf0) == 0xe0 && $length >= 3) {
      // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
    }
    if (($first_byte & 0xf8) == 0xf0 && $length >= 4) {
      // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
    }

    // Other forms, and truncated sequences, are not legal.
    return -1;
  }

  /**
   * Replaces a single Unicode character using the transliteration database.
   *
   * @param int $code
   *   The character code of a Unicode character.
   * @param string $langcode
   *   The language code of the language the character is in.
   * @param string $unknown_character
   *   The character to substitute for characters without transliterated
   *   equivalents.
   *
   * @return string
   *   US-ASCII replacement character. If it has a mapping, it is returned;
   *   otherwise, $unknown_character is returned.
   */
  protected function replace($code, $langcode, $unknown_character) {
    if ($code < 0x80) {
      // Already lower ASCII. Note that this happens before any table lookup,
      // so table entries with keys below 0x80 are never consulted.
      return chr($code);
    }

    // See if there is a language-specific override for this character.
    if (!isset($this->languageOverrides[$langcode])) {
      $this->readLanguageOverrides($langcode);
    }
    if (isset($this->languageOverrides[$langcode][$code])) {
      return $this->languageOverrides[$langcode][$code];
    }

    // See if there is a generic mapping for this character. The bank selects
    // the data file; the low eight bits select the entry within it.
    $bank = $code >> 8;
    if (!isset($this->genericMap[$bank])) {
      $this->readGenericData($bank);
    }
    $code = $code & 0xff;

    // A NULL (or missing) table entry fails isset() and therefore yields the
    // substitute character; an empty-string entry is returned as-is.
    return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  }

  /**
   * Reads in language overrides for a language code.
   *
   * The data is read from files named "$langcode.php" in
   * PHPTransliteration::$dataDirectory. These files should set up an array
   * variable $overrides with an element whose key is $langcode and whose value
   * is an array whose keys are character codes, and whose values are their
   * transliterations in this language. The resulting $overrides array is
   * altered by invoking hook_transliteration_overrides_alter() to let modules
   * add additional overrides.
   *
   * If no overrides end up being defined for $langcode, an empty table is
   * stored so that the lookup is not repeated.
   *
   * @param string $langcode
   *   Code for the language to read.
   */
  protected function readLanguageOverrides($langcode) {
    // Sanitize the language code before using it in a file name, so that it
    // cannot point outside the data directory. The pattern needs explicit
    // delimiters: a bare '[...]' is itself taken by PCRE as the delimiter
    // pair, which turns the intended character class into a literal pattern
    // and leaves the language code unsanitized.
    $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';

    // Read in this file, which should set up a variable called $overrides,
    // which will be local to this function.
    if (is_file($file)) {
      include($file);
    }
    if (!isset($overrides) || !is_array($overrides)) {
      $overrides = array($langcode => array());
    }

    // Let modules alter the list.
    drupal_alter('transliteration_overrides', $overrides, $langcode);

    // Save the result. Always store an array, even when neither the data file
    // nor the alter hooks defined an entry for $langcode: storing NULL would
    // fail the isset() check in replace() and trigger a re-read for every
    // single character.
    $this->languageOverrides[$langcode] = (isset($overrides[$langcode]) && is_array($overrides[$langcode])) ? $overrides[$langcode] : array();
  }

  /**
   * Reads in generic transliteration data for a bank of characters.
   *
   * The data is read in from a file named "x$bank.php" (with $bank in
   * hexadecimal notation, zero-padded to at least two digits) in
   * PHPTransliteration::$dataDirectory. These files should set up a variable
   * $base containing an array whose numerical indices are the low eight bits
   * of the character code, and whose values are the transliterations of these
   * characters into US-ASCII.
   *
   * @param int $bank
   *   The character code shifted right by eight bits, or 0 for the range
   *   that contains ASCII.
   */
  protected function readGenericData($bank) {
    // Figure out the file name.
    $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';

    // Read in this file, which should set up a variable called $base, which
    // will be local to this function.
    if (is_file($file)) {
      include($file);
    }
    if (!isset($base) || !is_array($base)) {
      $base = array();
    }

    // Save this data.
    $this->genericMap[$bank] = $base;
  }

}
<?php
/**
* @file
* Definition of \Drupal\Component\Transliteration\TransliterationInterface.
*/
namespace Drupal\Component\Transliteration;
/**
* Defines an interface for classes providing transliteration.
*
* @ingroup transliteration
*/
interface TransliterationInterface {

  /**
   * Converts a Unicode string into a US-ASCII approximation.
   *
   * @param string $string
   *   The text to convert.
   * @param string $langcode
   *   (optional) Language code of the language $string is written in. When
   *   not given, 'en' is used.
   * @param string $unknown_character
   *   (optional) Replacement for any character of $string that has no
   *   transliterated equivalent. When not given, '?' is used.
   *
   * @return string
   *   The converted text: non-US-ASCII characters are replaced by their
   *   US-ASCII transliterations, and characters without one are replaced by
   *   $unknown_character.
   */
  public function transliterate($string, $langcode = 'en', $unknown_character = '?');

}
<?php
/**
* @file
* German transliteration data for the PHPTransliteration class.
*/
// German renders umlauted vowels as the base vowel followed by 'e'. These
// entries take precedence over the generic tables for language code 'de'.
$overrides['de'] = array(
  // Upper case: U+00C4, U+00D6, U+00DC (A, O, U with diaeresis).
  0xC4 => 'Ae',
  0xD6 => 'Oe',
  0xDC => 'Ue',
  // Lower case: U+00E4, U+00F6, U+00FC (a, o, u with diaeresis).
  0xE4 => 'ae',
  0xF6 => 'oe',
  0xFC => 'ue',
);
<?php
/**
* @file
* Danish transliteration data for the PHPTransliteration class.
*/
// Danish-specific replacements for A with ring and O with stroke.
//
// NOTE(review): 'dk' is the ISO 3166 country code for Denmark; the ISO 639-1
// language code for Danish is 'da'. The key is left as 'dk' here because the
// loader reads $overrides[$langcode] from a file named after the same code,
// and that file name is not visible from this data - confirm before renaming.
$overrides['dk'] = array(
  // Upper case: U+00C5 (A with ring above), U+00D8 (O with stroke).
  0xC5 => 'Aa',
  0xD8 => 'Oe',
  // Lower case: U+00E5 (a with ring above), U+00F8 (o with stroke).
  0xE5 => 'aa',
  0xF8 => 'oe',
);
<?php
/**
* @file
* Esperanto transliteration data for the PHPTransliteration class.
*/
// Esperanto "x-system": each accented letter becomes its base letter followed
// by 'x'. All six letter pairs live in Latin Extended-A (U+0100 - U+017F).
$overrides['eo'] = array(
  // C with circumflex is U+0108 / U+0109. The keys used to be 0x18 / 0x19,
  // which are ASCII control characters; PHPTransliteration::replace() returns
  // every code below 0x80 unchanged before consulting the overrides, so those
  // entries could never apply.
  0x108 => 'Cx',
  0x109 => 'cx',
  // G with circumflex.
  0x11C => 'Gx',
  0x11D => 'gx',
  // H with circumflex.
  0x124 => 'Hx',
  0x125 => 'hx',
  // J with circumflex.
  0x134 => 'Jx',
  0x135 => 'jx',
  // S with circumflex.
  0x15C => 'Sx',
  0x15D => 'sx',
  // U with breve.
  0x16C => 'Ux',
  0x16D => 'ux',
);
<?php
/**
* @file
* Kyrgyz transliteration data for the PHPTransliteration class.
*/
// Kyrgyz-specific replacements for Cyrillic letters (U+0400 - U+04FF).
//
// NOTE(review): 'kg' is the ISO 3166 country code for Kyrgyzstan; the ISO
// 639-1 language code for Kyrgyz is 'ky'. The key is left as 'kg' because the
// loader reads $overrides[$langcode] from a file named after the same code,
// and that file name is not visible from this data - confirm before renaming.
$overrides['kg'] = array(
  // U+0401 CYRILLIC CAPITAL LETTER IO, the upper-case partner of the
  // 0x451 => 'e' entry below. The key used to be 0x41 (ASCII 'A'), which
  // PHPTransliteration::replace() returns unchanged before consulting the
  // overrides, so that entry could never apply.
  0x401 => 'E',
  0x416 => 'C',
  0x419 => 'J',
  0x425 => 'X',
  0x426 => 'TS',
  0x429 => 'SCH',
  0x42E => 'JU',
  0x42F => 'JA',
  0x436 => 'c',
  0x439 => 'j',
  0x445 => 'x',
  0x446 => 'ts',
  0x449 => 'sch',
  0x44E => 'ju',
  0x44F => 'ja',
  0x451 => 'e',
  0x4A2 => 'H',
  0x4A3 => 'h',
  0x4AE => 'W',
  0x4AF => 'w',
  0x4E8 => 'Q',
  0x4E9 => 'q',
);
<?php
/**
* @file
* Generic transliteration data for the PHPTransliteration class.
*/
// Generic table for one bank of 256 character codes. PHPTransliteration
// includes this file and indexes $base with the low eight bits of the
// character code. Presumably this is bank 0x00 (U+0080 - U+00FF), since the
// ASCII half is omitted - the file name is not visible here, confirm.
//
// Layout: each row starts with an explicit key and is followed by 15 values
// that PHP assigns to the next consecutive integer keys, so the position of
// every value within its row is significant.
//
// An empty string removes the character from the output.
$base = array(
// Note: to save memory plain ASCII mappings have been left out. Codes below
// 0x80 are returned unchanged by the class before any table is consulted.
0x80 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0x90 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0xA0 => ' ', '!', 'C/', 'PS', '$?', 'Y=', '|', 'SS', '"', '(c)', 'a', '<<', '!', '', '(r)', '-',
0xB0 => 'deg', '+-', '2', '3', '\'', 'u', 'P', '*', ',', '1', 'o', '>>', '1/4', '1/2', '3/4', '?',
0xC0 => 'A', 'A', 'A', 'A', 'A', 'A', 'Ae', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
0xD0 => 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'Y', 'Th', 'ss',
0xE0 => 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
0xF0 => 'd', 'n', 'o', 'o', 'o', 'o', 'o', '/', 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
);
<?php
/**
* @file
* Generic transliteration data for the PHPTransliteration class.
*/
// Generic table for one bank of 256 character codes. PHPTransliteration
// includes this file and indexes $base with the low eight bits of the
// character code. Presumably this is bank 0x01 (U+0100 - U+01FF, Latin
// Extended-A/B) - the file name is not visible here, confirm.
//
// Layout: each row starts with an explicit key and is followed by 15 values
// that PHP assigns to the next consecutive integer keys, so the position of
// every value within its row is significant.
$base = array(
0x00 => 'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd',
0x10 => 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g',
0x20 => 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
0x30 => 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L',
0x40 => 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', '\'n', 'NG', 'ng', 'O', 'o', 'O', 'o',
0x50 => 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's',
0x60 => 'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
0x70 => 'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's',
0x80 => 'b', 'B', 'B', 'b', '6', '6', 'O', 'C', 'c', 'D', 'D', 'D', 'd', 'd', '3', '@',
0x90 => 'E', 'F', 'f', 'G', 'G', 'hv', 'I', 'I', 'K', 'k', 'l', 'l', 'W', 'N', 'n', 'O',
0xA0 => 'O', 'o', 'OI', 'oi', 'P', 'p', 'YR', '2', '2', 'SH', 'sh', 't', 'T', 't', 'T', 'U',
0xB0 => 'u', 'Y', 'V', 'Y', 'y', 'Z', 'z', 'ZH', 'ZH', 'zh', 'zh', '2', '5', '5', 'ts', 'w',
0xC0 => '|', '||', '|=', '!', 'DZ', 'Dz', 'dz', 'LJ', 'Lj', 'lj', 'NJ', 'Nj', 'nj', 'A', 'a', 'I',
0xD0 => 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', '@', 'A', 'a',
0xE0 => 'A', 'a', 'AE', 'ae', 'G', 'g', 'G', 'g', 'K', 'k', 'O', 'o', 'O', 'o', 'ZH', 'zh',
0xF0 => 'j', 'DZ', 'D', 'dz', 'G', 'g', 'HV', 'W', 'N', 'n', 'A', 'a', 'AE', 'ae', 'O', 'o',
);
<?php
/**
* @file
* Generic transliteration data for the PHPTransliteration class.
*/
// Generic table for one bank of 256 character codes. PHPTransliteration
// includes this file and indexes $base with the low eight bits of the
// character code. Presumably this is bank 0x02 (U+0200 - U+02FF) - the file
// name is not visible here, confirm.
//
// Layout: each row starts with an explicit key and is followed by 15 values
// that PHP assigns to the next consecutive integer keys, so the position of
// every value within its row is significant.
//
// Value conventions, as consumed by PHPTransliteration::replace():
// - NULL fails the isset() check there, so the caller's substitute character
//   is returned.
// - An empty string removes the character from the output.
// - A literal '?' is returned as-is, whatever substitute the caller asked for.
$base = array(
0x00 => 'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o',
0x10 => 'R', 'r', 'R', 'r', 'U', 'u', 'U', 'u', 'S', 's', 'T', 't', 'Y', 'y', 'H', 'h',
0x20 => 'N', 'd', 'OU', 'ou', 'Z', 'z', 'A', 'a', 'E', 'e', 'O', 'o', 'O', 'o', 'O', 'o',
0x30 => 'O', 'o', 'Y', 'y', 'l', 'n', 't', 'j', 'db', 'qp', 'A', 'C', 'c', 'L', 'T', 's',
0x40 => 'z', '?', '?', 'B', 'U', 'V', 'E', 'e', 'J', 'j', 'Q', 'q', 'R', 'r', 'Y', 'y',
0x50 => 'a', 'a', 'a', 'b', 'o', 'c', 'd', 'd', 'e', '@', '@', 'e', 'e', 'e', 'e', 'j',
0x60 => 'g', 'g', 'g', 'g', 'u', 'Y', 'h', 'h', 'i', 'i', 'I', 'l', 'l', 'l', 'lZ', 'W',
0x70 => 'W', 'm', 'n', 'n', 'n', 'o', 'OE', 'O', 'F', 'R', 'R', 'R', 'R', 'r', 'r', 'R',
0x80 => 'R', 'R', 's', 'S', 'j', 'S', 'S', 't', 't', 'U', 'U', 'v', '^', 'W', 'Y', 'Y',
0x90 => 'z', 'z', 'Z', 'Z', '?', '?', '?', 'C', '@', 'B', 'E', 'G', 'H', 'j', 'k', 'L',
0xA0 => 'q', '?', '?', 'dz', 'dZ', 'dz', 'ts', 'tS', 'tC', 'fN', 'ls', 'lz', 'WW', ']]', 'h', 'h',
0xB0 => 'k', 'h', 'j', 'r', 'r', 'r', 'r', 'w', 'y', '\'', '"', '`', '\'', '`', '`', '\'',
0xC0 => '?', '?', '<', '>', '^', 'V', '^', 'V', '\'', '-', '/', '\\', ',', '_', '\\', '/',
0xD0 => ':', '.', '`', '\'', '^', 'V', '+', '-', 'V', '.', '@', ',', '~', '"', 'R', 'X',
0xE0 => 'G', 'l', 's', 'x', '?', '', '', '', '', '', '', '', 'V', '=', '"', NULL,
0xF0 => NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
);
<?php
/**
* @file
* Generic transliteration data for the PHPTransliteration class.
*/
// Generic table for one bank of 256 character codes. PHPTransliteration
// includes this file and indexes $base with the low eight bits of the
// character code. Presumably this is bank 0x03 (U+0300 - U+03FF) - the file
// name is not visible here, confirm.
//
// Layout: each row starts with an explicit key and is followed by 15 values
// that PHP assigns to the next consecutive integer keys, so the position of
// every value within its row is significant.
//
// Value conventions, as consumed by PHPTransliteration::replace():
// - NULL fails the isset() check there, so the caller's substitute character
//   is returned.
// - An empty string removes the character from the output.
// - A literal '?' is returned as-is, whatever substitute the caller asked for.
$base = array(
0x00 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0x10 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0x20 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0x30 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
0x40 => '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', NULL,
0x50 => NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
0x60 => '', '', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
0x70 => NULL, NULL, NULL, NULL, '\'', ',', NULL, NULL, NULL, NULL, '', NULL, NULL, NULL, '?', NULL,
0x80 => NULL, NULL, NULL, NULL, '', '', 'A', ';', 'E', 'I', 'I', NULL, 'O', NULL, 'U', 'O',
0x90 => 'I', 'A', 'V', 'G', 'D', 'E', 'Z', 'I', 'Th', 'I', 'K', 'L', 'M', 'N', 'X', 'O',
0xA0 => 'P', 'R', NULL, 'S', 'T', 'Y', 'F', 'H', 'Ps', 'O', 'I', 'Y', 'a', 'e', 'i', 'i',
0xB0 => 'y', 'a', 'v', 'g', 'd', 'e', 'z', 'i', 'th', 'i', 'k', 'l', 'm', 'n', 'x', 'o',
0xC0 => 'p', 'r', 's', 's', 't', 'y', 'f', 'h', 'ps', 'o', 'i', 'y', 'o', 'y', 'o', NULL,
0xD0 => 'b', 'th', 'U', 'U', 'U', 'ph', 'p', '&', NULL, NULL, 'St', 'st', 'W', 'w', 'Q', 'q',
0xE0 => 'Sp', 'sp', 'Sh', 'sh', 'F', 'f', 'Kh', 'kh', 'H', 'h', 'G', 'g', 'CH', 'ch', 'Ti', 'ti',
0xF0 => 'k', 'r', 'c', 'j', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
);
<?php
/**
* @file
* Generic transliteration data for the PHPTransliteration class.
*/
$base = array(
0x00 => 'Jo', 'Yo', 'Dj', 'Gj', 'Ie', 'Dz', 'I', 'Yi', 'J', 'Lj', 'Nj', 'Tsh', 'Kj', 'I', 'U', 'Dzh',
0x10 => 'A', 'B', 'V', 'G', 'D', 'E', 'Zh', 'Z', 'I', 'Y', 'K', 'L', 'M', 'N', 'O', 'P',
0x20 => 'R', 'S', 'T', 'U', 'F', 'H', 'C', 'Ch', 'Sh', 'Shch', '', 'Y', '', 'E', 'Yu', 'Ya',
0x30 => 'a', 'b', 'v', 'g', 'd', 'e', 'zh', 'z', 'i', 'y', 'k', 'l', 'm', 'n', 'o', 'p',
0x40 => 'r', 's', 't', 'u', 'f', 'h', 'c', 'ch', 'sh', 'shch', '', 'y', '', 'e', 'yu', 'ya',
0x50 => 'je', 'yo', 'dj', 'gj', 'ie', 'dz', 'i', 'yi', 'j', 'lj', 'nj', 'tsh', 'kj', 'i', 'u', 'dzh',
0x60 => 'O', 'o', 'E', 'e', 'Ie', 'ie', 'E', 'e', 'Ie', 'ie', 'O', 'o', 'Io', 'io', 'Ks', 'ks',
0x70 => 'Ps', 'ps', 'F', 'f', 'Y', 'y', 'Y', 'y', 'u', 'u', 'O', 'o', 'O', 'o', 'Ot', 'ot',
0x80 => 'Q', 'q', '*1000*', '', '', '', '', NULL, '*100.000*', '*1.000.000*', NULL, NULL, '"', '"', 'R\'', 'r\'',
0x90 => 'G\'', 'g\'', 'G\'', 'g\'', 'G\'', 'g\'', 'Zh\'', 'zh\'', 'Z\'', 'z\'', 'K\'', 'k\'', 'K\'', 'k\'', 'K\'', 'k\'',
0xA0 => 'K\'', 'k\'', 'N\'', 'n\'', 'Ng', 'ng', 'P\'', 'p\'', 'Kh', 'kh', 'S\'', 's\'', 'T\'', 't\'', 'U', 'u',
0xB0 => 'U\'', 'u\'', 'Kh\'', 'kh\'', 'Tts', 'tts', 'Ch\'', 'ch\'', 'Ch\'', 'ch\'', 'H', 'h', 'Ch', 'ch', 'Ch\'', 'ch\'',
0xC0 => '`', 'Zh', 'zh', 'K\'', 'k\'', NULL, NULL, 'N\'', 'n\'', NULL, NULL, 'Ch', 'ch', NULL, NULL, NULL,
0xD0 => 'a', 'a', 'A', 'a', 'Ae', 'ae', 'Ie', 'ie', '@', '@', '@', '@', 'Zh', 'zh', 'Z', 'z',
0xE0 => 'Dz', 'dz', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o', 'O', <