Newer
Older
<?php
namespace Drupal\Tests\Component\Transliteration;
use Drupal\Component\Transliteration\PhpTransliteration;
use Drupal\Component\Utility\Random;
use org\bovigo\vfs\vfsStream;
use PHPUnit\Framework\TestCase;
/**
catch
committed
* Tests Transliteration component functionality.
*
* @group Transliteration
*
* @coversDefaultClass \Drupal\Component\Transliteration\PhpTransliteration
*/
class PhpTransliterationTest extends TestCase {
/**
* Tests the PhpTransliteration::removeDiacritics() function.
*
* @param string $original
* The language code to test.
* @param string $expected
* The expected return from PhpTransliteration::removeDiacritics().
*
* @dataProvider providerTestPhpTransliterationRemoveDiacritics
*/
public function testRemoveDiacritics($original, $expected) {
$transliterator_class = new PhpTransliteration();
$result = $transliterator_class->removeDiacritics($original);
$this->assertEquals($expected, $result);
}
/**
* Provides data for self::testRemoveDiacritics().
*
* @return array
* An array of arrays, each containing the parameters for
* self::testRemoveDiacritics().
*/
public function providerTestPhpTransliterationRemoveDiacritics() {
return [
// Test all characters in the Unicode range 0x00bf to 0x017f.
['ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'AAAAAAÆCEEEEIIII'],
['ÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß', 'ÐNOOOOO×OUUUUYÞß'],
['àáâãäåæçèéêëìíîï', 'aaaaaaæceeeeiiii'],
['ðñòóôõö÷øùúûüýþÿ', 'ðnooooo÷ouuuuyþy'],
['ĀāĂ㥹ĆćĈĉĊċČčĎď', 'AaAaAaCcCcCcCcDd'],
['ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'DdEeEeEeEeEeGgGg'],
['ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'GgGgHhHhIiIiIiIi'],
['İıIJijĴĵĶķĸĹĺĻļĽľĿ', 'IiIJijJjKkĸLlLlLlL'],
['ŀŁłŃńŅņŇňʼnŊŋŌōŎŏ', 'lLlNnNnNnʼnŊŋOoOo'],
['ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'OoŒœRrRrRrSsSsSs'],
['ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'SsTtTtTtUuUuUuUu'],
['ŰűŲųŴŵŶŷŸŹźŻżŽž', 'UuUuWwYyYZzZzZz'],
// Test all characters in the Unicode range 0x01CD to 0x024F.
['ǍǎǏ', 'AaI'],
['ǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟ', 'iOoUuUuUuUuUuǝAa'],
['ǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯ', 'AaǢǣGgGgKkOoOoǮǯ'],
['ǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿ', 'jDZDzdzGgǶǷNnAaǼǽOo'],
['ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'AaAaEeEeIiIiOoOo'],
['ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'RrRrUuUuSsTtȜȝHh'],
['ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ȠȡȢȣZzAaEeOoOoOo'],
['ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'OoYylntjȸȹACcLTs'],
['ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'zɁɂBUɅEeJjQqRrYy'],
];
/**
* Tests the PhpTransliteration class.
*
* @param string $langcode
* The language code to test.
* @param string $original
* The original string.
* @param string $expected
* The expected return from PhpTransliteration::transliterate().
* @param string $unknown_character
* (optional) The character to substitute for characters in $string without
* transliterated equivalents. Defaults to '?'.
* @param int $max_length
* (optional) If provided, return at most this many characters, ensuring
* that the transliteration does not split in the middle of an input
* character's transliteration.
*
* @dataProvider providerTestPhpTransliteration
*/
public function testPhpTransliteration($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
$transliterator_class = new PhpTransliteration();
$actual = $transliterator_class->transliterate($original, $langcode, $unknown_character, $max_length);
$this->assertSame($expected, $actual);
}
/**
* Provides data for self::testPhpTransliteration().
*
* @return array
* An array of arrays, each containing the parameters for
* self::testPhpTransliteration().
*/
public function providerTestPhpTransliteration() {
$random_generator = new Random();
$random = $random_generator->string(10);
// Make some strings with two, three, and four-byte characters for testing.
// Note that the 3-byte character is overridden by the 'kg' language.
$two_byte = 'Ä Ö Ü Å Ø äöüåøhello';
Alex Pott
committed
// This is a Cyrillic character that looks something like a "u". See
// http://www.unicode.org/charts/PDF/U0400.pdf
$three_byte = html_entity_decode('ц', ENT_NOQUOTES, 'UTF-8');
// This is a Canadian Aboriginal character like a triangle. See
// http://www.unicode.org/charts/PDF/U1400.pdf
$four_byte = html_entity_decode('ᐑ', ENT_NOQUOTES, 'UTF-8');
catch
committed
// These are two Gothic alphabet letters. See
// http://wikipedia.org/wiki/Gothic_alphabet
catch
committed
// They are not in our tables, but should at least give us '?' (unknown).
$five_byte = html_entity_decode('𐌰𐌸', ENT_NOQUOTES, 'UTF-8');
return [
// Each test case is (language code, input, output).
// Test ASCII in English.
['en', $random, $random],
// Test ASCII in some other language with no overrides.
['fr', $random, $random],
// Test 3 and 4-byte characters in a language without overrides.
// Note: if the data tables change, these will need to change too! They
// are set up to test that data table loading works, so values come
// directly from the data files.
['fr', $three_byte, 'c'],
['fr', $four_byte, 'wii'],
catch
committed
// Test 5-byte characters.
['en', $five_byte, '??'],
// Test a language with no overrides.
['en', $two_byte, 'A O U A O aouaohello'],
// Test language overrides provided by core.
['de', $two_byte, 'Ae Oe Ue A O aeoeueaohello'],
['de', $random, $random],
['dk', $two_byte, 'A O U Aa Oe aouaaoehello'],
['dk', $random, $random],
['kg', $three_byte, 'ts'],
// Test strings in some other languages.
// Turkish, provided by drupal.org user Kartagis.
['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
// Max length.
['de', $two_byte, 'Ae Oe', '?', 5],
];
}
/**
* Tests the transliteration with max length.
*/
public function testTransliterationWithMaxLength() {
$transliteration = new PhpTransliteration();
// Test with max length, using German. It should never split up the
// transliteration of a single character.
$input = 'Ä Ö Ü Å Ø äöüåøhello';
$trunc_output = 'Ae Oe Ue A O aeoe';
$this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 17), 'Truncating to 17 characters works');
$this->assertSame($trunc_output, $transliteration->transliterate($input, 'de', '?', 18), 'Truncating to 18 characters works');
}
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/**
* Tests the unknown character replacement.
*
* @param string $langcode
* The language code to test.
* @param string $original
* The original string.
* @param string $expected
* The expected return from PhpTransliteration::transliterate().
* @param string $unknown_character
* The character to substitute for characters in $string without
* transliterated equivalents.
* @param int $max_length
* The maximum length of the string that returns the transliteration.
*
* @dataProvider providerTestTransliterationUnknownCharacter
*/
public function testTransliterationUnknownCharacter($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
$transliteration = new PhpTransliteration();
$actual = $transliteration->transliterate($original, $langcode, $unknown_character, $max_length);
$this->assertSame($expected, $actual);
}
/**
* Provides data for self::testTransliterationUnknownCharacter().
*
* @return array
* An array of arrays, each containing the parameters for
* self::testTransliterationUnknownCharacter().
*/
public function providerTestTransliterationUnknownCharacter() {
return [
// Each test case is (language code, input, output, unknown character, max
// length).
// Illegal/unknown unicode.
['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
// Non default replacement.
['en', chr(0x80) . 'ello World', '_ello World', '_'],
// Keep the original question marks.
['en', chr(0xF8) . '?' . chr(0x80), '???'],
['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
// Non-US-ASCII replacement.
['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
// Ensure question marks are replaced when max length used.
['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
];
}
/**
* Tests inclusion is safe.
*
* @covers ::readLanguageOverrides
*/
public function testSafeInclude() {
// The overrides in the transliteration data directory transliterates 0x82
// into "safe" but the overrides one directory higher transliterates the
// same character into "security hole". So by using "../index" as the
// language code we can test the ../ is stripped from the langcode.
vfsStream::setup('transliteration', NULL, [
'index.php' => '<?php $overrides = ["../index" => [0x82 => "security hole"]];',
'dir' => [
'index.php' => '<?php $overrides = ["../index" => [0x82 => "safe"]];',
],
]);
$transliteration = new PhpTransliteration(vfsStream::url('transliteration/dir'));
$transliterated = $transliteration->transliterate(chr(0xC2) . chr(0x82), '../index');
$this->assertSame('safe', $transliterated);
}