summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Nathaniel Catchpole 2019-01-02 10:28:53 (GMT)
committer Nathaniel Catchpole 2019-01-02 10:29:03 (GMT)
commit 040e6275a0a1ff741465152bcb5f185665b0089c (patch)
tree 439fc7495ba10b1ad6d95d3fd3ab76fd4f250e2d
parent 597d991559cccc7b2d04c23845e4b6ac2af46164 (diff)
Issue #3001997 by Krzysztof Domański, scott_euser, alexpott: Transliteration of a string containing an unknown character (e.g. 0x80) is not valid
(cherry picked from commit f9e7921bc8a352fbcc5fab33e517dd1935e77b48)
-rw-r--r-- core/lib/Drupal/Component/Transliteration/PhpTransliteration.php 23
-rw-r--r-- core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php 56
2 files changed, 77 insertions, 2 deletions
diff --git a/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
index 3cd8d68..2f63974 100644
--- a/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
+++ b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
@@ -107,6 +107,29 @@ class PhpTransliteration implements TransliterationInterface {
public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
$result = '';
$length = 0;
+ $hash = FALSE;
+
+ // Replace question marks with a unique hash if necessary. This is needed
+ // because mb_convert_encoding() replaces all invalid characters with a
+ // question mark.
+ if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
+ $hash = hash('sha256', $string);
+ $string = str_replace('?', $hash, $string);
+ }
+
+ // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
+ // be replaced by a question mark.
+ $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
+
+ // Use the provided unknown character instead of a question mark.
+ if ($unknown_character != '?') {
+ $string = str_replace('?', $unknown_character, $string);
+ // Restore original question marks if necessary.
+ if ($hash !== FALSE) {
+ $string = str_replace($hash, '?', $string);
+ }
+ }
+
// Split into Unicode characters and transliterate each one.
foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
$code = self::ordUTF8($character);
diff --git a/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php b/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
index 924f3e9..b6a79b3 100644
--- a/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
+++ b/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
@@ -142,8 +142,6 @@ class PhpTransliterationTest extends TestCase {
// Test strings in some other languages.
// Turkish, provided by drupal.org user Kartagis.
['tr', 'Abayı serdiler bize. Söyleyeceğim yüzlerine. Sanırım hepimiz aynı şeyi düşünüyoruz.', 'Abayi serdiler bize. Soyleyecegim yuzlerine. Sanirim hepimiz ayni seyi dusunuyoruz.'],
- // Illegal/unknown unicode.
- ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?'],
// Max length.
['de', $two_byte, 'Ae Oe', '?', 5],
];
@@ -165,6 +163,60 @@ class PhpTransliterationTest extends TestCase {
}
/**
+ * Tests the unknown character replacement.
+ *
+ * @param string $langcode
+ * The language code to test.
+ * @param string $original
+ * The original string.
+ * @param string $expected
+ * The expected return from PhpTransliteration::transliterate().
+ * @param string $unknown_character
+ * The character to substitute for characters in $original without
+ * transliterated equivalents.
+ * @param int|null $max_length
+ * The maximum length of the transliterated string that is returned.
+ *
+ * @dataProvider providerTestTransliterationUnknownCharacter
+ */
+ public function testTransliterationUnknownCharacter($langcode, $original, $expected, $unknown_character = '?', $max_length = NULL) {
+ $transliteration = new PhpTransliteration();
+ $actual = $transliteration->transliterate($original, $langcode, $unknown_character, $max_length);
+ $this->assertSame($expected, $actual);
+ }
+
+ /**
+ * Provides data for self::testTransliterationUnknownCharacter().
+ *
+ * @return array
+ * An array of arrays, each containing the parameters for
+ * self::testTransliterationUnknownCharacter().
+ */
+ public function providerTestTransliterationUnknownCharacter() {
+ return [
+ // Each test case is (language code, input, expected output, unknown
+ // character, max length); the last two are optional.
+
+ // Illegal/unknown Unicode.
+ ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?????'],
+ ['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '-----', '-'],
+ ['en', 'Hel' . chr(0x80) . 'o World', 'Hel?o World'],
+ ['en', 'Hell' . chr(0x80) . ' World', 'Hell? World'],
+ // Non-default replacement.
+ ['en', chr(0x80) . 'ello World', '_ello World', '_'],
+ // Keep the original question marks.
+ ['en', chr(0xF8) . '?' . chr(0x80), '???'],
+ ['en', chr(0x80) . 'ello ? World?', '_ello ? World?', '_'],
+ ['pl', 'aąeę' . chr(0x80) . 'oółżźz ?', 'aaee?oolzzz ?'],
+ // Non-US-ASCII replacement.
+ ['en', chr(0x80) . 'ello World?', 'Oello World?', 'Ö'],
+ ['pl', chr(0x80) . 'óóść', 'ooosc', 'ó'],
+ // Ensure original question marks are kept when a max length is used.
+ ['en', chr(0x80) . 'ello ? World?', '_ello ?', '_', 7],
+ ];
+ }
+
+ /**
* Tests inclusion is safe.
*
* @covers ::readLanguageOverrides