summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Nathaniel Catchpole, 2018-11-20 11:58:15 (GMT)
committer: Nathaniel Catchpole, 2018-11-20 11:58:25 (GMT)
commit: 35c3d18ae0fce93eea588ae4356a94d562ea0fed (patch)
tree: 28fad1ef43eaa5956f6f7d04c1fc5ff83d9ebdbd
parent: 2d9d4473860abf556091bdcc2ab8fdc779ff924a (diff)
Issue #3000630 by scott_euser, Krzysztof Domański, APolitsin, vijaycs85, longwave: Transliteration causes 2 capital letters at the beginning of a word
(cherry picked from commit bb7fb6a3dd840f95ed76c17f94dcd2dc049e0470)
-rw-r--r-- core/lib/Drupal/Component/Transliteration/PhpTransliteration.php | 60
-rw-r--r-- core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php | 13
2 files changed, 55 insertions, 18 deletions
diff --git a/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
index 3cd8d68..5eee57a 100644
--- a/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
+++ b/core/lib/Drupal/Component/Transliteration/PhpTransliteration.php
@@ -105,31 +105,55 @@ class PhpTransliteration implements TransliterationInterface {
* {@inheritdoc}
*/
public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
- $result = '';
+ $results = [];
$length = 0;
- // Split into Unicode characters and transliterate each one.
- foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
- $code = self::ordUTF8($character);
- if ($code == -1) {
- $to_add = $unknown_character;
- }
- else {
- $to_add = $this->replace($code, $langcode, $unknown_character);
- }
- // Check if this exceeds the maximum allowed length.
- if (isset($max_length)) {
- $length += strlen($to_add);
- if ($length > $max_length) {
- // There is no more space.
- return $result;
+ // Split on words to handle mixed case per word.
+ $words = explode(' ', $string);
+ foreach ($words as $key => $word) {
+ $results[$key] = '';
+
+ // String is mixed case if it consists of both uppercase and lowercase
+ // letters. To accurately check this, remove any numbers and check that
+ // remaining characters are not all uppercase and not all lowercase.
+ $alpha_string = preg_replace('/\\d/', '', $word);
+ $mixed_case = (strlen($alpha_string) > 1 && mb_strtolower($alpha_string) !== $alpha_string && mb_strtoupper($alpha_string) !== $alpha_string);
+
+ // Split into Unicode characters and transliterate each one.
+ foreach (preg_split('//u', $word, 0, PREG_SPLIT_NO_EMPTY) as $character) {
+ $code = self::ordUTF8($character);
+ if ($code == -1) {
+ $to_add = $unknown_character;
}
+ else {
+ $to_add = $this->replace($code, $langcode, $unknown_character);
+ }
+
+ // Check if this exceeds the maximum allowed length.
+ if (isset($max_length)) {
+ $length += strlen($to_add);
+ if ($length > $max_length) {
+ // There is no more space.
+ $results = array_filter($results);
+ return implode(' ', $results);
+ }
+ }
+
+ // If this is a capitalised letter of a mixed case word, only capitalise
+ // the first letter and lowercase any subsequent letters.
+ if ($mixed_case && strlen($to_add) > 1 && mb_strtoupper($to_add) === $to_add) {
+ $to_add = ucfirst(strtolower($to_add));
+ }
+
+ $results[$key] .= $to_add;
}
- $result .= $to_add;
+ // Add space to count for max length.
+ $length++;
}
- return $result;
+ $results = array_filter($results);
+ return implode(' ', $results);
}
/**
diff --git a/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php b/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
index 924f3e9..eb468c8 100644
--- a/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
+++ b/core/tests/Drupal/Tests/Component/Transliteration/PhpTransliterationTest.php
@@ -146,6 +146,19 @@ class PhpTransliterationTest extends TestCase {
['en', chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), '?'],
// Max length.
['de', $two_byte, 'Ae Oe', '?', 5],
+ // Test strings with mixed case words where a single capital character
+ // results in multiple characters. The first character should remain
+ // capitalised but subsequent resulting characters should be lowercase.
+ // For example a result of the transliteration should be 'Shtrikhkod'
+ // not 'SHtrikhkod'. Numbers should not be used in determining whether a
+ // string is mixed case.
+ ['ru', 'Штрихкод', 'Shtrikhkod'],
+ ['bg', 'Щастие', 'Schastie'],
+ ['bg', 'Щ1', 'SCH1'],
+ ['bg', 'Щ1Щ', 'SCH1SCH'],
+ ['bg', 'Щ1щ', 'Sch1sch'],
+ ['bg', 'Щастие ЩЩЩ', 'Schastie SCHSCHSCH'],
+ ['bg', 'Щастие ЩЩЩ. Щастие! Щастие', 'Schastie SCHSCHSCH. Schastie! Schastie'],
];
}