From 101532cfa629c422b0ce05255f87272fd49ab602 Mon Sep 17 00:00:00 2001 From: Amire80 Date: Tue, 24 Oct 2017 17:08:29 +0530 Subject: [PATCH] Add special language names to facilitate searching This adds several custom languages. The addition of Punjabi addresses Bug T178070. The addition of Chinese addresses Bug T73891. Georgian and Catalan (Valencian) variant spellings are added because these are the most frequent languages that are not found in the ULS search box. Bug: T73891 Bug: T178070 Change-Id: Ifbb08b560e454643d246379c19f725bde61917e9 --- data/LanguageNameIndexer.php | 24 ++++++++++++++++++++++++ data/LanguageNameSearchData.php | 11 +++++++---- tests/phpunit/LanguageSearchTest.php | 24 ++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/data/LanguageNameIndexer.php b/data/LanguageNameIndexer.php index a4b67ec7..37659509 100644 --- a/data/LanguageNameIndexer.php +++ b/data/LanguageNameIndexer.php @@ -36,6 +36,7 @@ class LanguageNameIndexer extends Maintenance { $buckets = []; foreach ( $languages as $sourceLanguage => $autonym ) { $translations = LanguageNames::getNames( $sourceLanguage, 0, 2 ); + foreach ( $translations as $targetLanguage => $translation ) { $translation = mb_strtolower( $translation ); // Remove directionality markers used in Names.php: users are not @@ -46,6 +47,29 @@ class LanguageNameIndexer extends Maintenance { } } + // Some languages don't have a conveniently searchable name in CLDR. + // For example, the name of Western Punjabi doesn't start with + // the string "punjabi" in any language, so it cannot be found + // by people who search in English. + // To resolve this, some languages are added here locally. + $specialLanguages = [ + // Catalan, sometimes searched as "Valencià" + 'ca' => 'valencia', + // Georgian, the transliteration of the autonym is often used for searching + 'ka' => 'kartuli', + // Western Punjabi, doesn't start with the word "Punjabi" in any language + 'pnb' => 'punjabi western', + // Simplified and Traditional Chinese, because zh-hans and zh-hant + // are not mapped to any English name + 'zh-hans' => 'chinese simplified', + 'zh-hant' => 'chinese traditional', + ]; + + foreach ( $specialLanguages as $targetLanguage => $translation ) { + $bucket = LanguageNameSearch::getIndex( $translation ); + $buckets[$bucket][$translation] = $targetLanguage; + } + $lengths = array_values( array_map( 'count', $buckets ) ); $count = count( $buckets ); $min = min( $lengths ); diff --git a/data/LanguageNameSearchData.php b/data/LanguageNameSearchData.php index 44b7b005..cf34b4eb 100644 --- a/data/LanguageNameSearchData.php +++ b/data/LanguageNameSearchData.php @@ -2923,6 +2923,8 @@ class LanguageNameSearchData { 'chex' => 'cs', 'cheva' => 'ny', 'chukot' => 'chk', + 'chinese simplified' => 'zh-hans', + 'chinese traditional' => 'zh-hant', ], 100 => [ 'dansk' => 'da', @@ -2933,6 +2935,7 @@ class LanguageNameSearchData { 'dorerin naoero' => 'na', 'diné bizaad' => 'nv', 'deitsch' => 'pdc', + 'davvisámegiella' => 'se', 'deens' => 'da', 'duits' => 'de', 'divehi' => 'dv', @@ -3172,7 +3175,6 @@ class LanguageNameSearchData { 'divehigiella' => 'dv', 'dzongkhagiella' => 'dz', 'dárogiella' => 'no', - 'davvisámegiella' => 'se', 'durkagiella' => 'tr', 'dovdameahttun giella' => 'und', 'divehijski' => 'dv', @@ -8464,6 +8466,7 @@ class LanguageNameSearchData { 'kreol (nigeriya)' => 'pcm', 'ký hiệu blissymbols' => 'zbl', 'không có nội dung ngôn ngữ' => 'zxx', + 'kartuli' => 'ka', ], 108 => [ 'la .lojban.' => 'jbo', @@ -12634,6 +12637,7 @@ class LanguageNameSearchData { 'portugal (braziliya)' => 'pt-br', 'portugal (yevropa)' => 'pt-pt', 'portugänapük' => 'pt', + 'punjabi western' => 'pnb', ], 113 => [ 'qafár af' => 'aa', @@ -13051,7 +13055,6 @@ class LanguageNameSearchData { 'sicilianu' => 'scn', 'scots' => 'sco', 'sassaresu' => 'sdc', - 'sámegiella' => 'se', 'sängö' => 'sg', 'srpskohrvatski / српскохрватски' => 'sh', 'simple english' => 'simple', @@ -14838,7 +14841,6 @@ class LanguageNameSearchData { 'tojikī' => 'tg-latn', 'türkmençe' => 'tk', 'tagalog' => 'tl', - 'toki pona' => 'tokipona', 'tok pisin' => 'tpi', 'türkçe' => 'tr', 'tatarça' => 'tt-latn', @@ -17079,6 +17081,7 @@ class LanguageNameSearchData { 'valis' => 'wae', 'volamo' => 'wal', 'valbiri' => 'wbp', + 'valencia' => 'ca', ], 119 => [ 'west-vlams' => 'vls', @@ -36749,7 +36752,6 @@ class LanguageNameSearchData { 'タリシュ語' => 'tly', 'ツワナ語' => 'tn', 'トンガ語' => 'to', - 'トキポナ' => 'tokipona', 'トク・ピシン語' => 'tpi', 'トルコ語' => 'tr', 'トゥロヨ語' => 'tru', @@ -36788,6 +36790,7 @@ class LanguageNameSearchData { 'シャウィーア語(アラビア文字)' => 'shy-arab', 'シャウィーア語(ラテン文字)' => 'shy-latn', 'シャウィーア語(ティフナグ文字)' => 'shy-tfng', + 'トキポナ' => 'tokipona', 'アチョリ語' => 'ach', 'アダングメ語' => 'ada', 'アヴェスタ語' => 'ae', diff --git a/tests/phpunit/LanguageSearchTest.php b/tests/phpunit/LanguageSearchTest.php index 13e07834..bd993f4d 100644 --- a/tests/phpunit/LanguageSearchTest.php +++ b/tests/phpunit/LanguageSearchTest.php @@ -58,6 +58,30 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase { 'ml' => 'മലയാളം', ] ], + [ 'punja', [ + 'pa' => 'punjaabi sennii', + 'pnb' => 'punjabi western', + ] + ], + [ 'kartuli', [ + 'ka' => 'kartuli', + ] + ], + [ 'valencia', [ + 'ca' => 'valencia', + ] + ], + [ 'chinese', [ + 'zh-hans' => 'chinese simplified', + 'zh-hant' => 'chinese traditional', + 'zh' => 'chinesesch', + 'zh-cn' => 'chinese (china)', + 'zh-hk' => 'chinese (hong kong)', + 'zh-min-nan' => 'chinese (min nan)', + 'zh-sg' => 'chinese (singapore)', + 'zh-tw' => 'chinese (taiwan)' + ] + ], [ 'finish', [ 'fi' => 'finnish' ]