Perform search on every word of language name

See e.g. T132021. This favours coverage over quality.

Change-Id: I3fc8fb1702802bc002c3d7e2941563840914f325
This commit is contained in:
Niklas Laxström
2017-10-31 16:59:06 +01:00
committed by Amire80
parent a9dc4a0f1a
commit a353c5ab65
4 changed files with 8824 additions and 12 deletions

View File

@@ -43,12 +43,27 @@ class LanguageNameIndexer extends Maintenance {
$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 ); $translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );
foreach ( $translations as $targetLanguage => $translation ) { foreach ( $translations as $targetLanguage => $translation ) {
$translation = mb_strtolower( $translation );
// Remove directionality markers used in Names.php: users are not // Remove directionality markers used in Names.php: users are not
// going to type these. // going to type these.
$translation = str_replace( "\xE2\x80\x8E", '', $translation ); $translation = str_replace( "\xE2\x80\x8E", '', $translation );
$bucket = LanguageNameSearch::getIndex( $translation ); $translation = mb_strtolower( $translation );
$buckets[$bucket][$translation] = $targetLanguage; $translation = trim( $translation );
// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
// The original name is still shown, but this avoids creating entries such as
// "(dialekti" or "alpeve)".
$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY );
foreach ( $words as $index => $word ) {
$bucket = LanguageNameSearch::getIndex( $word );
$display = $translation;
if ( $index > 0 && count( $words ) > 1 ) {
$display = "$word <$translation>";
}
$buckets[$bucket][$display] = $targetLanguage;
}
} }
} }

View File

@@ -19,11 +19,7 @@
*/ */
class LanguageNameSearch { class LanguageNameSearch {
public static function search( $searchKey, $typos = 0 ) { public static function search( $searchKey, $typos = 0 ) {
// Use code's mb_strtolower compatibility code for MW < 1.27 $searchKey = mb_strtolower( $searchKey );
$language = Language::factory( 'en' );
// @todo: Shouldn't this be unicode aware?
$searchKey = $language->lc( $searchKey );
$index = self::getIndex( $searchKey ); $index = self::getIndex( $searchKey );
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) { if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {

File diff suppressed because it is too large Load Diff

View File

@@ -59,7 +59,7 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
] ]
], ],
[ 'punja', [ [ 'punja', [
'pa' => 'punjaabi sennii', 'pa' => 'punjabi <èdè punjabi>',
'pnb' => 'punjabi western', 'pnb' => 'punjabi western',
] ]
], ],
@@ -79,7 +79,15 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
'zh-hk' => 'chinese (hong kong)', 'zh-hk' => 'chinese (hong kong)',
'zh-min-nan' => 'chinese (min nan)', 'zh-min-nan' => 'chinese (min nan)',
'zh-sg' => 'chinese (singapore)', 'zh-sg' => 'chinese (singapore)',
'zh-tw' => 'chinese (taiwan)' 'zh-tw' => 'chinese (taiwan)',
'cdo' => 'chinese <min dong chinese>',
'gan' => 'chinese <isi-gan chinese>',
'hak' => 'chinese <isi-hakka chinese>',
'lzh' => 'chinesesch <klassescht chinesesch>',
'nan' => 'chinese <isi-min nan chinese>',
'wuu' => 'chinese <isi-wu chinese>',
'zh-classical' => 'chinese <classical chinese>',
'hsn' => 'chinese <isi-xiang chinese>',
] ]
], ],
[ 'finish', [ [ 'finish', [