Update language name search index

I noticed some language names are not searchable. I made it so
that autonyms from language-data are added to the search index.
Without this, languages not present in Names.php or in the CLDR
extension are not searchable via the API except by language code.

Change-Id: I51a9e2eb15fb40963e6edbf1db76133d84de7291
This commit is contained in:
Niklas Laxström
2019-05-19 17:49:24 +02:00
parent ede9c683a9
commit 6939354e16
3 changed files with 3189 additions and 1080 deletions

View File

@@ -36,12 +36,26 @@ class LanguageNameIndexer extends Maintenance {
// Avoid local configuration leaking to this script // Avoid local configuration leaking to this script
$wgExtraLanguageNames = []; $wgExtraLanguageNames = [];
$languages = Language::fetchLanguageNames( null, 'all' ); $languageNames = [];
// Add languages from language-data
$ulsLanguages = $this->getLanguageData()[ 'languages' ];
foreach ( $ulsLanguages as $languageCode => $languageEntry ) {
// Redirects have only one item, so they have no autonym at index 2
if ( isset( $languageEntry[ 2 ] ) ) {
$languageNames[ 'autonyms' ][ $languageCode ] = $languageEntry[ 2 ];
}
}
// Languages and their names in different languages from Names.php and the cldr extension
// This comes after $ulsLanguages so that for example the als/gsw mixup is using the code
// used in the Wikimedia world.
$mwLanguages = Language::fetchLanguageNames( null, 'all' );
foreach ( array_keys( $mwLanguages ) as $languageCode ) {
$languageNames[ $languageCode ] = LanguageNames::getNames( $languageCode, 0, 2 );
}
$buckets = []; $buckets = [];
foreach ( $languages as $sourceLanguage => $autonym ) { foreach ( $languageNames as $translations ) {
$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );
foreach ( $translations as $targetLanguage => $translation ) { foreach ( $translations as $targetLanguage => $translation ) {
// Remove directionality markers used in Names.php: users are not // Remove directionality markers used in Names.php: users are not
// going to type these. // going to type these.
@@ -128,6 +142,15 @@ class LanguageNameIndexer extends Maintenance {
$this->generateFile( $buckets ); $this->generateFile( $buckets );
} }
/**
 * Load the language data bundled with jquery.uls.
 *
 * Reads jquery.uls.data.js relative to this script, extracts the text
 * assigned to `$.uls.data` (everything up to the closing `} ( jQuery )`)
 * and decodes that text as JSON into associative arrays.
 *
 * Each step is verified so that a missing file, a change in the layout of
 * the JavaScript file, or invalid JSON stops the script instead of silently
 * producing a search index that lacks the language-data entries.
 *
 * @return array Decoded language data
 * @throws \RuntimeException If the file cannot be read, the data assignment
 *  cannot be located, or the extracted text does not decode to an array
 */
private function getLanguageData() {
	$file = __DIR__ . '/../lib/jquery.uls/src/jquery.uls.data.js';

	$contents = file_get_contents( $file );
	if ( $contents === false ) {
		throw new \RuntimeException( "Unable to read language data from $file" );
	}

	// preg_match returns 0 when nothing matches and false on a PCRE error;
	// in both cases $matches[ 1 ] would not be set.
	$matches = [];
	if ( !preg_match( '/.*\$\.uls\.data = (.*?)} \( jQuery \)/s', $contents, $matches ) ) {
		throw new \RuntimeException( "Unable to locate the \$.uls.data assignment in $file" );
	}

	$data = json_decode( $matches[ 1 ], true );
	if ( !is_array( $data ) ) {
		throw new \RuntimeException(
			"Unable to decode language data from $file: " . json_last_error_msg()
		);
	}

	return $data;
}
private function generateFile( array $buckets ) { private function generateFile( array $buckets ) {
$template = <<<PHP $template = <<<PHP
<?php <?php

File diff suppressed because it is too large Load Diff

View File

@@ -103,10 +103,12 @@ class LanguageSearchTest extends PHPUnit\Framework\TestCase {
'zh' => 'chinese', 'zh' => 'chinese',
'zh-cn' => 'chinese (china)', 'zh-cn' => 'chinese (china)',
'zh-hk' => 'chinese (hong kong)', 'zh-hk' => 'chinese (hong kong)',
'zh-mo' => 'chinese (macau)',
'zh-my' => 'chinese (malaysia)',
'zh-min-nan' => 'chinese (min nan)', 'zh-min-nan' => 'chinese (min nan)',
'zh-sg' => 'chinese (singapore)', 'zh-sg' => 'chinese (singapore)',
'zh-tw' => 'chinese (taiwan)', 'zh-tw' => 'chinese (taiwan)',
'zh-hans' => 'chinese simplified', 'zh-hans' => 'chinese simplificate',
'zh-hant' => 'chinese traditional', 'zh-hant' => 'chinese traditional',
'zh-classical' => 'chinese — classical chinese', 'zh-classical' => 'chinese — classical chinese',
'gan' => 'chinese — gan chinese', 'gan' => 'chinese — gan chinese',