addDescription( 'Script to create language names index.' );
	}

	/**
	 * Build the language name search index and write it to disk.
	 *
	 * For every language code returned by Language::fetchLanguageNames(), the
	 * names obtained from LanguageNames::getNames() are normalised and split
	 * into words. Each word is assigned to a bucket chosen by
	 * LanguageNameSearch::getIndex(), a few hand-picked aliases are added,
	 * bucket statistics are printed, and the buckets are passed to
	 * generateFile().
	 */
	public function execute() {
		global $wgExtraLanguageNames;

		// Avoid local configuration leaking to this script
		$wgExtraLanguageNames = [];

		$languages = Language::fetchLanguageNames( null, 'all' );

		$buckets = [];
		foreach ( array_keys( $languages ) as $sourceLanguage ) {
			$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );
			foreach ( $translations as $targetLanguage => $translation ) {
				// Remove directionality markers used in Names.php: users are not
				// going to type these.
				$translation = str_replace( "\xE2\x80\x8E", '', $translation );
				$translation = mb_strtolower( $translation );
				$translation = trim( $translation );

				// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
				// The original name is still shown, but avoid us creating entries such as
				// "(dialekti" or "alpeve)".
				$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
				$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY );

				foreach ( $words as $index => $word ) {
					$bucket = LanguageNameSearch::getIndex( $word );

					// The first word is listed under the full name; any later word is
					// listed as "word <full name>" so that it can be matched on its own.
					$display = $translation;
					if ( $index > 0 ) {
						$display = "$word <$translation>";
					}
					$buckets[$bucket][$display] = $targetLanguage;
				}
			}
		}

		// Some languages don't have a conveniently searchable name in CLDR.
		// For example, the name of Western Punjabi doesn't start with
		// the string "punjabi" in any language, so it cannot be found
		// by people who search in English.
		// To resolve this, some languages are added here locally.
		$specialLanguages = [
			// Catalan, sometimes searched as "Valencià"
			'ca' => 'valencia',
			// Georgian, the transliteration of the autonym is often used for searching
			'ka' => 'kartuli',
			// Western Punjabi, doesn't start with the word "Punjabi" in any language
			'pnb' => 'punjabi western',
			// Simplified and Traditional Chinese, because zh-hans and zh-hant
			// are not mapped to any English name
			'zh-hans' => 'chinese simplified',
			'zh-hant' => 'chinese traditional',
		];

		foreach ( $specialLanguages as $targetLanguage => $translation ) {
			$bucket = LanguageNameSearch::getIndex( $translation );
			$buckets[$bucket][$translation] = $targetLanguage;
		}

		// $specialLanguages guarantees at least one bucket, so $count is never zero.
		$lengths = array_values( array_map( 'count', $buckets ) );
		$count = count( $lengths );
		$min = min( $lengths );
		$max = max( $lengths );
		$avg = array_sum( $lengths ) / $count;

		// A median is only meaningful on ordered data, so sort before picking the
		// middle. Use an integer index, and average the two central values when
		// the number of buckets is even.
		sort( $lengths );
		$middle = (int)( $count / 2 );
		if ( $count % 2 === 1 ) {
			$median = $lengths[$middle];
		} else {
			$median = ( $lengths[$middle - 1] + $lengths[$middle] ) / 2;
		}

		$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
		$this->output( " - largest has $max entries\n - median size is $median entries\n" );
		$this->output( " - average size is $avg entries\n" );

		$this->generateFile( $buckets );
	}

	/**
	 * Fill the '___' placeholder in the file template and write the result to
	 * LanguageNameSearchData.php in this script's directory.
	 *
	 * @param array $buckets Index produced by execute(): bucket => [ display name => language code ]
	 */
	private function generateFile( array $buckets ) {
		// NOTE(review): the statement below looks truncated in this copy of the file
		// (the template body and the first assignments to $data appear to be missing).
		// Restore it from version control before running; it is left untouched here.
		$template = <<s $data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data );
		// Convert spaces to tabs. Since we are not top-level need more tabs.
		// NOTE(review): both patterns below are identical in this copy; presumably they
		// originally matched different indent widths — confirm against the original.
		$data = preg_replace( '/^ /m', "\t\t\t", $data );
		$data = preg_replace( '/^ /m', "\t\t", $data );

		$template = str_replace( '___', $data, $template );

		file_put_contents( __DIR__ . '/LanguageNameSearchData.php', $template );
	}
}

$maintClass = 'LanguageNameIndexer';
require_once RUN_MAINTENANCE_IF_MAIN;