Files
mediawiki-extensions-Univer…/data/LanguageNameIndexer.php
Amir E. Aharoni 5ad9ee98e8 Add Hebrew search aliases for Tigre and Tigrinya
The Tigre and Tigrinya languages are spoken by thousands
of people in Israel, who have more convenient access
to Hebrew keyboards than to Ethiopic keyboards.
The situation is further complicated by the fact that
the languages' names in the Hebrew alphabet have several
variant spellings, especially with regard to the first letter,
which is the most important one for searching.

This patch ensures that the languages are conveniently
findable in any Hebrew-script spelling.

(Every variant is on a separate line because
otherwise code editors mess up RTL text display.)

Bug: T375052
Change-Id: I8a6414718802f091ca1df9367b2dbc170cd568c4
2024-10-15 12:01:44 +00:00

246 lines
8.6 KiB
PHP

<?php
/**
* Script to create language names index.
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @license GPL-2.0-or-later
* @license MIT
*/
// Locate the MediaWiki installation: use MW_INSTALL_PATH when it is set,
// otherwise fall back to the directory three levels above this one.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
require_once "$IP/maintenance/Maintenance.php";
use MediaWiki\Extension\CLDR\LanguageNames;
use MediaWiki\Languages\LanguageNameUtils;
use MediaWiki\MainConfigNames;
/**
 * Maintenance script that builds the language name search index and writes it
 * to LanguageNameSearchData.php in this directory.
 */
class LanguageNameIndexer extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Script to create language names index.' );
		$this->requireExtension( 'UniversalLanguageSelector' );
		$this->requireExtension( 'CLDR' );
	}

	/**
	 * Collect the language names, group them into search buckets, print some
	 * statistics about the buckets and write the generated data file.
	 */
	public function execute() {
		// Avoid local configuration leaking to this script
		if ( $this->getConfig()->get( MainConfigNames::ExtraLanguageNames ) !== [] ) {
			$this->fatalError( 'You have entries in $wgExtraLanguageNames. Needs to be empty for this script.' );
		}

		$buckets = $this->buildBuckets( $this->collectLanguageNames() );

		// Locally maintained aliases are always indexed as 'prefix' entries
		// under their full spelling.
		foreach ( $this->getSpecialLanguages() as $targetLanguage => $translations ) {
			foreach ( $translations as $translation ) {
				$bucket = LanguageNameSearch::getIndex( $translation );
				$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
			}
		}

		// Sorting the bucket contents gives two benefits:
		// - more consistent output across environments
		// - shortest matches appear first, especially exact matches

		// Sort buckets by index
		ksort( $buckets );

		$lengths = [];
		foreach ( $buckets as &$bucketTypes ) {
			$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
			// Ensure 'prefix' is before 'infix';
			krsort( $bucketTypes );
			// Ensure each bucket has entries sorted
			foreach ( $bucketTypes as &$entries ) {
				ksort( $entries );
			}
			// Break the reference so later writes cannot touch the last element
			unset( $entries );
		}
		unset( $bucketTypes );

		$this->outputBucketStats( $lengths );
		$this->generateFile( $buckets );
	}

	/**
	 * Gather the names to index: autonyms from the jquery.uls language data,
	 * plus the names returned by LanguageNames::getNames() for every language
	 * code known to LanguageNameUtils.
	 *
	 * @return array[] Lists of names, each mapping a language code to a name
	 */
	private function collectLanguageNames(): array {
		$languageNames = [];

		// Add languages from language-data
		$ulsLanguages = $this->getLanguageData()[ 'languages' ];
		foreach ( $ulsLanguages as $languageCode => $languageEntry ) {
			// Redirects have only one item, so they have no third element to use
			if ( isset( $languageEntry[ 2 ] ) ) {
				$languageNames[ 'autonyms' ][ $languageCode ] = $languageEntry[ 2 ];
			}
		}

		// Languages and their names in different languages from Names.php and the cldr extension
		// This comes after $ulsLanguages so that for example the als/gsw mixup is using the code
		// used in the Wikimedia world.
		$mwLanguages = $this->getServiceContainer()->getLanguageNameUtils()
			->getLanguageNames( LanguageNameUtils::AUTONYMS, LanguageNameUtils::ALL );
		foreach ( array_keys( $mwLanguages ) as $languageCode ) {
			$languageNames[ $languageCode ] = LanguageNames::getNames( $languageCode, 0, 2 );
		}

		return $languageNames;
	}

	/**
	 * Split every name into words and file each word under its search bucket.
	 *
	 * The first word of a name becomes a 'prefix' entry keyed by the whole
	 * name; later words of at least three characters become 'infix' entries.
	 *
	 * @param array[] $languageNames Lists of names as built by collectLanguageNames()
	 * @return array[] Bucket index => entry type => display string => language code
	 */
	private function buildBuckets( array $languageNames ): array {
		$buckets = [];

		foreach ( $languageNames as $translations ) {
			foreach ( $translations as $targetLanguage => $translation ) {
				$translation = mb_strtolower( $translation );
				$translation = trim( $translation );

				// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
				// The original name is still shown, but avoid us creating entries such as
				// "(dialekti" or "alpeve)".
				$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
				$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY );

				foreach ( $words as $index => $word ) {
					$bucket = LanguageNameSearch::getIndex( $word );

					$type = 'prefix';
					$display = $translation;
					if ( $index > 0 ) {
						// Avoid creating infix entries for short strings like punctuation, articles, prepositions...
						if ( mb_strlen( $word ) < 3 ) {
							continue;
						}

						$type = 'infix';
						// NOTE(review): the word and the full name are joined without any
						// separator here - confirm this is the key format that the code
						// reading LanguageNameSearchData expects.
						$display = "$word$translation";
					}

					$buckets[$bucket][$type][$display] = $targetLanguage;
				}
			}
		}

		return $buckets;
	}

	/**
	 * Locally maintained search aliases.
	 *
	 * Some languages don't have a conveniently searchable name in CLDR.
	 * For example, the name of Western Punjabi doesn't start with
	 * the string "punjabi" in any language, so it cannot be found
	 * by people who search in English.
	 * To resolve this, some languages are added here locally.
	 *
	 * @return string[][] Language code => list of extra searchable names
	 */
	private function getSpecialLanguages(): array {
		return [
			// Abron / Brong / Bono (T369464)
			'abr' => [ 'bono', 'brong' ],
			// Catalan, sometimes searched as "Valencià"
			'ca' => [ 'valencia' ],
			// Compatibility with the old name and other Chinese varieties
			'cdo' => [ 'chinese min dong' ],
			// Spanish, the transliteration of the autonym is often used for searching
			'es' => [ 'castellano' ],
			// Armenian, the transliteration of the autonym is often used for searching
			'hy' => [ 'hayeren' ],
			// Georgian, the transliteration of the autonym is often used for searching
			'ka' => [ 'kartuli', 'qartuli' ],
			// Japanese, the transliteration of the autonym is often used for searching
			'ja' => [ 'nihongo', 'にほんご' ],
			// Chiluvale (T368856)
			'lue' => [ 'luvale, chi-' ],
			// Tigrinya: variant names in Hebrew,
			// to ensure they can be found in different spellings
			'ti' => [
				'טגריניה',
				'טגרינית',
				'טיגריניה',
				'טיגרינית',
				'תגריניה',
				'תגרינית',
				'תיגריניה',
			],
			// Tigre: variant names in Hebrew,
			// to ensure they can be found in different spellings
			'tig' => [
				'טגרה',
				'טגרית',
				'טיגרה',
				'תגרה',
				'תגרית',
				'תיגרה',
				'תיגרית',
			],
			// Mon, renamed in core MediaWiki's Names.php (T352776)
			'mnw' => [ 'ဘာသာ မန်' ],
			// Palembang, also known as "Musi".
			// Writing this as two words ensures that it has a unique key,
			// so that Moore (mos), which is known as "musi" in one of the languages,
			// can also be found
			'mui' => [ 'musi palembang' ],
			// Western Punjabi, doesn't start with the word "Punjabi" in any language
			'pnb' => [ 'punjabi western' ],
			// Tai Nuea (T367377)
			'tdd' => [ 'ᥖᥭᥰᥖᥬᥳᥑᥨᥒᥰ' ],
			// Waale (T368046) - support alternate spellings of the name
			'wlx' => [ 'waali', 'waalii' ],
			// Simplified and Traditional Chinese, because zh-hans and zh-hant
			// are not mapped to any English name
			'zh-hans' => [ 'chinese simplified' ],
			'zh-hant' => [ 'chinese traditional' ],
			// Compatibility with the old name and other Chinese varieties
			'zh-min-nan' => [ 'chinese min nan' ],
		];
	}

	/**
	 * Print how many buckets there are and how their sizes are distributed.
	 *
	 * @param int[] $lengths Number of entries in each bucket
	 */
	private function outputBucketStats( array $lengths ): void {
		$count = count( $lengths );
		if ( $count === 0 ) {
			// min(), max() and the average are undefined without any bucket
			$this->output( "Bucket stats:\n - 0 buckets\n" );
			return;
		}

		$min = min( $lengths );
		$max = max( $lengths );
		$avg = array_sum( $lengths ) / $count;

		// The median is only meaningful on sorted sizes; $lengths arrives in
		// bucket order. For an even count this picks the upper middle value.
		sort( $lengths );
		$median = $lengths[ intdiv( $count, 2 ) ];

		$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
		$this->output( " - largest has $max entries\n - median size is $median entries\n" );
		$this->output( " - average size is $avg entries\n" );
	}

	/**
	 * Read jquery.uls.data.js and decode the value assigned to $.uls.data.
	 *
	 * @return array
	 * @throws RuntimeException If the data file cannot be read
	 * @throws LogicException If the assignment cannot be found or decoded
	 */
	private function getLanguageData(): array {
		$file = __DIR__ . '/../lib/jquery.uls/src/jquery.uls.data.js';
		$contents = file_get_contents( $file );
		if ( $contents === false ) {
			// Report the read failure as such instead of as a syntax error
			throw new RuntimeException( "Unable to read $file" );
		}

		// Capture everything between "$.uls.data =" and the closing "}( jQuery )"
		if ( !preg_match( '/.*\$\.uls\.data\s*=\s*(.*?)\s*}\s*\(\s*jQuery\s*\)/s', $contents, $matches ) ) {
			throw new LogicException( 'Syntax error in jquery.uls.data.js?' );
		}

		$json = $matches[ 1 ];
		$data = json_decode( $json, true );
		// A scalar would decode successfully but is not usable language data
		if ( !is_array( $data ) || !$data ) {
			throw new LogicException( 'json_decode failed. Syntax error in jquery.uls.data.js?' );
		}

		return $data;
	}

	/**
	 * Export the buckets as PHP source in short array syntax, indented with
	 * tabs, and write the result to LanguageNameSearchData.php.
	 *
	 * @param array $buckets
	 */
	private function generateFile( array $buckets ): void {
		$template = <<<'PHP'
<?php
// This file is generated by a script!
class LanguageNameSearchData {
	public static $buckets = ___;
}
PHP;

		// Format for short array format. The patterns are anchored to the line
		// structure of var_export() output so that the same character sequences
		// inside a language name are left untouched.
		$data = var_export( $buckets, true );
		$data = preg_replace( '/array \($/m', '[', $data );
		$data = preg_replace( '/^( *)\),$/m', '\1],', $data );
		// Closing of the array, add correct indentation
		$data = preg_replace( "/\)$/", "\t]", $data );
		// Remove newlines after =>s
		$data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data );
		// Convert spaces to tabs. Since we are not top-level need more tabs.
		// var_export() indents by two spaces per nesting level; the deepest
		// level goes first so that a shorter pattern cannot match part of it.
		$data = preg_replace( '/^ {6}/m', "\t\t\t\t", $data );
		$data = preg_replace( '/^ {4}/m', "\t\t\t", $data );
		$data = preg_replace( '/^ {2}/m', "\t\t", $data );

		$template = str_replace( '___', $data, $template );

		$path = __DIR__ . '/LanguageNameSearchData.php';
		if ( file_put_contents( $path, $template ) === false ) {
			$this->fatalError( "Unable to write $path" );
		}
	}
}
// Name the maintenance class defined in this file, then include the file
// that RUN_MAINTENANCE_IF_MAIN points to.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably
// it comes from the Maintenance.php required above and runs $maintClass - confirm.
$maintClass = LanguageNameIndexer::class;
require_once RUN_MAINTENANCE_IF_MAIN;