Merge "Improve ULS language search api"

This commit is contained in:
jenkins-bot
2017-12-01 04:47:30 +00:00
committed by Gerrit Code Review
6 changed files with 51632 additions and 46567 deletions

View File

@@ -22,12 +22,11 @@
* @ingroup API * @ingroup API
*/ */
class ApiLanguageSearch extends ApiBase { class ApiLanguageSearch extends ApiBase {
public function execute() { public function execute() {
$params = $this->extractRequestParams(); $params = $this->extractRequestParams();
$search = $params['search']; $search = $params['search'];
$typos = $params['typos']; $typos = $params['typos'];
$searches = LanguageNameSearch::search( $search, $typos ); $searches = LanguageNameSearch::search( $search, $typos, $this->getLanguage()->getCode() );
$result = $this->getResult(); $result = $this->getResult();
$result->addValue( null, $this->getModuleName(), $searches ); $result->addValue( null, $this->getModuleName(), $searches );
} }

View File

@@ -58,11 +58,13 @@ class LanguageNameIndexer extends Maintenance {
foreach ( $words as $index => $word ) { foreach ( $words as $index => $word ) {
$bucket = LanguageNameSearch::getIndex( $word ); $bucket = LanguageNameSearch::getIndex( $word );
$type = 'prefix';
$display = $translation; $display = $translation;
if ( $index > 0 && count( $words ) > 1 ) { if ( $index > 0 && count( $words ) > 1 ) {
$type = 'infix';
$display = "$word <$translation>"; $display = "$word <$translation>";
} }
$buckets[$bucket][$display] = $targetLanguage; $buckets[$bucket][$type][$display] = $targetLanguage;
} }
} }
} }
@@ -87,10 +89,25 @@ class LanguageNameIndexer extends Maintenance {
foreach ( $specialLanguages as $targetLanguage => $translation ) { foreach ( $specialLanguages as $targetLanguage => $translation ) {
$bucket = LanguageNameSearch::getIndex( $translation ); $bucket = LanguageNameSearch::getIndex( $translation );
$buckets[$bucket][$translation] = $targetLanguage; $buckets[$bucket]['prefix'][$translation] = $targetLanguage;
}
$lengths = [];
// Sorting the bucket contents gives two benefits:
// - more consistent output across environments
// - shortest matches appear first, especially exact matches
// Sort buckets by index
ksort( $buckets );
foreach ( $buckets as $index => &$bucketTypes ) {
$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
// Ensure 'prefix' is before 'infix';
krsort( $bucketTypes );
// Ensure each bucket has entries sorted
foreach ( $bucketTypes as $type => &$bucket ) {
ksort( $bucket );
}
} }
$lengths = array_values( array_map( 'count', $buckets ) );
$count = count( $buckets ); $count = count( $buckets );
$min = min( $lengths ); $min = min( $lengths );
$max = max( $lengths ); $max = max( $lengths );
@@ -113,7 +130,6 @@ class LanguageNameSearchData {
PHP; PHP;
ksort( $buckets );
// Format for short array format // Format for short array format
$data = var_export( $buckets, true ); $data = var_export( $buckets, true );
$data = str_replace( "array (", '[', $data ); $data = str_replace( "array (", '[', $data );

View File

@@ -18,29 +18,91 @@
* @licence MIT License * @licence MIT License
*/ */
class LanguageNameSearch { class LanguageNameSearch {
public static function search( $searchKey, $typos = 0 ) { /**
* Find languages with fuzzy matching.
* The order of results is as follows:
* 1: exact language code match
* 2: exact language name match in any language
* 3: prefix language name match in any language
* 4: infix language name match in any language
*
* The returned language name for autocompletion is the first one that
* matches in this list:
* 1: exact match in [user, autonym, any other language]
* 2: prefix match in [user, autonym, any other language]
* 3: infix match in [user, autonym, any other language]
*
* @param string $searchKey
* @param int $typos
* @param string $userLanguage Language tag.
* @return array
*/
public static function search( $searchKey, $typos = 0, $userLanguage = null ) {
$results = [];
$searchKey = mb_strtolower( $searchKey ); $searchKey = mb_strtolower( $searchKey );
$index = self::getIndex( $searchKey );
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) { // Always prefer exact language code match
return []; if ( Language::isKnownLanguageTag( $searchKey ) ) {
$name = mb_strtolower( Language::fetchLanguageName( $searchKey, $userLanguage ) );
// Check if language code is a prefix of the name
if ( strpos( $name, $searchKey ) === 0 ) {
$results[$searchKey] = $name;
} else {
$results[$searchKey] = "$searchKey <$name>";
}
} }
$bucket = LanguageNameSearchData::$buckets[$index]; $index = self::getIndex( $searchKey );
$bucketsForIndex = [];
if ( isset( LanguageNameSearchData::$buckets[$index] ) ) {
$bucketsForIndex = LanguageNameSearchData::$buckets[$index];
}
$results = []; // types are 'prefix', 'infix' (in this order!)
foreach ( $bucket as $name => $code ) { foreach ( $bucketsForIndex as $bucketType => $bucket ) {
// Prefix search foreach ( $bucket as $name => $code ) {
if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false // We can skip checking languages we already have in the list
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos ) if ( isset( $results[ $code ] ) ) {
) { continue;
$results[$code] = $name; }
// Apply fuzzy search
if ( !self::matchNames( $name, $searchKey, $typos ) ) {
continue;
}
// Once we find a match, figure out the best name to display to the user
// If $userLanguage is not provided (null), it is the same as autonym
$candidates = [
mb_strtolower( Language::fetchLanguageName( $code, $userLanguage ) ),
mb_strtolower( Language::fetchLanguageName( $code, null ) ),
$name
];
foreach ( $candidates as $candidate ) {
if ( $searchKey === $candidate ) {
$results[$code] = $candidate;
continue 2;
}
}
foreach ( $candidates as $candidate ) {
if ( self::matchNames( $candidate, $searchKey, $typos ) ) {
$results[$code] = $candidate;
continue 2;
}
}
} }
} }
return $results; return $results;
} }
public static function matchNames( $name, $searchKey, $typos ) {
return strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos );
}
public static function getIndex( $name ) { public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name ); $codepoint = self::getCodepoint( $name );

File diff suppressed because it is too large Load Diff

View File

@@ -23,7 +23,7 @@
// MediaWiki overrides for ULS defaults // MediaWiki overrides for ULS defaults
$.fn.uls.defaults = $.extend( $.fn.uls.defaults, { $.fn.uls.defaults = $.extend( $.fn.uls.defaults, {
languages: mw.config.get( 'wgULSLanguages' ) || {}, languages: mw.config.get( 'wgULSLanguages' ) || {},
searchAPI: mw.util.wikiScript( 'api' ) + '?action=languagesearch&format=json' searchAPI: mw.util.wikiScript( 'api' ) + '?action=languagesearch&format=json&formatversion=2'
} ); } );
// No need of IME in the ULS language search bar // No need of IME in the ULS language search bar

View File

@@ -23,8 +23,11 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
* @dataProvider searchDataProvider * @dataProvider searchDataProvider
*/ */
public function testSearch( $searchKey, $expected ) { public function testSearch( $searchKey, $expected ) {
$actual = LanguageNameSearch::search( $searchKey, 1 ); $actual = LanguageNameSearch::search( $searchKey, 1, 'en' );
// This is for better error messages
$this->assertEquals( $expected, $actual ); $this->assertEquals( $expected, $actual );
// This is for identical order
$this->assertSame( $expected, $actual );
} }
public function searchDataProvider() { public function searchDataProvider() {
@@ -34,8 +37,8 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
] ]
], ],
[ 'മല', [ [ 'മല', [
'ml' => 'മലയാളം',
'mg' => 'മലഗാസി', 'mg' => 'മലഗാസി',
'ml' => 'മലയാളം',
'ms' => 'മലെയ്', 'ms' => 'മലെയ്',
] ]
], ],
@@ -43,15 +46,16 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
'fi' => 'φινλανδικά', 'fi' => 'φινλανδικά',
] ]
], ],
[ 'blah', [] [ 'blargh', []
], ],
[ 'الفرنسية', [ [ 'الفرنسية', [
'fr' => 'الفرنسية', 'fr' => 'الفرنسية',
'fr-ca' => 'الفرنسية الكندية',
'fr-ch' => 'الفرنسية السويسرية', 'fr-ch' => 'الفرنسية السويسرية',
'frm' => 'الفرنسية الوسطى',
'fro' => 'الفرنسية القديمة', 'fro' => 'الفرنسية القديمة',
'crs' => 'الفرنسية الكريولية السيشيلية' 'frc' => 'الفرنسية الكاجونية',
'crs' => 'الفرنسية الكريولية السيشيلية',
'fr-ca' => 'الفرنسية الكندية',
'frm' => 'الفرنسية الوسطى',
] ]
], ],
[ 'മലയളം', [ [ 'മലയളം', [
@@ -59,7 +63,7 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
] ]
], ],
[ 'punja', [ [ 'punja', [
'pa' => 'punjabi <èdè punjabi>', 'pa' => class_exists( 'LanguageNames' ) ? 'punjabi' : 'punjaabi sennii',
'pnb' => 'punjabi western', 'pnb' => 'punjabi western',
] ]
], ],
@@ -72,22 +76,22 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
] ]
], ],
[ 'chinese', [ [ 'chinese', [
'zh-hans' => 'chinese simplified', 'zh' => 'chinese',
'zh-hant' => 'chinese traditional',
'zh' => 'chinesesch',
'zh-cn' => 'chinese (china)', 'zh-cn' => 'chinese (china)',
'zh-hk' => 'chinese (hong kong)', 'zh-hk' => 'chinese (hong kong)',
'zh-min-nan' => 'chinese (min nan)', 'zh-min-nan' => 'chinese (min nan)',
'zh-sg' => 'chinese (singapore)', 'zh-sg' => 'chinese (singapore)',
'zh-tw' => 'chinese (taiwan)', 'zh-tw' => 'chinese (taiwan)',
'cdo' => 'chinese <min dong chinese>', 'zh-hans' => 'chinese simplified',
'gan' => 'chinese <isi-gan chinese>', 'zh-hant' => 'chinese traditional',
'hak' => 'chinese <isi-hakka chinese>', 'zh-classical' => 'chinese <classical chinese>',
'lzh' => 'chinesesch <klassescht chinesesch>', 'gan' => 'chinese <gan chinese>',
'hak' => 'chinese <hakka chinese>',
'nan' => 'chinese <isi-min nan chinese>', 'nan' => 'chinese <isi-min nan chinese>',
'wuu' => 'chinese <isi-wu chinese>', 'wuu' => 'chinese <isi-wu chinese>',
'zh-classical' => 'chinese <classical chinese>',
'hsn' => 'chinese <isi-xiang chinese>', 'hsn' => 'chinese <isi-xiang chinese>',
'lzh' => 'chinese <literary chinese>',
'cdo' => 'chinese <min dong chinese>',
] ]
], ],
[ 'finish', [ [ 'finish', [