Improve ULS language search api

* Store prefixes and infixes separately in the data
* First match language code, then prefixes, then infixes
* Try to use suggestion either in user language or autonym first
* use formatversion=2 to avoid escaping Unicode

Using Language::fetchLanguageName might have a small
performance impact. On the other hand, there is now a check
to skip languages we have already found, avoiding some fuzzy
matching.

This is in preparation for a change in jquery.uls to use
the search API more, while trying to reduce the amount of
weird autocompletion suggestions we show to the user.

Bug: T73891
Change-Id: Id94c5352d9a591969bf90144d1d2d5e758d08301
This commit is contained in:
Niklas Laxström
2017-11-27 12:22:25 +01:00
parent a353c5ab65
commit e87dd20cdd
6 changed files with 51632 additions and 46567 deletions

View File

@@ -18,29 +18,91 @@
* @licence MIT License
*/
class LanguageNameSearch {
public static function search( $searchKey, $typos = 0 ) {
/**
* Find languages with fuzzy matching.
* The order of results is as follows:
* 1: exact language code match
* 2: exact language name match in any language
* 3: prefix language name match in any language
* 4: infix language name match in any language
*
* The returned language name for autocompletion is the first one that
* matches in this list:
* 1: exact match in [user, autonym, any other language]
* 2: prefix match in [user, autonym, any other language]
* 3: infix match in [user, autonym, any other language]
*
* @param string $searchKey
* @param int $typos
* @param string $userLanguage Language tag.
* @return array
*/
public static function search( $searchKey, $typos = 0, $userLanguage = null ) {
$results = [];
$searchKey = mb_strtolower( $searchKey );
$index = self::getIndex( $searchKey );
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
return [];
// Always prefer exact language code match
if ( Language::isKnownLanguageTag( $searchKey ) ) {
$name = mb_strtolower( Language::fetchLanguageName( $searchKey, $userLanguage ) );
// Check if language code is a prefix of the name
if ( strpos( $name, $searchKey ) === 0 ) {
$results[$searchKey] = $name;
} else {
$results[$searchKey] = "$searchKey <$name>";
}
}
$bucket = LanguageNameSearchData::$buckets[$index];
$index = self::getIndex( $searchKey );
$bucketsForIndex = [];
if ( isset( LanguageNameSearchData::$buckets[$index] ) ) {
$bucketsForIndex = LanguageNameSearchData::$buckets[$index];
}
$results = [];
foreach ( $bucket as $name => $code ) {
// Prefix search
if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
) {
$results[$code] = $name;
// types are 'prefix', 'infix' (in this order!)
foreach ( $bucketsForIndex as $bucketType => $bucket ) {
foreach ( $bucket as $name => $code ) {
// We can skip checking languages we already have in the list
if ( isset( $results[ $code ] ) ) {
continue;
}
// Apply fuzzy search
if ( !self::matchNames( $name, $searchKey, $typos ) ) {
continue;
}
// Once we find a match, figure out the best name to display to the user
// If $userLanguage is not provided (null), it is the same as autonym
$candidates = [
mb_strtolower( Language::fetchLanguageName( $code, $userLanguage ) ),
mb_strtolower( Language::fetchLanguageName( $code, null ) ),
$name
];
foreach ( $candidates as $candidate ) {
if ( $searchKey === $candidate ) {
$results[$code] = $candidate;
continue 2;
}
}
foreach ( $candidates as $candidate ) {
if ( self::matchNames( $candidate, $searchKey, $typos ) ) {
$results[$code] = $candidate;
continue 2;
}
}
}
}
return $results;
}
/**
 * Decide whether a language name matches the search key.
 *
 * A name matches when it starts with the search key. When typos are
 * allowed ($typos greater than zero), it also matches when the distance
 * reported by self::levenshteinDistance() between the name and the key
 * does not exceed $typos.
 *
 * @param string $name Candidate language name.
 * @param string $searchKey Search term to compare against.
 * @param int $typos Maximum accepted distance; fuzzy matching is skipped unless this is above zero.
 * @return bool True when the name is considered a match.
 */
public static function matchNames( $name, $searchKey, $typos ) {
	// The negative offset equal to the full name length restricts the
	// reverse search to position 0, which makes this a prefix test.
	$startsWithKey = strrpos( $name, $searchKey, -strlen( $name ) ) !== false;
	if ( $startsWithKey ) {
		return true;
	}

	// Only pay for the distance computation when fuzzy matching is requested.
	return $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos;
}
public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name );