Merge "Improve ULS language search api"
This commit is contained in:
@@ -58,11 +58,13 @@ class LanguageNameIndexer extends Maintenance {
|
||||
foreach ( $words as $index => $word ) {
|
||||
$bucket = LanguageNameSearch::getIndex( $word );
|
||||
|
||||
$type = 'prefix';
|
||||
$display = $translation;
|
||||
if ( $index > 0 && count( $words ) > 1 ) {
|
||||
$type = 'infix';
|
||||
$display = "$word <$translation>";
|
||||
}
|
||||
$buckets[$bucket][$display] = $targetLanguage;
|
||||
$buckets[$bucket][$type][$display] = $targetLanguage;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -87,10 +89,25 @@ class LanguageNameIndexer extends Maintenance {
|
||||
|
||||
foreach ( $specialLanguages as $targetLanguage => $translation ) {
|
||||
$bucket = LanguageNameSearch::getIndex( $translation );
|
||||
$buckets[$bucket][$translation] = $targetLanguage;
|
||||
$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
|
||||
}
|
||||
|
||||
$lengths = [];
|
||||
// Sorting the bucket contents gives two benefits:
|
||||
// - more consistent output across environments
|
||||
// - shortest matches appear first, especially exact matches
|
||||
// Sort buckets by index
|
||||
ksort( $buckets );
|
||||
foreach ( $buckets as $index => &$bucketTypes ) {
|
||||
$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
|
||||
// Ensure 'prefix' is before 'infix';
|
||||
krsort( $bucketTypes );
|
||||
// Ensure each bucket has entries sorted
|
||||
foreach ( $bucketTypes as $type => &$bucket ) {
|
||||
ksort( $bucket );
|
||||
}
|
||||
}
|
||||
|
||||
$lengths = array_values( array_map( 'count', $buckets ) );
|
||||
$count = count( $buckets );
|
||||
$min = min( $lengths );
|
||||
$max = max( $lengths );
|
||||
@@ -113,7 +130,6 @@ class LanguageNameSearchData {
|
||||
|
||||
PHP;
|
||||
|
||||
ksort( $buckets );
|
||||
// Format for short array format
|
||||
$data = var_export( $buckets, true );
|
||||
$data = str_replace( "array (", '[', $data );
|
||||
|
||||
@@ -18,29 +18,91 @@
|
||||
* @licence MIT License
|
||||
*/
|
||||
class LanguageNameSearch {
|
||||
public static function search( $searchKey, $typos = 0 ) {
|
||||
/**
|
||||
* Find languages with fuzzy matching.
|
||||
* The order of results is following:
|
||||
* 1: exact language code match
|
||||
* 2: exact language name match in any language
|
||||
* 3: prefix language name match in any language
|
||||
* 4: infix language name match in any language
|
||||
*
|
||||
* The returned language name for autocompletion is the first one that
|
||||
* matches in this list:
|
||||
* 1: exact match in [user, autonym, any other language]
|
||||
* 2: prefix match in [user, autonum, any other language]
|
||||
* 3: inline match in [user, autonym, any other language]
|
||||
*
|
||||
* @param string $searchKey
|
||||
* @param int $typos
|
||||
* @param string $userLanguage Language tag.
|
||||
* @return array
|
||||
*/
|
||||
public static function search( $searchKey, $typos = 0, $userLanguage = null ) {
|
||||
$results = [];
|
||||
$searchKey = mb_strtolower( $searchKey );
|
||||
$index = self::getIndex( $searchKey );
|
||||
|
||||
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
|
||||
return [];
|
||||
// Always prefer exact language code match
|
||||
if ( Language::isKnownLanguageTag( $searchKey ) ) {
|
||||
$name = mb_strtolower( Language::fetchLanguageName( $searchKey, $userLanguage ) );
|
||||
// Check if language code is a prefix of the name
|
||||
if ( strpos( $name, $searchKey ) === 0 ) {
|
||||
$results[$searchKey] = $name;
|
||||
} else {
|
||||
$results[$searchKey] = "$searchKey <$name>";
|
||||
}
|
||||
}
|
||||
|
||||
$bucket = LanguageNameSearchData::$buckets[$index];
|
||||
$index = self::getIndex( $searchKey );
|
||||
$bucketsForIndex = [];
|
||||
if ( isset( LanguageNameSearchData::$buckets[$index] ) ) {
|
||||
$bucketsForIndex = LanguageNameSearchData::$buckets[$index];
|
||||
}
|
||||
|
||||
$results = [];
|
||||
foreach ( $bucket as $name => $code ) {
|
||||
// Prefix search
|
||||
if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|
||||
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
|
||||
) {
|
||||
$results[$code] = $name;
|
||||
// types are 'prefix', 'infix' (in this order!)
|
||||
foreach ( $bucketsForIndex as $bucketType => $bucket ) {
|
||||
foreach ( $bucket as $name => $code ) {
|
||||
// We can skip checking languages we already have in the list
|
||||
if ( isset( $results[ $code ] ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Apply fuzzy search
|
||||
if ( !self::matchNames( $name, $searchKey, $typos ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Once we find a match, figure out the best name to display to the user
|
||||
// If $userLanguage is not provided (null), it is the same as autonym
|
||||
$candidates = [
|
||||
mb_strtolower( Language::fetchLanguageName( $code, $userLanguage ) ),
|
||||
mb_strtolower( Language::fetchLanguageName( $code, null ) ),
|
||||
$name
|
||||
];
|
||||
|
||||
foreach ( $candidates as $candidate ) {
|
||||
if ( $searchKey === $candidate ) {
|
||||
$results[$code] = $candidate;
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
|
||||
foreach ( $candidates as $candidate ) {
|
||||
if ( self::matchNames( $candidate, $searchKey, $typos ) ) {
|
||||
$results[$code] = $candidate;
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
public static function matchNames( $name, $searchKey, $typos ) {
|
||||
return strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|
||||
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos );
|
||||
}
|
||||
|
||||
public static function getIndex( $name ) {
|
||||
$codepoint = self::getCodepoint( $name );
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user