Merge "Improve ULS language search api"

2017-12-01 04:47:30 +00:00
parent 3bf7361262 e87dd20cdd
commit 603cfea7d0
6 changed files with 51632 additions and 46567 deletions
--- a/data/LanguageNameIndexer.php
+++ b/data/LanguageNameIndexer.php
@@ -58,11 +58,13 @@ class LanguageNameIndexer extends Maintenance {
 				foreach ( $words as $index => $word ) {
 					$bucket = LanguageNameSearch::getIndex( $word );

+					$type = 'prefix';
 					$display = $translation;
 					if ( $index > 0 && count( $words ) > 1 ) {
+						$type = 'infix';
 						$display = "$word <$translation>";
 					}
-					$buckets[$bucket][$display] = $targetLanguage;
+					$buckets[$bucket][$type][$display] = $targetLanguage;
 				}
 			}
 		}
@@ -87,10 +89,25 @@ class LanguageNameIndexer extends Maintenance {

 		foreach ( $specialLanguages as $targetLanguage => $translation ) {
 			$bucket = LanguageNameSearch::getIndex( $translation );
-			$buckets[$bucket][$translation] = $targetLanguage;
+			$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
+		}
+
+		$lengths = [];
+		// Sorting the bucket contents gives two benefits:
+		// - more consistent output across environments
+		// - shortest matches appear first, especially exact matches
+		// Sort buckets by index
+		ksort( $buckets );
+		foreach ( $buckets as $index => &$bucketTypes ) {
+			$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
+			// Ensure 'prefix' is before 'infix';
+			krsort( $bucketTypes );
+			// Ensure each bucket has entries sorted
+			foreach ( $bucketTypes as $type => &$bucket ) {
+				ksort( $bucket );
+			}
 		}

-		$lengths = array_values( array_map( 'count', $buckets ) );
 		$count = count( $buckets );
 		$min = min( $lengths );
 		$max = max( $lengths );
@@ -113,7 +130,6 @@ class LanguageNameSearchData {

 PHP;

-		ksort( $buckets );
 		// Format for short array format
 		$data = var_export( $buckets, true );
 		$data = str_replace( "array (", '[', $data );
--- a/data/LanguageNameSearch.php
+++ b/data/LanguageNameSearch.php
@@ -18,29 +18,91 @@
 * @licence MIT License
 */
 class LanguageNameSearch {
-	public static function search( $searchKey, $typos = 0 ) {
+	/**
+	 * Find languages with fuzzy matching.
+	 * The order of results is following:
+	 * 1: exact language code match
+	 * 2: exact language name match in any language
+	 * 3: prefix language name match in any language
+	 * 4: infix language name match in any language
+	 *
+	 * The returned language name for autocompletion is the first one that
+	 * matches in this list:
+	 * 1: exact match in [user, autonym, any other language]
+	 * 2: prefix match in [user, autonum, any other language]
+	 * 3: inline match in [user, autonym, any other language]
+	 *
+	 * @param string $searchKey
+	 * @param int $typos
+	 * @param string $userLanguage Language tag.
+	 * @return array
+	 */
+	public static function search( $searchKey, $typos = 0, $userLanguage = null ) {
+		$results = [];
 		$searchKey = mb_strtolower( $searchKey );
-		$index = self::getIndex( $searchKey );

-		if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
-			return [];
+		// Always prefer exact language code match
+		if ( Language::isKnownLanguageTag( $searchKey ) ) {
+			$name = mb_strtolower( Language::fetchLanguageName( $searchKey, $userLanguage ) );
+			// Check if language code is a prefix of the name
+			if ( strpos( $name, $searchKey ) === 0 ) {
+				$results[$searchKey] = $name;
+			} else {
+				$results[$searchKey] = "$searchKey <$name>";
+			}
 		}

-		$bucket = LanguageNameSearchData::$buckets[$index];
+		$index = self::getIndex( $searchKey );
+		$bucketsForIndex = [];
+		if ( isset( LanguageNameSearchData::$buckets[$index] ) ) {
+			$bucketsForIndex = LanguageNameSearchData::$buckets[$index];
+		}

-		$results = [];
-		foreach ( $bucket as $name => $code ) {
-			// Prefix search
-			if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false
-				|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
-			) {
-				$results[$code] = $name;
+		// types are 'prefix', 'infix' (in this order!)
+		foreach ( $bucketsForIndex as $bucketType => $bucket ) {
+			foreach ( $bucket as $name => $code ) {
+				// We can skip checking languages we already have in the list
+				if ( isset( $results[ $code ] ) ) {
+					continue;
+				}
+
+				// Apply fuzzy search
+				if ( !self::matchNames( $name, $searchKey, $typos ) ) {
+					continue;
+				}
+
+				// Once we find a match, figure out the best name to display to the user
+				// If $userLanguage is not provided (null), it is the same as autonym
+				$candidates = [
+					mb_strtolower( Language::fetchLanguageName( $code, $userLanguage ) ),
+					mb_strtolower( Language::fetchLanguageName( $code, null ) ),
+					$name
+				];
+
+				foreach ( $candidates as $candidate ) {
+					if ( $searchKey === $candidate ) {
+						$results[$code] = $candidate;
+						continue 2;
+					}
+				}
+
+				foreach ( $candidates as $candidate ) {
+					if ( self::matchNames( $candidate, $searchKey, $typos ) ) {
+						$results[$code] = $candidate;
+						continue 2;
+					}
+				}
 			}
 		}

 		return $results;
 	}

+	public static function matchNames( $name, $searchKey, $typos ) {
+		return strrpos( $name, $searchKey, -strlen( $name ) ) !== false
+			|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos );
+	}
+
 	public static function getIndex( $name ) {
 		$codepoint = self::getCodepoint( $name );

--- a/data/LanguageNameSearchData.php
+++ b/data/LanguageNameSearchData.php