100 ) { // Searching with long search keys for language names is not useful. So, return early. return []; } $languageNameUtils = MediaWikiServices::getInstance()->getLanguageNameUtils(); // Always prefer exact language code match if ( $languageNameUtils->isKnownLanguageTag( $searchKey ) ) { $name = mb_strtolower( $languageNameUtils->getLanguageName( $searchKey, $userLanguage ) ); // Check if language code is a prefix of the name if ( strpos( $name, $searchKey ) === 0 ) { $results[$searchKey] = $name; } else { $results[$searchKey] = "$searchKey – $name"; } } $index = self::getIndex( $searchKey ); $bucketsForIndex = LanguageNameSearchData::$buckets[$index] ?? []; // types are 'prefix', 'infix' (in this order!) foreach ( $bucketsForIndex as $bucket ) { foreach ( $bucket as $name => $code ) { // We can skip checking languages we already have in the list if ( isset( $results[ $code ] ) ) { continue; } // Apply fuzzy search if ( !self::matchNames( $name, $searchKey, $typos ) ) { continue; } // Once we find a match, figure out the best name to display to the user // If $userLanguage is not provided (null), it is the same as autonym $candidates = [ mb_strtolower( $languageNameUtils->getLanguageName( $code, $userLanguage ) ), mb_strtolower( $languageNameUtils->getLanguageName( $code, LanguageNameUtils::AUTONYMS ) ), $name ]; foreach ( $candidates as $candidate ) { if ( $searchKey === $candidate ) { $results[$code] = $candidate; continue 2; } } foreach ( $candidates as $candidate ) { if ( self::matchNames( $candidate, $searchKey, $typos ) ) { $results[$code] = $candidate; continue 2; } } } } return $results; } /** * @param string $name * @param string $searchKey * @param int $typos * @return bool */ public static function matchNames( $name, $searchKey, $typos ) { return strrpos( $name, $searchKey, -strlen( $name ) ) !== false || ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos ); } /** * @param string $name * @return int */ public static function getIndex( $name ) { $codepoint = self::getCodepoint( $name ); if ( $codepoint < 4000 ) { // For latin etc. we need smaller buckets for speed return $codepoint; } // Try to group names of same script together return $codepoint - ( $codepoint % 1000 ); } /** * Get the code point of first letter of string * * @param string $str * @return int Code point of first letter of string */ public static function getCodepoint( $str ) { $values = []; $lookingFor = 1; $strLen = strlen( $str ); $number = 0; for ( $i = 0; $i < $strLen; $i++ ) { $thisValue = ord( $str[$i] ); if ( $thisValue < 128 ) { $number = $thisValue; break; } // Codepoints larger than 127 are represented by multi-byte sequences if ( $values === [] ) { // 224 is the lowest non-overlong-encoded codepoint. $lookingFor = ( $thisValue < 224 ) ? 2 : 3; } $values[] = $thisValue; if ( count( $values ) === $lookingFor ) { // Refer http://en.wikipedia.org/wiki/UTF-8#Description if ( $lookingFor === 3 ) { $number = ( $values[0] % 16 ) * 4096; $number += ( $values[1] % 64 ) * 64; $number += $values[2] % 64; } else { $number = ( $values[0] % 32 ) * 64; $number += $values[1] % 64; } break; } } return $number; } /** * Calculate the Levenshtein distance between two strings * @param string $str1 * @param string $str2 * @return int */ public static function levenshteinDistance( $str1, $str2 ) { if ( $str1 === $str2 ) { return 0; } $length1 = mb_strlen( $str1, 'UTF-8' ); $length2 = mb_strlen( $str2, 'UTF-8' ); if ( $length1 === 0 ) { return $length2; } if ( $length1 < $length2 ) { return self::levenshteinDistance( $str2, $str1 ); } $prevRow = range( 0, $length2 ); for ( $i = 0; $i < $length1; $i++ ) { $currentRow = []; $currentRow[0] = $i + 1; $c1 = mb_substr( $str1, $i, 1, 'UTF-8' ); for ( $j = 0; $j < $length2; $j++ ) { $c2 = mb_substr( $str2, $j, 1, 'UTF-8' ); $insertions = $prevRow[$j + 1] + 1; $deletions = $currentRow[$j] + 1; $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 ); $currentRow[] = min( $insertions, $deletions, $substitutions ); } $prevRow = $currentRow; } return $prevRow[$length2]; } }