Files
mediawiki-extensions-Univer…/data/LanguageNameSearch.php
Fomafix e44e4c4c0c Remove phpcs exclude rules and update PHP code
Change-Id: Ie37deda368a4d163cf324988721033829ae8af77
2022-11-13 18:33:34 +00:00

216 lines
6.0 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
use MediaWiki\MediaWikiServices;
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @license GPL-2.0-or-later
* @license MIT
*/
class LanguageNameSearch {
/**
* Find languages with fuzzy matching.
* The order of results is following:
* 1: exact language code match
* 2: exact language name match in any language
* 3: prefix language name match in any language
* 4: infix language name match in any language
*
* The returned language name for autocompletion is the first one that
* matches in this list:
* 1: exact match in [user, autonym, any other language]
* 2: prefix match in [user, autonym, any other language]
* 3: inline match in [user, autonym, any other language]
*
* @param string $searchKey
* @param int $typos
* @param string|null $userLanguage Language tag.
* @return array
*/
public static function search( $searchKey, $typos = 0, $userLanguage = null ) {
$results = [];
$searchKey = mb_strtolower( $searchKey );
if ( mb_strlen( $searchKey ) > 100 ) {
// Searching with long search keys for language names is not useful. So, return early.
return [];
}
$languageNameUtils = MediaWikiServices::getInstance()->getLanguageNameUtils();
// Always prefer exact language code match
if ( $languageNameUtils->isKnownLanguageTag( $searchKey ) ) {
$name = mb_strtolower( $languageNameUtils->getLanguageName( $searchKey, $userLanguage ) );
// Check if language code is a prefix of the name
if ( strpos( $name, $searchKey ) === 0 ) {
$results[$searchKey] = $name;
} else {
$results[$searchKey] = "$searchKey $name";
}
}
$index = self::getIndex( $searchKey );
$bucketsForIndex = LanguageNameSearchData::$buckets[$index] ?? [];
// types are 'prefix', 'infix' (in this order!)
foreach ( $bucketsForIndex as $bucket ) {
foreach ( $bucket as $name => $code ) {
// We can skip checking languages we already have in the list
if ( isset( $results[ $code ] ) ) {
continue;
}
// Apply fuzzy search
if ( !self::matchNames( $name, $searchKey, $typos ) ) {
continue;
}
// Once we find a match, figure out the best name to display to the user
// If $userLanguage is not provided (null), it is the same as autonym
$candidates = [
mb_strtolower( $languageNameUtils->getLanguageName( $code, $userLanguage ) ),
mb_strtolower( $languageNameUtils->getLanguageName( $code, null ) ),
$name
];
foreach ( $candidates as $candidate ) {
if ( $searchKey === $candidate ) {
$results[$code] = $candidate;
continue 2;
}
}
foreach ( $candidates as $candidate ) {
if ( self::matchNames( $candidate, $searchKey, $typos ) ) {
$results[$code] = $candidate;
continue 2;
}
}
}
}
return $results;
}
/**
* @param string $name
* @param string $searchKey
* @param int $typos
* @return bool
*/
public static function matchNames( $name, $searchKey, $typos ) {
return strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos );
}
/**
* @param string $name
* @return int
*/
public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name );
if ( $codepoint < 4000 ) {
// For latin etc. we need smaller buckets for speed
return $codepoint;
}
// Try to group names of same script together
return $codepoint - ( $codepoint % 1000 );
}
/**
* Get the code point of first letter of string
*
* @param string $str
* @return int Code point of first letter of string
*/
public static function getCodepoint( $str ) {
$values = [];
$lookingFor = 1;
$strLen = strlen( $str );
$number = 0;
for ( $i = 0; $i < $strLen; $i++ ) {
$thisValue = ord( $str[$i] );
if ( $thisValue < 128 ) {
$number = $thisValue;
break;
}
// Codepoints larger than 127 are represented by multi-byte sequences
if ( $values === [] ) {
// 224 is the lowest non-overlong-encoded codepoint.
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
}
$values[] = $thisValue;
if ( count( $values ) === $lookingFor ) {
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
if ( $lookingFor === 3 ) {
$number = ( $values[0] % 16 ) * 4096;
$number += ( $values[1] % 64 ) * 64;
$number += $values[2] % 64;
} else {
$number = ( $values[0] % 32 ) * 64;
$number += $values[1] % 64;
}
break;
}
}
return $number;
}
/**
* Calculate the Levenshtein distance between two strings
* @param string $str1
* @param string $str2
* @return int
*/
public static function levenshteinDistance( $str1, $str2 ) {
if ( $str1 === $str2 ) {
return 0;
}
$length1 = mb_strlen( $str1, 'UTF-8' );
$length2 = mb_strlen( $str2, 'UTF-8' );
if ( $length1 === 0 ) {
return $length2;
}
if ( $length1 < $length2 ) {
return self::levenshteinDistance( $str2, $str1 );
}
$prevRow = range( 0, $length2 );
for ( $i = 0; $i < $length1; $i++ ) {
$currentRow = [];
$currentRow[0] = $i + 1;
$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
for ( $j = 0; $j < $length2; $j++ ) {
$c2 = mb_substr( $str2, $j, 1, 'UTF-8' );
$insertions = $prevRow[$j + 1] + 1;
$deletions = $currentRow[$j] + 1;
$substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
$currentRow[] = min( $insertions, $deletions, $substitutions );
}
$prevRow = $currentRow;
}
return $prevRow[$length2];
}
}