Files
mediawiki-extensions-Univer…/data/LanguageNameSearch.php
Niklas Laxström b3ba423354 LanguageNameIndexer: Generate PHP file instead of serialized file.
Serialized format is no longer in style for data. PHP files can
take advantage of AutoLoader and caching so they can even be faster
than serialized files. As side bonus we can have readable diffs
for updates.

Only downside is that the file generation takes about ten lines of
ugly string manipulation.

Change-Id: If09704d1172daa13c72a308814534cac1fe9899f
2016-08-08 07:55:42 +00:00

133 lines
3.6 KiB
PHP

<?php
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
class LanguageNameSearch {
public static function search( $searchKey, $typos = 0 ) {
// Use code's mb_strtolower compatibily code for MW < 1.27
$language = Language::factory( 'en' );
// @todo: Shouldn't this be unicode aware?
$searchKey = $language->lc( $searchKey );
$index = self::getIndex( $searchKey );
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
return [];
}
$bucket = LanguageNameSearchData::$buckets[$index];
$results = [];
foreach ( $bucket as $name => $code ) {
// Prefix search
if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
) {
$results[$code] = $name;
}
}
return $results;
}
public static function getIndex( $name ) {
return self::getCodepoint( $name ) % 1000;
}
/**
* Get the code point of first letter of string
*
* @param $str string
* @return integer Code point of first letter of string
*/
public static function getCodepoint( $str ) {
$values = [];
$lookingFor = 1;
$strLen = strlen( $str );
$number = 0;
for ( $i = 0; $i < $strLen; $i++ ) {
$thisValue = ord( $str[$i] );
if ( $thisValue < 128 ) {
$number = $thisValue;
break;
} else {
// Codepoints larger than 127 are represented by multi-byte sequences
if ( count( $values ) === 0 ) {
// 224 is the lowest non-overlong-encoded codepoint.
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
}
$values[] = $thisValue;
if ( count( $values ) === $lookingFor ) {
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
if ( $lookingFor === 3 ) {
$number = ( $values[0] % 16 ) * 4096;
$number += ( $values[1] % 64 ) * 64;
$number += $values[2] % 64;
} else {
$number = ( $values[0] % 32 ) * 64;
$number += $values[1] % 64;
}
break;
}
}
}
return $number;
}
/**
* Calculate the Levenshtein distance between two strings
* @param $str1
* @param $str2
* @return integer
*/
public static function levenshteinDistance( $str1, $str2 ) {
$length1 = mb_strlen( $str1, 'UTF-8' );
$length2 = mb_strlen( $str2, 'UTF-8' );
if ( $length1 < $length2 ) {
return self::levenshteinDistance( $str2, $str1 );
}
if ( $length1 === 0 ) {
return $length2;
}
if ( $str1 === $str2 ) {
return 0;
}
$prevRow = range( 0, $length2 );
for ( $i = 0; $i < $length1; $i++ ) {
$currentRow = [];
$currentRow[0] = $i + 1;
$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
for ( $j = 0; $j < $length2; $j++ ) {
$c2 = mb_substr( $str2, $j, 1, 'UTF-8' );
$insertions = $prevRow[$j + 1] + 1;
$deletions = $currentRow[$j] + 1;
$substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
$currentRow[] = min( $insertions, $deletions, $substitutions );
}
$prevRow = $currentRow;
}
return $prevRow[$length2];
}
}