Files
mediawiki-extensions-Univer…/data/LanguageNameSearch.php
Niklas Laxström 55b68c329d LanguageNameSearch: do not mix different scripts in same buckets
To keep the average and maximum bucket size low, I made codepoints
< 4000 more granular and code points >= 4000 less granular. This
could be tweaked further for sure to reach more even sized buckets.

Bucket stats before:
 - 773 buckets
 - smallest has 1 entries
 - largest has 1804 entries
 - median size is 66 entries
 - average size is 45.394566623545 entries

Bucket stats after:
 - 698 buckets
 - smallest has 1 entries
 - largest has 1792 entries
 - median size is 16 entries
 - average size is 50.272206303725 entries

Change-Id: Id62d93658117564b05294c2fe36ca7c182784859
2016-08-08 16:21:52 +02:00

141 lines
3.8 KiB
PHP

<?php
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
class LanguageNameSearch {
public static function search( $searchKey, $typos = 0 ) {
// Use code's mb_strtolower compatibily code for MW < 1.27
$language = Language::factory( 'en' );
// @todo: Shouldn't this be unicode aware?
$searchKey = $language->lc( $searchKey );
$index = self::getIndex( $searchKey );
if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
return [];
}
$bucket = LanguageNameSearchData::$buckets[$index];
$results = [];
foreach ( $bucket as $name => $code ) {
// Prefix search
if ( strrpos( $name, $searchKey, -strlen( $name ) ) !== false
|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
) {
$results[$code] = $name;
}
}
return $results;
}
public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name );
if ( $codepoint < 4000 ) {
// For latin etc. we need smaller buckets for speed
return $codepoint;
} else {
// Try to group names of same script together
return $codepoint - ( $codepoint % 1000 );
}
}
/**
* Get the code point of first letter of string
*
* @param $str string
* @return integer Code point of first letter of string
*/
public static function getCodepoint( $str ) {
$values = [];
$lookingFor = 1;
$strLen = strlen( $str );
$number = 0;
for ( $i = 0; $i < $strLen; $i++ ) {
$thisValue = ord( $str[$i] );
if ( $thisValue < 128 ) {
$number = $thisValue;
break;
} else {
// Codepoints larger than 127 are represented by multi-byte sequences
if ( count( $values ) === 0 ) {
// 224 is the lowest non-overlong-encoded codepoint.
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
}
$values[] = $thisValue;
if ( count( $values ) === $lookingFor ) {
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
if ( $lookingFor === 3 ) {
$number = ( $values[0] % 16 ) * 4096;
$number += ( $values[1] % 64 ) * 64;
$number += $values[2] % 64;
} else {
$number = ( $values[0] % 32 ) * 64;
$number += $values[1] % 64;
}
break;
}
}
}
return $number;
}
/**
* Calculate the Levenshtein distance between two strings
* @param $str1
* @param $str2
* @return integer
*/
public static function levenshteinDistance( $str1, $str2 ) {
$length1 = mb_strlen( $str1, 'UTF-8' );
$length2 = mb_strlen( $str2, 'UTF-8' );
if ( $length1 < $length2 ) {
return self::levenshteinDistance( $str2, $str1 );
}
if ( $length1 === 0 ) {
return $length2;
}
if ( $str1 === $str2 ) {
return 0;
}
$prevRow = range( 0, $length2 );
for ( $i = 0; $i < $length1; $i++ ) {
$currentRow = [];
$currentRow[0] = $i + 1;
$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
for ( $j = 0; $j < $length2; $j++ ) {
$c2 = mb_substr( $str2, $j, 1, 'UTF-8' );
$insertions = $prevRow[$j + 1] + 1;
$deletions = $currentRow[$j] + 1;
$substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
$currentRow[] = min( $insertions, $deletions, $substitutions );
}
$prevRow = $currentRow;
}
return $prevRow[$length2];
}
}