Files
mediawiki-extensions-Univer…/data/LanguageNameSearch.php
Santhosh Thottingal 3bf7361262 LanguageNameSearch: Optimize levenshteinDistance
1. Do string comparison for equality early in the method so that we can
   do early return if it passes.
2. Move the zero length check for string up for early return. This may
   not have any significant change in performance though.

Change-Id: I86bdd612a4a31c5ebfac6bcd7687b829acc69cda
2017-11-30 16:38:41 +05:30

137 lines
3.7 KiB
PHP

<?php
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
/**
 * Looks up language names by prefix, optionally tolerating typos, using the
 * pre-built buckets in LanguageNameSearchData.
 */
class LanguageNameSearch {
	/**
	 * Find the names in one bucket that start with, or closely resemble, a search key.
	 *
	 * The key is lower-cased and mapped to a bucket with getIndex(); only the
	 * names in that single bucket are examined.
	 *
	 * @param string $searchKey Text to search for
	 * @param int $typos Maximum Levenshtein distance between a whole name and
	 *  the key that still counts as a match; 0 (the default) disables fuzzy matching
	 * @return array Matched names, keyed by the code each name maps to in the
	 *  bucket. When several names map to the same code, the last one wins.
	 */
	public static function search( $searchKey, $typos = 0 ) {
		// Pass the encoding explicitly, as the other mb_* calls in this class do,
		// instead of relying on the configured internal encoding.
		$searchKey = mb_strtolower( $searchKey, 'UTF-8' );
		if ( $searchKey === '' ) {
			// Nothing to look up; an empty key has no first letter to pick a bucket with
			return [];
		}

		$index = self::getIndex( $searchKey );
		if ( !isset( LanguageNameSearchData::$buckets[$index] ) ) {
			return [];
		}

		// Byte length is what strncmp() expects; a byte-wise prefix comparison
		// is sufficient because both sides are compared as raw byte strings.
		$keyLength = strlen( $searchKey );
		$results = [];
		foreach ( LanguageNameSearchData::$buckets[$index] as $name => $code ) {
			// Array keys that look like integers are stored as int, hence the cast
			$isPrefixMatch = strncmp( (string)$name, $searchKey, $keyLength ) === 0;
			if ( $isPrefixMatch
				|| ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos )
			) {
				$results[$code] = $name;
			}
		}

		return $results;
	}

	/**
	 * Get the bucket index for a name, derived from the code point of its first letter.
	 *
	 * @param string $name
	 * @return int The code point itself when it is below 4000, otherwise the
	 *  code point rounded down to a multiple of 1000
	 */
	public static function getIndex( $name ) {
		$codepoint = self::getCodepoint( $name );

		if ( $codepoint < 4000 ) {
			// For latin etc. we need smaller buckets for speed
			return $codepoint;
		}

		// Try to group names of same script together
		return $codepoint - ( $codepoint % 1000 );
	}

	/**
	 * Get the code point of first letter of string
	 *
	 * Only one-, two- and three-byte UTF-8 sequences are decoded. A lead byte
	 * of 224 or above is always read as the start of a three-byte sequence, so
	 * a four-byte sequence does not yield its real code point.
	 * NOTE(review): bucket keys are presumably generated with this same
	 * function, so changing the decoding would need regenerated data - confirm
	 * before touching it.
	 *
	 * @param string $str
	 * @return int Code point of first letter of string; 0 when the string is
	 *  empty or ends before a multi-byte sequence is complete
	 */
	public static function getCodepoint( $str ) {
		$values = [];
		$lookingFor = 1;
		$strLen = strlen( $str );
		$number = 0;

		for ( $i = 0; $i < $strLen; $i++ ) {
			$thisValue = ord( $str[$i] );

			if ( $thisValue < 128 ) {
				// Single-byte character: the byte value is the code point
				$number = $thisValue;
				break;
			}

			// Codepoints larger than 127 are represented by multi-byte sequences
			if ( count( $values ) === 0 ) {
				// Lead bytes below 224 start a two-byte sequence; anything
				// from 224 up is treated as the start of a three-byte one.
				$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
			}

			$values[] = $thisValue;

			if ( count( $values ) === $lookingFor ) {
				// Refer http://en.wikipedia.org/wiki/UTF-8#Description
				if ( $lookingFor === 3 ) {
					// 4 payload bits from the lead byte, 6 from each continuation byte
					$number = ( $values[0] % 16 ) * 4096;
					$number += ( $values[1] % 64 ) * 64;
					$number += $values[2] % 64;
				} else {
					// 5 payload bits from the lead byte, 6 from the continuation byte
					$number = ( $values[0] % 32 ) * 64;
					$number += $values[1] % 64;
				}
				break;
			}
		}

		return $number;
	}

	/**
	 * Calculate the Levenshtein distance between two strings
	 *
	 * Strings are compared character by character as UTF-8, keeping only two
	 * rows of the distance matrix at a time.
	 *
	 * @param string $str1
	 * @param string $str2
	 * @return int Minimum number of single-character insertions, deletions
	 *  and substitutions needed to turn one string into the other
	 */
	public static function levenshteinDistance( $str1, $str2 ) {
		if ( $str1 === $str2 ) {
			return 0;
		}

		$length1 = mb_strlen( $str1, 'UTF-8' );
		$length2 = mb_strlen( $str2, 'UTF-8' );
		if ( $length1 === 0 ) {
			return $length2;
		}
		if ( $length2 === 0 ) {
			return $length1;
		}

		// Keep the shorter string as $str2 so the rows are as short as possible
		if ( $length1 < $length2 ) {
			return self::levenshteinDistance( $str2, $str1 );
		}

		// Split $str2 into characters once. Extracting them with mb_substr()
		// inside the nested loop repeats the same work for every character of $str1.
		$chars2 = [];
		for ( $j = 0; $j < $length2; $j++ ) {
			$chars2[] = mb_substr( $str2, $j, 1, 'UTF-8' );
		}

		$prevRow = range( 0, $length2 );
		for ( $i = 0; $i < $length1; $i++ ) {
			$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
			// Distance from the first $i + 1 characters of $str1 to an empty string
			$currentRow = [ $i + 1 ];

			foreach ( $chars2 as $j => $c2 ) {
				$insertions = $prevRow[$j + 1] + 1;
				$deletions = $currentRow[$j] + 1;
				$substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
				$currentRow[] = min( $insertions, $deletions, $substitutions );
			}

			$prevRow = $currentRow;
		}

		return $prevRow[$length2];
	}
}