Allow typo in search key

* Introduce Levenshtein algorithm
* New API param 'typos' to give number of typos allowed
* test cases

Change-Id: I22bf34d08a910d1509d7eab5adc292eadc9a7c7d
This commit is contained in:
Santhosh Thottingal
2012-08-01 14:14:55 +05:30
committed by Gerrit Code Review
parent 878313d2ec
commit 76f9038aff
3 changed files with 63 additions and 6 deletions

View File

@@ -30,7 +30,8 @@ class ApiLanguageSearch extends ApiBase {
public function execute() { public function execute() {
$params = $this->extractRequestParams(); $params = $this->extractRequestParams();
$search = $params['search']; $search = $params['search'];
$searches = LanguageNameSearch::search( $search ); $typos = $params['typos'];
$searches = LanguageNameSearch::search( $search, $typos );
$result = $this->getResult(); $result = $this->getResult();
$result->addValue( null, $this->getModuleName(), $searches ); $result->addValue( null, $this->getModuleName(), $searches );
} }
@@ -40,12 +41,18 @@ class ApiLanguageSearch extends ApiBase {
'search' => array( 'search' => array(
ApiBase::PARAM_REQUIRED => true ApiBase::PARAM_REQUIRED => true
), ),
'typos' => array(
ApiBase::PARAM_REQUIRED => false,
ApiBase::PARAM_TYPE => 'integer',
ApiBase::PARAM_DFLT => 1
),
); );
} }
public function getParamDescription() { public function getParamDescription() {
return array( return array(
'search' => 'Search string', 'search' => 'Search string',
'typos' => 'Number of spelling mistakes allowed in the search string',
); );
} }
@@ -57,6 +64,7 @@ class ApiLanguageSearch extends ApiBase {
return array( return array(
'api.php?action=languagesearch&search=Te', 'api.php?action=languagesearch&search=Te',
'api.php?action=languagesearch&search=ഫി', 'api.php?action=languagesearch&search=ഫി',
'api.php?action=languagesearch&search=ഫി&typos=1',
); );
} }
public function getVersion() { public function getVersion() {

View File

@@ -22,11 +22,11 @@ class LanguageNameSearch {
public static function init() { public static function init() {
self::$languagenames = unserialize( file_get_contents( __DIR__ . '/langnames.ser' ) ); self::$languagenames = unserialize( file_get_contents( __DIR__ . '/langnames.ser' ) );
} }
public static function search( $searchKey, $typos = 0 ) {
public static function search( $searchKey ) {
if ( self::$languagenames === null ) { if ( self::$languagenames === null ) {
self::init(); self::init();
} }
$searchKey = strtolower( $searchKey );
$bucket = self::$languagenames[self::getIndex( $searchKey )]; $bucket = self::$languagenames[self::getIndex( $searchKey )];
if ( !$bucket ) { if ( !$bucket ) {
return array(); return array();
@@ -36,10 +36,15 @@ class LanguageNameSearch {
// Prefix search // Prefix search
if ( strpos( $name, $searchKey, 0 ) === 0 ) { if ( strpos( $name, $searchKey, 0 ) === 0 ) {
$results[$code] = $name; $results[$code] = $name;
continue;
}
if ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) === $typos ) {
$results[$code] = $name;
} }
} }
return $results; return $results;
} }
public static function getIndex( $name ) { public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name ); $codepoint = self::getCodepoint( $name );
if ( $codepoint < 1000 ) { if ( $codepoint < 1000 ) {
@@ -52,6 +57,7 @@ class LanguageNameSearch {
} }
return $bucket; return $bucket;
} }
/** /**
* Get the code point of first letter of string * Get the code point of first letter of string
* *
@@ -65,7 +71,7 @@ class LanguageNameSearch {
$thisValue = ord( $str[$i] ); $thisValue = ord( $str[$i] );
if ( $thisValue < 128 ) { if ( $thisValue < 128 ) {
return $thisValue; return $thisValue;
} else { // Codepoints larger than 127 are represented by multi-byte sequences, } else {// Codepoints larger than 127 are represented by multi-byte sequences,
if ( count( $values ) === 0 ) { if ( count( $values ) === 0 ) {
// 224 is the lowest non-overlong-encoded codepoint. // 224 is the lowest non-overlong-encoded codepoint.
$lookingFor = ( $thisValue < 224 ) ? 2 : 3; $lookingFor = ( $thisValue < 224 ) ? 2 : 3;
@@ -73,11 +79,46 @@ class LanguageNameSearch {
$values[] = $thisValue; $values[] = $thisValue;
if ( count( $values ) === $lookingFor ) { if ( count( $values ) === $lookingFor ) {
// Refer http://en.wikipedia.org/wiki/UTF-8#Description // Refer http://en.wikipedia.org/wiki/UTF-8#Description
$number = ( $lookingFor === 3 ) ? ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) : ( ( $values[0] % 32 ) * 64 ) + ( $values[ $number = ( $lookingFor === 3 ) ? ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) : ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
1] % 64 );
return $number; return $number;
} }
} }
} }
} }
/**
 * Calculate the Levenshtein (edit) distance between two UTF-8 strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to turn one string into the other.
 * Characters are compared as UTF-8 characters (via the mb_* functions),
 * not as bytes, so multi-byte scripts are measured per character.
 *
 * @param $str1 string First string
 * @param $str2 string Second string
 * @return integer Edit distance; 0 when the strings are identical
 */
public static function levenshteinDistance( $str1, $str2 ) {
	if ( $str1 === $str2 ) {
		return 0;
	}

	$length1 = mb_strlen( $str1, 'UTF-8' );
	$length2 = mb_strlen( $str2, 'UTF-8' );

	// The distance is symmetric: keep the longer string in $str1 so that
	// the rows held in memory are sized by the shorter string.
	if ( $length1 < $length2 ) {
		$swapStr = $str1;
		$str1 = $str2;
		$str2 = $swapStr;
		$swapLength = $length1;
		$length1 = $length2;
		$length2 = $swapLength;
	}

	// Turning a string into the empty string deletes every character.
	if ( $length2 === 0 ) {
		return $length1;
	}

	// Split the shorter string into characters once; doing the mb_substr
	// lookup inside the nested loop would repeat it for every matrix cell.
	$chars2 = array();
	for ( $j = 0; $j < $length2; $j++ ) {
		$chars2[] = mb_substr( $str2, $j, 1, 'UTF-8' );
	}

	// Only two rows of the distance matrix are needed at any time.
	// $prevRow starts as the distances from the empty prefix of $str1.
	$prevRow = range( 0, $length2 );
	for ( $i = 0; $i < $length1; $i++ ) {
		$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
		// First column: distance from a prefix of $str1 to the empty string.
		$currentRow = array( $i + 1 );
		for ( $j = 0; $j < $length2; $j++ ) {
			$insertions = $prevRow[$j + 1] + 1;
			$deletions = $currentRow[$j] + 1;
			$substitutions = $prevRow[$j] + ( ( $c1 !== $chars2[$j] ) ? 1 : 0 );
			$currentRow[] = min( $insertions, $deletions, $substitutions );
		}
		$prevRow = $currentRow;
	}

	return $prevRow[$length2];
}
} }

View File

@@ -54,6 +54,14 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
'fro' => 'الفرنسية القديمة', 'fro' => 'الفرنسية القديمة',
) )
), ),
array( "മലയളം", array(
'ml' => "മലയാളം",
)
),
array( "finish", array(
'fi' => 'finnish'
)
),
); );
} }
} }