Allow typo in search key
* Introduce the Levenshtein distance algorithm
* New API param 'typos' to give the number of typos allowed
* Add test cases

Change-Id: I22bf34d08a910d1509d7eab5adc292eadc9a7c7d
This commit is contained in:
committed by
Gerrit Code Review
parent
878313d2ec
commit
76f9038aff
@@ -30,7 +30,8 @@ class ApiLanguageSearch extends ApiBase {
|
|||||||
public function execute() {
|
public function execute() {
|
||||||
$params = $this->extractRequestParams();
|
$params = $this->extractRequestParams();
|
||||||
$search = $params['search'];
|
$search = $params['search'];
|
||||||
$searches = LanguageNameSearch::search( $search );
|
$typos = $params['typos'];
|
||||||
|
$searches = LanguageNameSearch::search( $search, $typos );
|
||||||
$result = $this->getResult();
|
$result = $this->getResult();
|
||||||
$result->addValue( null, $this->getModuleName(), $searches );
|
$result->addValue( null, $this->getModuleName(), $searches );
|
||||||
}
|
}
|
||||||
@@ -40,12 +41,18 @@ class ApiLanguageSearch extends ApiBase {
|
|||||||
'search' => array(
|
'search' => array(
|
||||||
ApiBase::PARAM_REQUIRED => true
|
ApiBase::PARAM_REQUIRED => true
|
||||||
),
|
),
|
||||||
|
'typos' => array(
|
||||||
|
ApiBase::PARAM_REQUIRED => false,
|
||||||
|
ApiBase::PARAM_TYPE => 'integer',
|
||||||
|
ApiBase::PARAM_DFLT => 1
|
||||||
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getParamDescription() {
|
public function getParamDescription() {
|
||||||
return array(
|
return array(
|
||||||
'search' => 'Search string',
|
'search' => 'Search string',
|
||||||
|
'typos' => 'Number of spelling mistakes allowed in the search string',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,6 +64,7 @@ class ApiLanguageSearch extends ApiBase {
|
|||||||
return array(
|
return array(
|
||||||
'api.php?action=languagesearch&search=Te',
|
'api.php?action=languagesearch&search=Te',
|
||||||
'api.php?action=languagesearch&search=ഫി',
|
'api.php?action=languagesearch&search=ഫി',
|
||||||
|
'api.php?action=languagesearch&search=ഫി&typos=1',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
public function getVersion() {
|
public function getVersion() {
|
||||||
|
|||||||
@@ -22,11 +22,11 @@ class LanguageNameSearch {
|
|||||||
public static function init() {
|
public static function init() {
|
||||||
self::$languagenames = unserialize( file_get_contents( __DIR__ . '/langnames.ser' ) );
|
self::$languagenames = unserialize( file_get_contents( __DIR__ . '/langnames.ser' ) );
|
||||||
}
|
}
|
||||||
|
public static function search( $searchKey, $typos = 0 ) {
|
||||||
public static function search( $searchKey ) {
|
|
||||||
if ( self::$languagenames === null ) {
|
if ( self::$languagenames === null ) {
|
||||||
self::init();
|
self::init();
|
||||||
}
|
}
|
||||||
|
$searchKey = strtolower( $searchKey );
|
||||||
$bucket = self::$languagenames[self::getIndex( $searchKey )];
|
$bucket = self::$languagenames[self::getIndex( $searchKey )];
|
||||||
if ( !$bucket ) {
|
if ( !$bucket ) {
|
||||||
return array();
|
return array();
|
||||||
@@ -36,10 +36,15 @@ class LanguageNameSearch {
|
|||||||
// Prefix search
|
// Prefix search
|
||||||
if ( strpos( $name, $searchKey, 0 ) === 0 ) {
|
if ( strpos( $name, $searchKey, 0 ) === 0 ) {
|
||||||
$results[$code] = $name;
|
$results[$code] = $name;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) === $typos ) {
|
||||||
|
$results[$code] = $name;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return $results;
|
return $results;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function getIndex( $name ) {
|
public static function getIndex( $name ) {
|
||||||
$codepoint = self::getCodepoint( $name );
|
$codepoint = self::getCodepoint( $name );
|
||||||
if ( $codepoint < 1000 ) {
|
if ( $codepoint < 1000 ) {
|
||||||
@@ -52,6 +57,7 @@ class LanguageNameSearch {
|
|||||||
}
|
}
|
||||||
return $bucket;
|
return $bucket;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the code point of first letter of string
|
* Get the code point of first letter of string
|
||||||
*
|
*
|
||||||
@@ -65,7 +71,7 @@ class LanguageNameSearch {
|
|||||||
$thisValue = ord( $str[$i] );
|
$thisValue = ord( $str[$i] );
|
||||||
if ( $thisValue < 128 ) {
|
if ( $thisValue < 128 ) {
|
||||||
return $thisValue;
|
return $thisValue;
|
||||||
} else { // Codepoints larger than 127 are represented by multi-byte sequences,
|
} else {// Codepoints larger than 127 are represented by multi-byte sequences,
|
||||||
if ( count( $values ) === 0 ) {
|
if ( count( $values ) === 0 ) {
|
||||||
// 224 is the lowest non-overlong-encoded codepoint.
|
// 224 is the lowest non-overlong-encoded codepoint.
|
||||||
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
|
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
|
||||||
@@ -73,11 +79,46 @@ class LanguageNameSearch {
|
|||||||
$values[] = $thisValue;
|
$values[] = $thisValue;
|
||||||
if ( count( $values ) === $lookingFor ) {
|
if ( count( $values ) === $lookingFor ) {
|
||||||
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
|
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
|
||||||
$number = ( $lookingFor === 3 ) ? ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) : ( ( $values[0] % 32 ) * 64 ) + ( $values[
|
$number = ( $lookingFor === 3 ) ? ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) : ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
|
||||||
1] % 64 );
|
|
||||||
return $number;
|
return $number;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/**
 * Calculate the Levenshtein distance between two strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to turn one string into the other.
 * Lengths and characters are obtained with the mb_* functions using UTF-8,
 * so a multi-byte character counts as a single unit.
 *
 * @param string $str1
 * @param string $str2
 * @return integer
 */
public static function levenshteinDistance( $str1, $str2 ) {
	// Identical strings need no edits; skip all further work.
	if ( $str1 === $str2 ) {
		return 0;
	}

	$length1 = mb_strlen( $str1, 'UTF-8' );
	$length2 = mb_strlen( $str2, 'UTF-8' );

	// Keep the shorter string in $str2 so the rows below stay as short as
	// possible. The distance is symmetric, so swapping does not change it.
	if ( $length1 < $length2 ) {
		list( $str1, $str2 ) = array( $str2, $str1 );
		list( $length1, $length2 ) = array( $length2, $length1 );
	}

	// Turning a string into the empty string takes one deletion per character.
	if ( $length2 === 0 ) {
		return $length1;
	}

	// Extract the characters of the shorter string once instead of calling
	// mb_substr() again for every cell of the matrix.
	$chars2 = array();
	for ( $j = 0; $j < $length2; $j++ ) {
		$chars2[] = mb_substr( $str2, $j, 1, 'UTF-8' );
	}

	// Only two rows of the edit matrix are needed at any time.
	// $prevRow[$j] is the distance between the first $i characters of $str1
	// and the first $j characters of $str2.
	$prevRow = range( 0, $length2 );
	for ( $i = 0; $i < $length1; $i++ ) {
		// Against an empty prefix of $str2, $i + 1 deletions are required.
		$currentRow = array( $i + 1 );
		$c1 = mb_substr( $str1, $i, 1, 'UTF-8' );
		for ( $j = 0; $j < $length2; $j++ ) {
			$insertions = $prevRow[$j + 1] + 1;
			$deletions = $currentRow[$j] + 1;
			$substitutions = $prevRow[$j] + ( ( $c1 !== $chars2[$j] ) ? 1 : 0 );
			$currentRow[] = min( $insertions, $deletions, $substitutions );
		}
		$prevRow = $currentRow;
	}

	return $prevRow[$length2];
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -54,6 +54,14 @@ class LanguageSearchTest extends PHPUnit_Framework_TestCase {
|
|||||||
'fro' => 'الفرنسية القديمة',
|
'fro' => 'الفرنسية القديمة',
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
|
array( "മലയളം", array(
|
||||||
|
'ml' => "മലയാളം",
|
||||||
|
)
|
||||||
|
),
|
||||||
|
array( "finish", array(
|
||||||
|
'fi' => 'finnish'
|
||||||
|
)
|
||||||
|
),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user