Cross-language language name search

Implementation of Also Written As language name
search algorithm.
See http://etherpad.wikimedia.org/l10n-uls-language-search

Change-Id: Iff84408c531b650a44d031b63d5c823737cceafc
This commit is contained in:
Santhosh Thottingal
2012-07-24 15:43:56 +05:30
parent fb11c720b5
commit 08c14dafa4
9 changed files with 319 additions and 19 deletions

View File

@@ -0,0 +1,59 @@
<?php
/**
* Script to create language names index.
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
// Standard boilerplate to define $IP
if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
$IP = getenv( 'MW_INSTALL_PATH' );
} else {
$dir = __DIR__;
$IP = "$dir/../../..";
}
require_once ( "$IP/maintenance/Maintenance.php" );
class LanguageNameIndexer extends Maintenance {
public function __construct() {
parent::__construct();
$this->addDescription( "Script to create language names index." );
}
public function execute() {
$languages = Language::fetchLanguageNames( null, 'all' );
$all = array();
$buckets = array();
foreach ( $languages as $code => $name ) {
$all[$code][strtolower( $name )] = true;
$langnames = LanguageNames::getNames( $code, 0, 2 );
foreach ( $langnames as $code => $name ) {
$all[$code][] = strtolower( $name );
}
}
foreach ( $all as $code => $names ) {
foreach ( $names as $index => $name ) {
$bucket = LanguageNameSearch::getIndex( $name );
$buckets[$bucket][$name] = $code;
}
}
$this->output( "Total buckets: " . count( $buckets ) . "\n" );
file_put_contents( 'langnames.ser', serialize( $buckets ) );
}
}
$maintClass = 'LanguageNameIndexer';
require_once( RUN_MAINTENANCE_IF_MAIN );

View File

@@ -0,0 +1,80 @@
<?php
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
class LanguageNameSearch {
static $languagenames;
public function init() {
self::$languagenames = unserialize( file_get_contents( __DIR__ . '/langnames.ser' ) );
}
public static function search( $searchKey ) {
$results = array();
if ( self::$languagenames === null ) {
self::init();
}
$bucket = self::$languagenames[self::getIndex( $searchKey )];
foreach ( $bucket as $name => $code ) {
// Prefix search
if ( strpos( $name, $searchKey, 0 ) === 0 ) {
$results[$code] = $name;
}
}
return $results;
}
public static function getIndex( $name ) {
$codepoint = self::getCodepoint( $name );
if ( $codepoint < 1000 ) {
$bucket = $codepoint;
} else {
$bucket = $codepoint % 1000;
}
if ( !isset( $buckets[$bucket] ) ) {
$buckets[$bucket] = array();
}
return $bucket;
}
/**
* Get the code point of first letter of string
*
* @return integer Code point of first letter of string
*/
static function getCodepoint( $str ) {
$unicode = array();
$values = array();
$lookingFor = 1;
for ( $i = 0; $i < strlen( $str ); $i++ ) {
$thisValue = ord( $str[$i] );
if ( $thisValue < 128 ) {
return $thisValue;
} else { // Codepoints larger than 127 are represented by multi-byte sequences,
if ( count( $values ) === 0 ) {
// 224 is the lowest non-overlong-encoded codepoint.
$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
}
$values[] = $thisValue;
if ( count( $values ) === $lookingFor ) {
// Refer http://en.wikipedia.org/wiki/UTF-8#Description
$number = ( $lookingFor === 3 ) ? ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) : ( ( $values[0] % 32 ) * 64 ) + ( $values[
1] % 64 );
return $number;
}
}
}
}
}