Cross-language language name search

Implementation of Also Written As language name
search algorithm.
See http://etherpad.wikimedia.org/l10n-uls-language-search

Change-Id: Iff84408c531b650a44d031b63d5c823737cceafc
This commit is contained in:
Santhosh Thottingal
2012-07-24 15:43:56 +05:30
parent fb11c720b5
commit 08c14dafa4
9 changed files with 319 additions and 19 deletions

View File

@@ -22,6 +22,10 @@ if ( !defined( 'MEDIAWIKI' ) ) {
echo( "This file is an extension to the MediaWiki software and cannot be used standalone.\n" );
die( -1 );
}
/**
* Version number used in extension credits and in other places where needed.
*/
define( 'ULS_VERSION', '2012-07-20' );
$wgExtensionCredits['other'][] = array(
'path' => __FILE__,
@@ -48,11 +52,14 @@ $wgExtensionMessagesFiles['UniversalLanguageSelector'] = "$dir/UniversalLanguage
// Register auto load for the page class
$wgAutoloadClasses['UniversalLanguageSelectorHooks'] = "$dir/UniversalLanguageSelector.hooks.php";
$wgAutoloadClasses['ApiLanguageSearch'] = "$dir/api/ApiLanguageSearch.php";
$wgAutoloadClasses['LanguageNameSearch'] = "$dir/data/LanguageNameSearch.php";
$wgHooks['BeforePageDisplay'][] = 'UniversalLanguageSelectorHooks::addModules';
$wgHooks['PersonalUrls'][] = 'UniversalLanguageSelectorHooks::addTrigger';
$wgHooks['SkinAfterContent'][] = 'UniversalLanguageSelectorHooks::addTemplate';
$wgHooks['ResourceLoaderTestModules'][] = 'UniversalLanguageSelectorHooks::addTestModules';
$wgAPIModules['languagesearch'] = 'ApiLanguageSearch';
$wgResourceModules['ext.uls.init'] = array(
'scripts' => 'resources/ext.uls.init.js',

66
api/ApiLanguageSearch.php Normal file
View File

@@ -0,0 +1,66 @@
<?php
/**
* Language name search API
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
/**
 * API module that looks up language names through LanguageNameSearch.
 *
 * @ingroup API
 */
class ApiLanguageSearch extends ApiBase {
	/**
	 * Always answer with the printer registered under the name 'json'.
	 *
	 * @return mixed Whatever the main module's createPrinterByName() returns
	 */
	public function getCustomPrinter() {
		$main = $this->getMain();

		return $main->createPrinterByName( 'json' );
	}

	/**
	 * Run the search for the 'search' request parameter and store the
	 * matches in the result under this module's name.
	 */
	public function execute() {
		$request = $this->extractRequestParams();
		$matches = LanguageNameSearch::search( $request['search'] );

		$this->getResult()->addValue( null, $this->getModuleName(), $matches );
	}

	/**
	 * @return array Parameter definitions; 'search' is mandatory
	 */
	public function getAllowedParams() {
		$searchParam = array(
			ApiBase::PARAM_REQUIRED => true
		);

		return array(
			'search' => $searchParam,
		);
	}

	/**
	 * @return array Human readable description per parameter
	 */
	public function getParamDescription() {
		$descriptions = array();
		$descriptions['search'] = 'Search string';

		return $descriptions;
	}

	/**
	 * @return string One-line description of the module
	 */
	public function getDescription() {
		return 'Search for language names in any script';
	}

	/**
	 * @return array Example request URLs
	 */
	public function getExamples() {
		$examples = array();
		$examples[] = 'api.php?action=languagesearch&search=Te';
		$examples[] = 'api.php?action=languagesearch&search=ഫി';

		return $examples;
	}

	/**
	 * @return string Class name and ULS_VERSION, separated by ': '
	 */
	public function getVersion() {
		$version = ULS_VERSION;

		return __CLASS__ . ': ' . $version;
	}
}

View File

@@ -0,0 +1,59 @@
<?php
/**
* Script to create language names index.
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
// Resolve $IP: take MW_INSTALL_PATH from the environment when it is set,
// otherwise fall back to three directories above this file.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$dir = __DIR__;
	$IP = $dir . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";
/**
 * Maintenance script that builds the serialized language name index.
 *
 * Every known language name is lower-cased, placed into the bucket computed
 * by LanguageNameSearch::getIndex() and mapped to its language code. The
 * resulting array is serialized to langnames.ser next to this script.
 */
class LanguageNameIndexer extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->addDescription( "Script to create language names index." );
	}

	/**
	 * Collect the names, bucket them and write the index file.
	 */
	public function execute() {
		$languages = Language::fetchLanguageNames( null, 'all' );
		// language code => list of lower-cased names for that language
		$all = array();
		// bucket index => array( lower-cased name => language code )
		$buckets = array();

		foreach ( $languages as $code => $name ) {
			// Store the name as a VALUE. It used to be stored as a key with the
			// value `true`, so the bucketing loop below indexed `true` instead
			// of the name and the entry was lost.
			$all[$code][] = strtolower( $name );

			// NOTE(review): presumably the names of all languages written in
			// language $code, keyed by the code of the language being named -
			// confirm against LanguageNames::getNames().
			$langnames = LanguageNames::getNames( $code, 0, 2 );
			// Distinct variable names: the inner loop must not clobber the
			// outer $code / $name.
			foreach ( $langnames as $namedCode => $translatedName ) {
				$all[$namedCode][] = strtolower( $translatedName );
			}
		}

		foreach ( $all as $code => $names ) {
			foreach ( $names as $name ) {
				if ( $name === '' ) {
					// An empty name has no first character to bucket on.
					continue;
				}
				$bucket = LanguageNameSearch::getIndex( $name );
				// The same name in the same bucket keeps the last code seen.
				$buckets[$bucket][$name] = $code;
			}
		}

		$this->output( "Total buckets: " . count( $buckets ) . "\n" );

		// Write next to this script rather than into the current working
		// directory: LanguageNameSearch::init() loads __DIR__ . '/langnames.ser'.
		// NOTE(review): assumes this script lives in the same directory as
		// LanguageNameSearch.php - confirm.
		$target = __DIR__ . '/langnames.ser';
		if ( file_put_contents( $target, serialize( $buckets ) ) === false ) {
			$this->error( "Could not write $target\n" );
		}
	}
}
// Name the script class, then include the file named by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): the constant is presumably defined by Maintenance.php - confirm.
$maintClass = 'LanguageNameIndexer';
require_once RUN_MAINTENANCE_IF_MAIN;

View File

@@ -0,0 +1,80 @@
<?php
/**
* Cross-Language Language name search
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
/**
 * Prefix search over a pre-built index of language names.
 *
 * The index is an array of buckets. A bucket is selected from the first
 * character of a name (see getIndex()) and maps lower-cased language names
 * to language codes.
 */
class LanguageNameSearch {
	/**
	 * @var array|null Bucket index => array( name => language code ).
	 *  Null until init() has been called.
	 */
	public static $languagenames;

	/**
	 * Load the serialized index from langnames.ser in this directory.
	 *
	 * Declared static because search() calls it as self::init(); calling a
	 * non-static method statically is a fatal error on PHP 8. If the file is
	 * missing or does not contain an array, the index becomes empty so that
	 * searches simply return no results.
	 */
	public static function init() {
		// The file is a local data file that sits next to this class.
		$raw = file_get_contents( __DIR__ . '/langnames.ser' );
		$data = ( $raw === false ) ? false : unserialize( $raw );
		self::$languagenames = is_array( $data ) ? $data : array();
	}

	/**
	 * Find all language names that start with the given string.
	 *
	 * @param string $searchKey Beginning of a language name
	 * @return array Language code => matching name; empty when nothing matches
	 */
	public static function search( $searchKey ) {
		$results = array();

		// The indexer lower-cases names with strtolower(), so the key has to
		// be normalised the same way or keys such as "Te" can never match.
		$searchKey = strtolower( (string)$searchKey );
		if ( $searchKey === '' ) {
			return $results;
		}

		if ( self::$languagenames === null ) {
			self::init();
		}

		$index = self::getIndex( $searchKey );
		if ( $index === null
			|| !isset( self::$languagenames[$index] )
			|| !is_array( self::$languagenames[$index] )
		) {
			// No bucket for this first character: nothing can match.
			return $results;
		}

		foreach ( self::$languagenames[$index] as $name => $code ) {
			// Prefix search. Numeric-looking names come back from the array
			// as integers, hence the cast.
			if ( strpos( (string)$name, $searchKey ) === 0 ) {
				$results[$code] = $name;
			}
		}

		return $results;
	}

	/**
	 * Compute the bucket a name belongs to.
	 *
	 * @param string $name
	 * @return integer|null Value of getCodepoint() modulo 1000, or null when
	 *  getCodepoint() could not produce a value
	 */
	public static function getIndex( $name ) {
		$codepoint = self::getCodepoint( $name );
		if ( $codepoint === null ) {
			return null;
		}

		// Values below 1000 are unchanged by the modulo.
		return $codepoint % 1000;
	}

	/**
	 * Get the code point of first letter of string
	 *
	 * Only one, two and three byte UTF-8 sequences are decoded. A lead byte
	 * of 224 or above is always read as a three byte sequence, so the value
	 * returned for four byte sequences is not the real code point. This is
	 * kept as is because the value is only used to pick a bucket and already
	 * generated index files rely on it.
	 *
	 * @param string $str
	 * @return integer|null Code point of first letter of string, null for an
	 *  empty string or a sequence that ends too early
	 */
	public static function getCodepoint( $str ) {
		$str = (string)$str;
		$length = strlen( $str );
		$values = array();
		$lookingFor = 1;

		for ( $i = 0; $i < $length; $i++ ) {
			$thisValue = ord( $str[$i] );
			if ( $thisValue < 128 ) {
				// Single byte (ASCII) character.
				return $thisValue;
			}

			// Bytes above 127 belong to a multi-byte sequence.
			if ( count( $values ) === 0 ) {
				// 224 (0xE0) is the smallest lead byte of a three byte sequence.
				$lookingFor = ( $thisValue < 224 ) ? 2 : 3;
			}
			$values[] = $thisValue;

			if ( count( $values ) === $lookingFor ) {
				// Refer http://en.wikipedia.org/wiki/UTF-8#Description
				if ( $lookingFor === 3 ) {
					// 4 payload bits from the lead byte, 6 from each other byte.
					return ( ( $values[0] & 0x0F ) << 12 )
						+ ( ( $values[1] & 0x3F ) << 6 )
						+ ( $values[2] & 0x3F );
				}

				// 5 payload bits from the lead byte, 6 from the second byte.
				return ( ( $values[0] & 0x1F ) << 6 ) + ( $values[1] & 0x3F );
			}
		}

		return null;
	}
}

View File

@@ -96,7 +96,8 @@
that.$languageFilter.languagefilter( {
$target: $lcd, //$( 'ul.uls-language-filter-result' ),
languages: that.languages
languages: that.languages,
searchAPI: that.options.searchAPI
} );
// Create region selectors, one per region
@@ -174,7 +175,8 @@
$.fn.uls.defaults = {
menu: '.uls-menu',
onSelect: null // Callback function to be called when a language is selected
onSelect: null, // Callback function to be called when a language is selected
searchAPI: null // Language search API
};
$.fn.uls.Constructor = ULS;

View File

@@ -26,7 +26,8 @@
setlang : language
} );
window.location.href = uri.toString();
}
},
searchAPI: mw.util.wikiScript( 'api' ) + "?action=languagesearch"
} );
} );
} )( jQuery );

View File

@@ -59,14 +59,27 @@
}
}
}
// Also do a search to search API
if( this.options.searchAPI && query){
this.searchAPI( query );
}
},
render: function( langCode ) {
searchAPI: function( query ) {
var that = this;
$.get( that.options.searchAPI, { search: query }, function( result ) {
$.each( result['languagesearch'], function( code, name ) {
that.render( code, name );
} );
} );
},
render: function( langCode, languageName ) {
var $target = this.options.$target;
if ( !$target ) {
return;
}
$target.append( langCode );
$target.append( langCode, null, languageName );
},
escapeRegex: function( value ) {
@@ -110,7 +123,8 @@
$.fn.languagefilter.defaults = {
$target: null, // Where to append the results
languages: null, // Languages as code:name format. Default values come from data.languages.
clickhandler: null
clickhandler: null,
searchAPI: null
};
$.fn.languagefilter.Constructor = LanguageFilter;

View File

@@ -31,27 +31,39 @@
LanguageCategoryDisplay.prototype = {
constructor: LanguageCategoryDisplay,
append: function( langCode, regionCode ) {
append: function( langCode, regionCode, languageName ) {
var that = this;
this.addToRegion( langCode, regionCode );
this.addToRegion( langCode, regionCode, languageName );
},
/**
* Check whether a language code is already displayed or not.
* @param langCode
* @return boolean
*/
exists: function( langCode ) {
return this.$element.find( 'li' ).filter(function() {
return $(this).data('code') === langCode;
} ).length > 0;
},
/**
* Add the language to a region.
* If the region parameter is given , add to that region alone
* Otherwise to all regions that this language belongs.
* @param langCode
* @param region
* @param region Optional region
* @param languageName Optional languageName
*/
addToRegion: function( langCode, region ) {
var that = this,
language = that.options.languages[langCode];
var langName = $.uls.data.autonym( langCode )
|| that.options.languages[langCode]
|| langCode;
var regions = [];
addToRegion: function( langCode, region, languageName) {
var that = this;
if ( that.exists( langCode ) ) {
return;
}
var language = $.uls.data.languages[langCode],
langName = languageName
|| $.uls.data.autonym( langCode )
|| that.options.languages[langCode]
|| langCode,
regions = [];
if ( region ) {
regions.push( region );
} else {

View File

@@ -0,0 +1,59 @@
<?php
/**
* PHPUnit tests for UniversalLanguageSelector extension.
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
* have to do anything special to choose one license or the other and you don't
* have to notify anyone which license you are using. You are free to use
* UniversalLanguageSelector in commercial projects as long as the copyright
* header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
*
* @file
* @ingroup Extensions
* @licence GNU General Public Licence 2.0 or later
* @licence MIT License
*/
require_once( __DIR__ . '/../../data/LanguageNameSearch.php' );
/**
 * Checks LanguageNameSearch::search() against a fixed set of search keys.
 */
class LanguageSearchTest extends PHPUnit_Framework_TestCase {
	/**
	 * The search result must equal the expected code => name map.
	 *
	 * @dataProvider searchDataProvider
	 */
	public function testSearch( $searchKey, $result ) {
		$actual = LanguageNameSearch::search( $searchKey );
		$this->assertEquals( $result, $actual );
	}

	/**
	 * @return array List of array( search key, expected code => name map )
	 */
	public function searchDataProvider() {
		$cases = array();

		$cases[] = array( "ഹിന്ദി", array(
			'hi' => 'ഹിന്ദി'
		) );

		$cases[] = array( "മല", array(
			'ml' => "മലയാളം",
			'mg' => 'മലഗാസി',
			'ms' => 'മലയ',
		) );

		$cases[] = array( "Φινλαν", array(
			'fi' => 'Φινλανδικά',
		) );

		// A key that matches no language name at all.
		$cases[] = array( "blah", array() );

		$cases[] = array( "الفرنسية", array(
			'fr' => 'الفرنسية',
			'fr-ca' => 'الفرنسية الكندية',
			'fr-ch' => 'الفرنسية السويسرية',
			'frm' => 'الفرنسية الوسطى',
			'fro' => 'الفرنسية القديمة',
		) );

		return $cases;
	}
}