Adding language presence by territory to langdb
A very simple mechanism for importing per-country language lists from CLDR into ULS's langdb.

If I understand correctly, we only need the languages spoken in a country, ordered by number of speakers. The CLDR data already has this, and it should be mostly useful.

Also added a utility function and a test.

Some tweaks to override the CLDR data are still needed:

* The data as it is omits some useful languages. For example, Amharic is not listed in Eritrea.
* Some countries have a very large number of languages. Ideally that is correct, but it is not practical currently; for example, India has 75. Maybe hand-picking or limiting the choice to the top X languages can be useful, but this requires thought.
* Some language codes are standard, but different from Wikipedia practice, for example "pa_Guru" (we just write "pa"). Maybe a mapping of codes is needed.

Change-Id: I3c0cd5a9118997ba39a4f3695978e359f3de6956
This commit is contained in:
@@ -20,9 +20,53 @@
|
|||||||
|
|
||||||
include __DIR__ . '/spyc.php';
|
include __DIR__ . '/spyc.php';
|
||||||
|
|
||||||
$data = file_get_contents( 'langdb.yaml' );
|
print( "Reading langdb.yaml...\n" );
|
||||||
$parsed = spyc_load( $data );
|
$yamlLangdb = file_get_contents( 'langdb.yaml' );
|
||||||
$json = json_encode( $parsed );
|
$parsedLangdb = spyc_load( $yamlLangdb );
|
||||||
|
|
||||||
|
$supplementalDataFilename = 'supplementalData.xml';
|
||||||
|
$supplementalDataUrl = "http://unicode.org/repos/cldr/trunk/common/supplemental/$supplementalDataFilename";
|
||||||
|
|
||||||
|
$curl = curl_init( $supplementalDataUrl );
|
||||||
|
$supplementalDataFile = fopen( $supplementalDataFilename, 'w' );
|
||||||
|
|
||||||
|
curl_setopt( $curl, CURLOPT_FILE, $supplementalDataFile );
|
||||||
|
curl_setopt( $curl, CURLOPT_HEADER, 0 );
|
||||||
|
|
||||||
|
print( "Trying to download $supplementalDataUrl...\n" );
|
||||||
|
$curlSuccess = curl_exec( $curl );
|
||||||
|
curl_close( $curl );
|
||||||
|
fclose( $supplementalDataFile );
|
||||||
|
|
||||||
|
if ( !$curlSuccess ) {
|
||||||
|
die( "Failed to download CLDR data from $supplementalDataUrl.\n" );
|
||||||
|
}
|
||||||
|
print( "Downloaded $supplementalDataFilename, trying to parse...\n" );
|
||||||
|
|
||||||
|
$supplementalData = simplexml_load_file( $supplementalDataFilename );
|
||||||
|
|
||||||
|
if ( !( $supplementalData instanceof SimpleXMLElement ) ) {
|
||||||
|
die( "Attempt to load CLDR data from $supplementalDataFilename failed.\n" );
|
||||||
|
}
|
||||||
|
|
||||||
|
print( "CLDR supplemental data parsed successfully, reading territories info...\n" );
|
||||||
|
$parsedLangdb['territories'] = array();
|
||||||
|
|
||||||
|
foreach ( $supplementalData->territoryInfo->territory as $territoryRecord ) {
|
||||||
|
$territoryAtributes = $territoryRecord->attributes();
|
||||||
|
$territoryCodeAttr = $territoryAtributes['type'];
|
||||||
|
$territoryCode = "$territoryCodeAttr[0]";
|
||||||
|
$parsedLangdb['territories'][$territoryCode] = array();
|
||||||
|
|
||||||
|
foreach ( $territoryRecord->languagePopulation as $languageRecord ) {
|
||||||
|
$languageAttributes = $languageRecord->attributes();
|
||||||
|
$languageCodeAttr = $languageAttributes['type'];
|
||||||
|
$parsedLangdb['territories'][$territoryCode][] = "$languageCodeAttr[0]";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print( "Writing JSON langdb...\n" );
|
||||||
|
$json = json_encode( $parsedLangdb );
|
||||||
$js = <<<JAVASCRIPT
|
$js = <<<JAVASCRIPT
|
||||||
// Please do not edit. This file is generated from data/langdb.yaml by ulsdata2json.php
|
// Please do not edit. This file is generated from data/langdb.yaml by ulsdata2json.php
|
||||||
( function ( $ ) {
|
( function ( $ ) {
|
||||||
@@ -32,3 +76,5 @@ $js = <<<JAVASCRIPT
|
|||||||
|
|
||||||
JAVASCRIPT;
|
JAVASCRIPT;
|
||||||
file_put_contents( '../src/jquery.uls.data.js', $js );
|
file_put_contents( '../src/jquery.uls.data.js', $js );
|
||||||
|
|
||||||
|
print( "Done.\n" );
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -338,4 +338,13 @@
|
|||||||
$.uls.data.isRtl = function( language ) {
|
$.uls.data.isRtl = function( language ) {
|
||||||
return $.inArray( $.uls.data.script( language ), $.uls.data.rtlscripts ) !== -1;
|
return $.inArray( $.uls.data.script( language ), $.uls.data.rtlscripts ) !== -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the languages spoken in a territory.
|
||||||
|
* @param string Territory code
|
||||||
|
* @return list of language codes
|
||||||
|
*/
|
||||||
|
$.uls.data.languagesInTerritory = function( territory ) {
|
||||||
|
return $.uls.data.territories[territory];
|
||||||
|
};
|
||||||
} )( jQuery );
|
} )( jQuery );
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ test( "-- Initial check", function() {
|
|||||||
} );
|
} );
|
||||||
|
|
||||||
test( "-- $.uls.data testing", function() {
|
test( "-- $.uls.data testing", function() {
|
||||||
expect( 23 );
|
expect( 24 );
|
||||||
|
|
||||||
strictEqual( $.uls.data.autonyms()['he'], 'עברית', 'Correct autonym is returned for Hebrew using autonyms().' );
|
strictEqual( $.uls.data.autonyms()['he'], 'עברית', 'Correct autonym is returned for Hebrew using autonyms().' );
|
||||||
|
|
||||||
@@ -128,6 +128,8 @@ test( "-- $.uls.data testing", function() {
|
|||||||
|
|
||||||
strictEqual( $.uls.data.isRtl( "te" ), false, "Telugu language is not RTL" );
|
strictEqual( $.uls.data.isRtl( "te" ), false, "Telugu language is not RTL" );
|
||||||
strictEqual( $.uls.data.isRtl( "dv" ), true, "Divehi language is RTL" );
|
strictEqual( $.uls.data.isRtl( "dv" ), true, "Divehi language is RTL" );
|
||||||
|
|
||||||
|
ok( $.inArray( "sah", $.uls.data.languagesInTerritory( "RU" ) ) > -1, "Sakha language is spoken in Russia" );
|
||||||
} );
|
} );
|
||||||
|
|
||||||
}() );
|
}() );
|
||||||
|
|||||||
Reference in New Issue
Block a user