Adding language presence by territory to langdb

A very simple mechanism for importing per-country language lists from CLDR into the ULS langdb. If I understand correctly, we only need the languages spoken in each country, ordered by number of speakers. The CLDR data already provides this and should be mostly usable as is. This change also adds a utility function and a test. Some tweaks to override the CLDR data are still needed: * The data as it stands omits some useful languages. For example, Amharic is not listed for Eritrea. * Some countries have a very large number of languages. In principle this is correct, but it is not practical at present; for example, India has 75. Hand-picking languages or limiting the choice to the top X languages may be useful, but this requires thought. * Some language codes are standard but differ from Wikipedia practice, for example "pa_Guru" (we just write "pa"). A mapping of codes may be needed. Change-Id: I3c0cd5a9118997ba39a4f3695978e359f3de6956
2012-08-23 23:08:52 +03:00
parent 0d73ffe09e
commit ca411138c7
4 changed files with 62 additions and 5 deletions
--- a/lib/jquery.uls/data/ulsdata2json.php
+++ b/lib/jquery.uls/data/ulsdata2json.php
@@ -20,9 +20,53 @@

 include __DIR__ . '/spyc.php';

-$data = file_get_contents( 'langdb.yaml' );
-$parsed = spyc_load( $data );
-$json = json_encode( $parsed );
+print( "Reading langdb.yaml...\n" );
+$yamlLangdb = file_get_contents( 'langdb.yaml' );
+if ( $yamlLangdb === false ) {
+	die( "Failed to read langdb.yaml.\n" );
+}
+$parsedLangdb = spyc_load( $yamlLangdb );
+
+// The CLDR supplemental data is downloaded to a local file and parsed from there.
+$supplementalDataFilename = 'supplementalData.xml';
+$supplementalDataUrl = "http://unicode.org/repos/cldr/trunk/common/supplemental/$supplementalDataFilename";
+
+$supplementalDataFile = fopen( $supplementalDataFilename, 'w' );
+if ( $supplementalDataFile === false ) {
+	die( "Cannot open $supplementalDataFilename for writing.\n" );
+}
+$curl = curl_init( $supplementalDataUrl );
+curl_setopt( $curl, CURLOPT_FILE, $supplementalDataFile );
+curl_setopt( $curl, CURLOPT_HEADER, 0 );
+// Treat HTTP errors (status >= 400) as failures instead of saving the error page.
+curl_setopt( $curl, CURLOPT_FAILONERROR, true );
+
+print( "Trying to download $supplementalDataUrl...\n" );
+$curlSuccess = curl_exec( $curl );
+curl_close( $curl );
+fclose( $supplementalDataFile );
+if ( !$curlSuccess ) {
+	die( "Failed to download CLDR data from $supplementalDataUrl.\n" );
+}
+
+print( "Downloaded $supplementalDataFilename, trying to parse...\n" );
+$supplementalData = simplexml_load_file( $supplementalDataFilename );
+if ( !( $supplementalData instanceof SimpleXMLElement ) ) {
+	die( "Attempt to load CLDR data from $supplementalDataFilename failed.\n" );
+}
+
+print( "CLDR supplemental data parsed successfully, reading territories info...\n" );
+$parsedLangdb['territories'] = array();
+foreach ( $supplementalData->territoryInfo->territory as $territoryRecord ) {
+	$territoryCode = (string)$territoryRecord['type'];
+	$parsedLangdb['territories'][$territoryCode] = array();
+	foreach ( $territoryRecord->languagePopulation as $languageRecord ) {
+		$parsedLangdb['territories'][$territoryCode][] = (string)$languageRecord['type'];
+	}
+}
+
+print( "Writing JSON langdb...\n" );
+$json = json_encode( $parsedLangdb );
 $js = <<<JAVASCRIPT
 // Please do not edit. This file is generated from data/langdb.yaml by ulsdata2json.php
 ( function ( $ ) {
@@ -32,3 +76,5 @@ $js = <<<JAVASCRIPT

 JAVASCRIPT;
 file_put_contents( '../src/jquery.uls.data.js', $js );
+
+print( "Done.\n" );
--- a/lib/jquery.uls/src/jquery.uls.data.js
+++ b/lib/jquery.uls/src/jquery.uls.data.js
--- a/lib/jquery.uls/src/jquery.uls.data.utils.js
+++ b/lib/jquery.uls/src/jquery.uls.data.utils.js
@@ -338,4 +338,13 @@
 	$.uls.data.isRtl = function( language ) {
 		return $.inArray( $.uls.data.script( language ), $.uls.data.rtlscripts ) !== -1;
 	};
+
+	/**
+	 * Returns the languages spoken in a territory.
+	 * @param string Territory code
+	 * @return list of language codes
+	 */
+	$.uls.data.languagesInTerritory = function( territory ) {
+		return $.uls.data.territories[territory];
+	};
 } )( jQuery );