From ca411138c762a82f7e2bdddec44ab349d7b47e8e Mon Sep 17 00:00:00 2001 From: "Amir E. Aharoni" Date: Thu, 23 Aug 2012 23:08:52 +0300 Subject: [PATCH] Adding language presence by territory to langdb A very simple mechanism for importing per-country language lists from CLDR to ULS' langdb. If I understand correctly, we only need languages spoken in a country ordered by number of speakers. The CLDR data already has this, and it should be mostly useful. Also added a utility function and a test. Some tweaks to override the CLDR data are still needed: * The data, as it is, omits some useful languages. For example, Amharic is not listed in Eritrea. * Some countries have a very large number of languages. Listing them all is correct in principle, but it is not practical currently; for example, India has 75. Maybe hand-picking, or limiting the choice to the top X languages, can be useful, but this requires thought. * Some language codes are standard, but different from Wikipedia practice, for example "pa_Guru" (we just write "pa"). Maybe a mapping of codes is needed. Change-Id: I3c0cd5a9118997ba39a4f3695978e359f3de6956 --- lib/jquery.uls/data/ulsdata2json.php | 52 +++++++++++++++++++-- lib/jquery.uls/src/jquery.uls.data.js | 2 +- lib/jquery.uls/src/jquery.uls.data.utils.js | 9 ++++ tests/qunit/ext.uls.tests.js | 4 +- 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/lib/jquery.uls/data/ulsdata2json.php b/lib/jquery.uls/data/ulsdata2json.php index 0b5aeabe..e77b89d5 100644 --- a/lib/jquery.uls/data/ulsdata2json.php +++ b/lib/jquery.uls/data/ulsdata2json.php @@ -20,9 +20,53 @@ include __DIR__ . 
'/spyc.php'; -$data = file_get_contents( 'langdb.yaml' ); -$parsed = spyc_load( $data ); -$json = json_encode( $parsed ); +print( "Reading langdb.yaml...\n" ); +$yamlLangdb = file_get_contents( 'langdb.yaml' ); +$parsedLangdb = spyc_load( $yamlLangdb ); + +$supplementalDataFilename = 'supplementalData.xml'; +$supplementalDataUrl = "http://unicode.org/repos/cldr/trunk/common/supplemental/$supplementalDataFilename"; + +$curl = curl_init( $supplementalDataUrl ); +$supplementalDataFile = fopen( $supplementalDataFilename, 'w' ); + +curl_setopt( $curl, CURLOPT_FILE, $supplementalDataFile ); +curl_setopt( $curl, CURLOPT_HEADER, 0 ); + +print( "Trying to download $supplementalDataUrl...\n" ); +$curlSuccess = curl_exec( $curl ); +curl_close( $curl ); +fclose( $supplementalDataFile ); + +if ( !$curlSuccess ) { + die( "Failed to download CLDR data from $supplementalDataUrl.\n" ); +} +print( "Downloaded $supplementalDataFilename, trying to parse...\n" ); + +$supplementalData = simplexml_load_file( $supplementalDataFilename ); + +if ( !( $supplementalData instanceof SimpleXMLElement ) ) { + die( "Attempt to load CLDR data from $supplementalDataFilename failed.\n" ); +} + +print( "CLDR supplemental data parsed successfully, reading territories info...\n" ); +$parsedLangdb['territories'] = array(); + +foreach ( $supplementalData->territoryInfo->territory as $territoryRecord ) { + $territoryAtributes = $territoryRecord->attributes(); + $territoryCodeAttr = $territoryAtributes['type']; + $territoryCode = "$territoryCodeAttr[0]"; + $parsedLangdb['territories'][$territoryCode] = array(); + + foreach ( $territoryRecord->languagePopulation as $languageRecord ) { + $languageAttributes = $languageRecord->attributes(); + $languageCodeAttr = $languageAttributes['type']; + $parsedLangdb['territories'][$territoryCode][] = "$languageCodeAttr[0]"; + } +} + +print( "Writing JSON langdb...\n" ); +$json = json_encode( $parsedLangdb ); $js = << -1, "Sakha language is spoken in Russia" ); } ); }() 
);