Add a PHP interface to work with the language data

Additional changes in this PR include,

  * Added composer.json
  * Refactored the folder structure.
  * Added editorconfig.
  * Added PHPCS and formatted existing code.
  * Changes to use a single license - GPL-2.0-or-later
  * ESLint related fixes

Bug: T218639
This commit is contained in:
Abijeet
2020-01-22 00:53:38 +05:30
parent 6ca93c0966
commit 09ab6024fe
16 changed files with 2585 additions and 151 deletions

371
src/LanguageData.php Normal file
View File

@@ -0,0 +1,371 @@
<?php
/**
* Contains a utility class to query the language data.
*
* @file
* @license GPL-2.0-or-later
*/
namespace Wikimedia;
/**
 * Utility class to query the language data.
 *
 * The data is decoded from the bundled JSON file. Every entry under
 * "languages" is a list: a single element means the code is a redirect to
 * the language code stored in that element, otherwise the list holds the
 * script, the regions and, optionally, the autonym of the language.
 */
class LanguageData {
	/** Script group reported for scripts that are not part of any group. */
	public const OTHER_SCRIPT_GROUP = 'Other';

	/** Path of the JSON language database, relative to this file's directory. */
	private const LANGUAGE_DATA_PATH = '../data/language-data.json';

	/** @var LanguageData|null Shared instance, created on first use by get() */
	private static $instance;

	/** @var object|null Decoded contents of the language database */
	private $data;

	/**
	 * Returns the shared instance of the class, loading the data on first use.
	 * @return LanguageData
	 * @throws \RuntimeException If the language data cannot be read or decoded
	 */
	public static function get(): LanguageData {
		if ( self::$instance === null ) {
			$instance = new self();
			$instance->loadData();
			// Publish the instance only once it holds data, so a failed load
			// is retried on the next call instead of leaving a broken object.
			self::$instance = $instance;
		}

		return self::$instance;
	}

	/**
	 * Reads and decodes the JSON language database.
	 * @throws \RuntimeException If the file cannot be read or is not valid language data
	 */
	private function loadData(): void {
		$path = __DIR__ . '/' . self::LANGUAGE_DATA_PATH;

		$json = is_readable( $path ) ? file_get_contents( $path ) : false;
		if ( $json === false ) {
			throw new \RuntimeException( "Unable to read language data from $path" );
		}

		$data = json_decode( $json );
		if ( !is_object( $data ) || !isset( $data->languages ) ) {
			throw new \RuntimeException(
				"Invalid language data in $path: " . json_last_error_msg()
			);
		}

		$this->data = $data;
	}

	/**
	 * Checks if a language code is present in the language data
	 * @param string $languageCode
	 * @return bool
	 */
	public function isKnown( string $languageCode ): bool {
		return isset( $this->data->languages->$languageCode );
	}

	/**
	 * Is this language a redirect to another language?
	 * @param string $languageCode Language code
	 * @return string|bool Target language code if it's a redirect or false if it's not
	 */
	public function isRedirect( string $languageCode ) {
		if ( !$this->isKnown( $languageCode ) ) {
			return false;
		}

		$language = $this->getLanguage( $languageCode );

		// A single-element entry only carries the redirect target.
		return count( $language ) === 1 ? $language[0] : false;
	}

	/**
	 * Get all the languages
	 * @return object Language entries keyed by language code
	 */
	public function getLanguages() {
		return $this->data->languages;
	}

	/**
	 * Returns the script of the language or false. Redirects are followed.
	 * @param string $languageCode
	 * @return string|bool Language script if its a known language, else false.
	 */
	public function getScript( string $languageCode ) {
		$resolvedCode = $this->resolveRedirects( $languageCode );
		if ( $resolvedCode === false ) {
			return false;
		}

		return $this->getLanguage( $resolvedCode )[0] ?? false;
	}

	/**
	 * Returns the regions in which a language is spoken. Redirects are followed.
	 * @param string $languageCode
	 * @return string[]|bool Array of regions or false if language is unknown.
	 */
	public function getRegions( string $languageCode ) {
		$resolvedCode = $this->resolveRedirects( $languageCode );
		if ( $resolvedCode === false ) {
			return false;
		}

		return $this->getLanguage( $resolvedCode )[1] ?? [];
	}

	/**
	 * Returns the autonym of the language. Redirects are followed. When the
	 * language has no autonym, its (resolved) language code is returned.
	 * @param string $languageCode Language code
	 * @return string|bool Autonym, or false if the language is unknown
	 */
	public function getAutonym( $languageCode ) {
		$resolvedCode = $this->resolveRedirects( (string)$languageCode );
		if ( $resolvedCode === false ) {
			return false;
		}

		// The autonym is the optional third element; entries that only have
		// a script and regions fall back to the language code.
		return $this->getLanguage( $resolvedCode )[2] ?? $resolvedCode;
	}

	/**
	 * Returns all language codes and corresponding autonyms. Redirects are skipped.
	 * @return array Autonyms keyed by language code
	 */
	public function getAutonyms(): array {
		$languageAutonyms = [];
		foreach ( $this->getNonRedirectLanguageCodes() as $languageCode ) {
			$languageAutonyms[$languageCode] = $this->getAutonym( $languageCode );
		}

		return $languageAutonyms;
	}

	/**
	 * Returns all languages written in the given scripts. Redirects are skipped.
	 * @param string[] $scripts
	 * @return string[]
	 */
	public function getLanguagesInScripts( array $scripts ): array {
		$languagesInScripts = [];
		foreach ( $this->getNonRedirectLanguageCodes() as $languageCode ) {
			if ( in_array( $this->getScript( $languageCode ), $scripts, true ) ) {
				$languagesInScripts[] = $languageCode;
			}
		}

		return $languagesInScripts;
	}

	/**
	 * Returns all languages written in the given script.
	 * @param string $script
	 * @return string[]
	 */
	public function getLanguagesInScript( string $script ): array {
		return $this->getLanguagesInScripts( [ $script ] );
	}

	/**
	 * Returns the script group of a script or 'Other' if it doesn't
	 * belong to any group.
	 * @param string $script Script code
	 * @return string script group name
	 */
	public function getGroupOfScript( string $script ): string {
		foreach ( $this->data->scriptgroups as $scriptGroup => $scriptGroupData ) {
			if ( in_array( $script, $scriptGroupData, true ) ) {
				return $scriptGroup;
			}
		}

		return self::OTHER_SCRIPT_GROUP;
	}

	/**
	 * Returns the script group of a language, or 'Other' when the script of
	 * the language cannot be determined.
	 * @param string $languageCode Language code
	 * @return string script group name
	 */
	public function getScriptGroupOfLanguage( string $languageCode ): string {
		$script = $this->getScript( $languageCode );

		// getScript() reports unknown languages with false, which must not be
		// handed to the string-typed getGroupOfScript().
		return is_string( $script )
			? $this->getGroupOfScript( $script )
			: self::OTHER_SCRIPT_GROUP;
	}

	/**
	 * Return the list of languages passed, grouped by script group.
	 * Unknown language codes are left out; redirects are grouped under the
	 * script group of their target but listed under the code that was passed.
	 * @param string[] $languageCodes Array of language codes to group
	 * @return array Array of language codes grouped by script group
	 */
	public function getLanguagesByScriptGroup( array $languageCodes ): array {
		$languagesByScriptGroup = [];
		foreach ( $languageCodes as $languageCode ) {
			if ( !$this->isKnown( $languageCode ) ) {
				continue;
			}

			$langScriptGroup = $this->getScriptGroupOfLanguage( $languageCode );
			$languagesByScriptGroup[$langScriptGroup][] = $languageCode;
		}

		return $languagesByScriptGroup;
	}

	/**
	 * Returns an associative array of languages in several regions,
	 * grouped by script group. Each language is listed once, even when it
	 * is spoken in more than one of the given regions.
	 * @param string[] $regions array of region codes
	 * @return array
	 */
	public function getLanguagesByScriptGroupInRegions( array $regions ): array {
		$languagesByScriptGroupInRegions = [];
		foreach ( $this->getNonRedirectLanguageCodes() as $languageCode ) {
			$languageRegions = $this->getRegions( $languageCode );
			if ( !is_array( $languageRegions ) ) {
				continue;
			}

			foreach ( $regions as $region ) {
				if ( in_array( $region, $languageRegions, true ) ) {
					$langScriptGroup = $this->getScriptGroupOfLanguage( $languageCode );
					$languagesByScriptGroupInRegions[$langScriptGroup][] = $languageCode;
					// One matching region is enough; continuing would add the
					// language again for every further matching region.
					break;
				}
			}
		}

		return $languagesByScriptGroupInRegions;
	}

	/**
	 * Returns an associative array of languages in a region, grouped by their script group.
	 * @param string $region Region code
	 * @return array
	 */
	public function getLanguagesByScriptGroupInRegion( $region ): array {
		return $this->getLanguagesByScriptGroupInRegions( [ $region ] );
	}

	/**
	 * Return the list of languages sorted by script groups.
	 * Groups are ordered by name, case-insensitively; inside a group the
	 * languages keep the order in which they were passed.
	 * @param string[] $languageCodes Array of language codes to sort
	 * @return string[] Array of language codes
	 */
	public function sortByScriptGroup( array $languageCodes ) {
		$groupedLanguageData = $this->getLanguagesByScriptGroup( $languageCodes );
		ksort( $groupedLanguageData, SORT_STRING | SORT_FLAG_CASE );

		$sortedLanguageData = [];
		foreach ( $groupedLanguageData as $languageData ) {
			$sortedLanguageData = array_merge( $sortedLanguageData, $languageData );
		}

		return $sortedLanguageData;
	}

	/**
	 * Sort languages by their autonym, case-insensitively.
	 * Unknown language codes are dropped from the result.
	 * @param string[] $languageCodes
	 * @return string[]
	 */
	public function sortByAutonym( array $languageCodes ): array {
		$sortedLanguages = [];
		foreach ( $languageCodes as $languageCode ) {
			$autonym = $this->getAutonym( $languageCode );
			if ( $autonym !== false ) {
				$sortedLanguages[$languageCode] = $autonym;
			}
		}
		asort( $sortedLanguages, SORT_STRING | SORT_FLAG_CASE );

		return array_keys( $sortedLanguages );
	}

	/**
	 * Check if a language is right-to-left.
	 * @param string $languageCode Language code
	 * @return bool
	 */
	public function isRtl( string $languageCode ): bool {
		return in_array( $this->getScript( $languageCode ), $this->data->rtlscripts, true );
	}

	/**
	 * Return the direction of the language. Returns false if the language is unknown.
	 * @param string $languageCode Language code
	 * @return string|bool 'rtl', 'ltr' or false
	 */
	public function getDir( string $languageCode ) {
		if ( !$this->isKnown( $languageCode ) ) {
			return false;
		}

		return $this->isRtl( $languageCode ) ? 'rtl' : 'ltr';
	}

	/**
	 * Returns the languages spoken in a territory.
	 * @param string $territory Territory code
	 * @return string[]|bool list of language codes, or false if the territory has no entry
	 */
	public function getLanguagesInTerritory( string $territory ) {
		return $this->data->territories->$territory ?? false;
	}

	/**
	 * Adds a language in run time and sets its options as provided.
	 * If the target option is provided, the language is defined as a redirect.
	 * Other possible options are script, regions and autonym.
	 * @param string $languageCode New language code.
	 * @param array $options Language properties.
	 */
	public function addLanguage( string $languageCode, array $options ): void {
		$languages = $this->getLanguages();
		if ( isset( $options['target'] ) ) {
			$languages->$languageCode = [ $options['target'] ];
			return;
		}

		// Options are optional; missing ones must not raise undefined-index
		// warnings. A missing autonym makes getAutonym() fall back to the code.
		$languages->$languageCode = [
			$options['script'] ?? null,
			$options['regions'] ?? [],
			$options['autonym'] ?? null,
		];
	}

	/**
	 * Follows redirects starting at the given code until a language that is
	 * not a redirect is reached.
	 * @param string $languageCode
	 * @return string|bool Final language code, or false if the chain ends at
	 *  an unknown code or loops back on itself
	 */
	private function resolveRedirects( string $languageCode ) {
		$visited = [];
		while ( $this->isKnown( $languageCode ) ) {
			if ( isset( $visited[$languageCode] ) ) {
				// Redirect cycle: give up instead of looping forever.
				return false;
			}
			$visited[$languageCode] = true;

			$target = $this->isRedirect( $languageCode );
			if ( $target === false ) {
				return $languageCode;
			}
			$languageCode = (string)$target;
		}

		return false;
	}

	/**
	 * Returns the codes of all languages that are not redirects.
	 * @return string[]
	 */
	private function getNonRedirectLanguageCodes(): array {
		$languageCodes = [];
		foreach ( $this->getLanguages() as $languageCode => $unused ) {
			$languageCode = (string)$languageCode;
			if ( $this->isRedirect( $languageCode ) === false ) {
				$languageCodes[] = $languageCode;
			}
		}

		return $languageCodes;
	}

	/**
	 * Return the language data based on language code. Performs no check, meant for
	 * internal use only.
	 * @param string $languageCode
	 * @return array
	 */
	private function getLanguage( string $languageCode ): array {
		return $this->data->languages->$languageCode;
	}
}

305
src/index.js Normal file
View File

@@ -0,0 +1,305 @@
var languageData = require( '../data/language-data.json' );
/**
* Utility functions for querying language data.
*/
/**
 * Check whether the languageCode is known to the language database.
 * For practical purposes it may be same as checking if given language code is valid,
 * but not guaranteed that all valid language codes are in our database.
 *
 * Only entries stored in the database itself count: names inherited from
 * Object.prototype (for example "constructor" or "toString") are not
 * reported as known languages.
 * @param {string} languageCode language code
 * @return {boolean}
 */
function isKnown( languageCode ) {
	var languages = languageData.languages;
	return Object.prototype.hasOwnProperty.call( languages, languageCode ) &&
		!!languages[ languageCode ];
}
/**
 * Tell whether a language code merely redirects to another language.
 * A known language whose entry has exactly one element is a redirect, and
 * that element is the target.
 * @param {string} language Language code
 * @return {string|boolean} Target language code for a redirect, otherwise false
 */
function isRedirect( language ) {
	var entry;
	if ( !isKnown( language ) ) {
		return false;
	}
	entry = languageData.languages[ language ];
	return entry.length === 1 ? entry[ 0 ] : false;
}
/**
 * Give access to the complete language table.
 * The returned object is the table held by the module, not a copy.
 * @return {Object} Language entries keyed by language code
 */
function getLanguages() {
	var allLanguages = languageData.languages;
	return allLanguages;
}
/**
 * Look up the script a language is written in, following redirects.
 * @param {string} language Language code
 * @return {string} Script code; 'Zyyy' (undetermined) for unknown languages
 */
function getScript( language ) {
	var redirectTarget;
	if ( !isKnown( language ) ) {
		// Undetermined
		return 'Zyyy';
	}
	redirectTarget = isRedirect( language );
	return redirectTarget ?
		getScript( redirectTarget ) :
		languageData.languages[ language ][ 0 ];
}
/**
 * Look up the regions in which a language is spoken, following redirects.
 * @param {string} language Language code
 * @return {string[]|string} List of region codes, or the string 'UNKNOWN'
 *  when the language is not known or has no regions recorded
 */
function getRegions( language ) {
	var redirectTarget = isRedirect( language ),
		regions;
	if ( redirectTarget ) {
		return getRegions( redirectTarget );
	}
	if ( isKnown( language ) ) {
		regions = languageData.languages[ language ][ 1 ];
		if ( regions ) {
			return regions;
		}
	}
	return 'UNKNOWN';
}
/**
 * Look up the autonym of a language, following redirects.
 * @param {string} language Language code
 * @return {string} The autonym; the language code itself when the language
 *  is unknown or has no autonym recorded
 */
function getAutonym( language ) {
	var redirectTarget = isRedirect( language ),
		autonym;
	if ( redirectTarget ) {
		return getAutonym( redirectTarget );
	}
	if ( !isKnown( language ) ) {
		return language;
	}
	autonym = languageData.languages[ language ][ 2 ];
	return autonym || language;
}
/**
 * Collect the autonym of every language that is not a redirect.
 * @return {Object} Autonyms keyed by language code
 */
function getAutonyms() {
	return Object.keys( languageData.languages ).reduce( function ( autonymsByCode, code ) {
		if ( !isRedirect( code ) ) {
			autonymsByCode[ code ] = getAutonym( code );
		}
		return autonymsByCode;
	}, {} );
}
/**
 * List every non-redirect language written in one of the given scripts.
 * @param {string[]} scripts Script codes to look for
 * @return {string[]} Language codes, in the order of the language table
 */
function getLanguagesInScripts( scripts ) {
	return Object.keys( languageData.languages ).filter( function ( code ) {
		if ( isRedirect( code ) ) {
			return false;
		}
		return scripts.includes( getScript( code ) );
	} );
}
/**
 * List every non-redirect language written in a single script.
 * Shorthand for getLanguagesInScripts() with a one-element list.
 * @param {string} script Script code
 * @return {string[]} Language codes
 */
function getLanguagesInScript( script ) {
	var scripts = [ script ];
	return getLanguagesInScripts( scripts );
}
/**
 * Find the script group that contains a script.
 * The first group listing the script wins; 'Other' is returned when no
 * group lists it.
 * @param {string} script Script code
 * @return {string} script group name
 */
function getGroupOfScript( script ) {
	var groups = languageData.scriptgroups,
		matchingGroup = Object.keys( groups ).find( function ( groupName ) {
			return groups[ groupName ].includes( script );
		} );
	return matchingGroup === undefined ? 'Other' : matchingGroup;
}
/**
 * Find the script group of the script a language is written in.
 * @param {string} language Language code
 * @return {string} script group name
 */
function getScriptGroupOfLanguage( language ) {
	var script = getScript( language );
	return getGroupOfScript( script );
}
/**
 * Group the given languages by script group.
 * A redirect is grouped by the script group of its target, but it is listed
 * under the code that was passed in.
 * @param {string[]} languages Array of language codes to group
 * @return {Object} Lists of language codes keyed by script group name
 */
function getLanguagesByScriptGroup( languages ) {
	var grouped = {},
		code, group;
	for ( code of languages ) {
		group = getScriptGroupOfLanguage( isRedirect( code ) || code );
		if ( !grouped[ group ] ) {
			grouped[ group ] = [];
		}
		grouped[ group ].push( code );
	}
	return grouped;
}
/**
 * Group by script group every non-redirect language that is spoken in at
 * least one of the given regions. A language is listed once, no matter how
 * many of the regions it matches.
 * @param {string[]} regions array of region codes
 * @return {Object} Lists of language codes keyed by script group name
 */
function getLanguagesByScriptGroupInRegions( regions ) {
	var grouped = {};
	Object.keys( languageData.languages ).forEach( function ( code ) {
		var region, group;
		if ( isRedirect( code ) ) {
			return;
		}
		for ( region of regions ) {
			if ( !getRegions( code ).includes( region ) ) {
				continue;
			}
			group = getScriptGroupOfLanguage( code );
			if ( grouped[ group ] === undefined ) {
				grouped[ group ] = [];
			}
			grouped[ group ].push( code );
			// First matching region settles it.
			return;
		}
	} );
	return grouped;
}
/**
 * Group by script group every non-redirect language spoken in one region.
 * Shorthand for getLanguagesByScriptGroupInRegions() with a one-element list.
 * @param {string} region Region code
 * @return {Object} Lists of language codes keyed by script group name
 */
function getLanguagesByScriptGroupInRegion( region ) {
	var regions = [ region ];
	return getLanguagesByScriptGroupInRegions( regions );
}
/**
 * Order the given languages by the name of their script group.
 * Group names are sorted with the default Array sort; inside a group the
 * languages keep the order in which they were passed.
 * @param {string[]} languages Array of language codes to sort
 * @return {string[]} Array of language codes
 */
function sortByScriptGroup( languages ) {
	var grouped = getLanguagesByScriptGroup( languages );
	return Object.keys( grouped ).sort().reduce( function ( ordered, groupName ) {
		return ordered.concat( grouped[ groupName ] );
	}, [] );
}
/**
 * A callback for sorting languages by autonym.
 * Can be used as an argument to a sort function.
 * Autonyms are compared case-insensitively; a language without an autonym
 * is compared by its code.
 * @param {string} a Language code
 * @param {string} b Language code
 * @return {number} -1 if a sorts first, 1 if b sorts first, 0 if they tie
 */
function sortByAutonym( a, b ) {
	var autonymA = ( getAutonym( a ) || a ).toLowerCase(),
		autonymB = ( getAutonym( b ) || b ).toLowerCase();
	// A comparator has to report ties with 0. Returning 1 for equal values
	// claims both a > b and b > a, which sort implementations may not handle
	// consistently.
	if ( autonymA === autonymB ) {
		return 0;
	}
	return autonymA < autonymB ? -1 : 1;
}
/**
 * Tell whether a language is written right-to-left, i.e. whether its script
 * is listed among the right-to-left scripts of the database.
 * @param {string} language Language code
 * @return {boolean}
 */
function isRtl( language ) {
	var script = getScript( language );
	return languageData.rtlscripts.includes( script );
}
/**
 * Give the writing direction of a language.
 * @param {string} language Language code
 * @return {string} 'rtl' for right-to-left languages, 'ltr' otherwise
 */
function getDir( language ) {
	if ( isRtl( language ) ) {
		return 'rtl';
	}
	return 'ltr';
}
/**
 * Returns the languages spoken in a territory.
 * Only territories stored in the database itself are considered, so names
 * inherited from Object.prototype (for example "constructor") yield
 * undefined instead of an unrelated built-in.
 * @param {string} territory Territory code
 * @return {string[]|undefined} list of language codes, or undefined when
 *  the territory has no entry
 */
function getLanguagesInTerritory( territory ) {
	var territories = languageData.territories;
	return Object.prototype.hasOwnProperty.call( territories, territory ) ?
		territories[ territory ] :
		undefined;
}
/**
 * Register a language at run time, replacing any existing entry for the code.
 * With a target option the language becomes a redirect to that target;
 * otherwise its entry is built from the script, regions and autonym options.
 *
 * @param {string} code New language code.
 * @param {Object} options Language properties.
 */
function addLanguage( code, options ) {
	var entry = options.target ?
		[ options.target ] :
		[ options.script, options.regions, options.autonym ];
	languageData.languages[ code ] = entry;
}
module.exports = {
addLanguage,
getAutonym,
getAutonyms,
getDir,
getGroupOfScript,
getLanguages,
getLanguagesByScriptGroup,
getLanguagesByScriptGroupInRegion,
getLanguagesByScriptGroupInRegions,
getLanguagesInScript,
getLanguagesInScripts,
getLanguagesInTerritory,
getRegions,
getScript,
getScriptGroupOfLanguage,
isKnown,
isRedirect,
isRtl,
sortByScriptGroup,
sortByAutonym
};

1046
src/util/spyc.php Normal file

File diff suppressed because it is too large Load Diff

111
src/util/ulsdata2json.php Normal file
View File

@@ -0,0 +1,111 @@
<?php
/**
* Script to create the language data in JSON format for ULS.
*
* Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
* Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
* contributors. See CREDITS for a list.
*
* @file
* @ingroup Extensions
* @license GPL-2.0-or-later
*/
require_once __DIR__ . '/spyc.php';

// All data files live in the repository's data directory. Paths are anchored
// on this script's location so the result does not depend on the directory
// the script is started from.
$dataDirectory = __DIR__ . '/../../data';

print "Reading langdb.yaml...\n";
$yamlLangdb = file_get_contents( $dataDirectory . '/langdb.yaml' );
if ( $yamlLangdb === false ) {
	die( "Failed to read $dataDirectory/langdb.yaml.\n" );
}
$parsedLangdb = spyc_load( $yamlLangdb );

// Download the CLDR supplemental data, which lists the languages of each territory.
$supplementalDataFilename = 'supplementalData.xml';
$supplementalDataUrl =
	// phpcs:ignore Generic.Files.LineLength
	"https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/supplementalData.xml";

$supplementalDataFile = fopen( $supplementalDataFilename, 'w' );
if ( $supplementalDataFile === false ) {
	die( "Failed to open $supplementalDataFilename for writing.\n" );
}

$curl = curl_init( $supplementalDataUrl );
curl_setopt( $curl, CURLOPT_FILE, $supplementalDataFile );
curl_setopt( $curl, CURLOPT_HEADER, 0 );
// Treat HTTP error responses as failures instead of saving the error page.
curl_setopt( $curl, CURLOPT_FAILONERROR, true );

print "Trying to download $supplementalDataUrl...\n";
$curlSuccess = curl_exec( $curl );
curl_close( $curl );
fclose( $supplementalDataFile );

if ( !$curlSuccess ) {
	die( "Failed to download CLDR data from $supplementalDataUrl.\n" );
}

print "Downloaded $supplementalDataFilename, trying to parse...\n";
$supplementalData = simplexml_load_file( $supplementalDataFilename );
if ( !( $supplementalData instanceof SimpleXMLElement ) ) {
	die( "Attempt to load CLDR data from $supplementalDataFilename failed.\n" );
}

print "CLDR supplemental data parsed successfully, reading territories info...\n";

// Pass 1: collect the language codes listed for every territory.
$parsedLangdb['territories'] = [];
foreach ( $supplementalData->territoryInfo->territory as $territoryRecord ) {
	$territoryAttributes = $territoryRecord->attributes();
	$territoryCodeAttr = $territoryAttributes['type'];
	$territoryCode = (string)$territoryCodeAttr[0];
	$parsedLangdb['territories'][$territoryCode] = [];

	foreach ( $territoryRecord->languagePopulation as $languageRecord ) {
		$languageAttributes = $languageRecord->attributes();
		$languageCodeAttr = $languageAttributes['type'];
		// Lower case is a convention for language codes in ULS.
		// '_' is used in CLDR for compound codes and it's replaced with '-' here.
		$normalisedCode = strtr( strtolower( (string)$languageCodeAttr[0] ), '_', '-' );
		$parsedLangdb['territories'][$territoryCode][] = $normalisedCode;

		// In case of codes with variants, also add the base because ULS might consider
		// them as separate languages, e.g. zh, zh-hant and zh-hans.
		if ( strpos( $normalisedCode, '-' ) !== false ) {
			$parts = explode( '-', $normalisedCode );
			$parsedLangdb['territories'][$territoryCode][] = $parts[0];
		}
	}
}

// Pass 2: drop languages missing from langdb, resolve redirects and tidy up the lists.
foreach ( $parsedLangdb['territories'] as $territoryCode => $languages ) {
	foreach ( $languages as $index => $language ) {
		if ( !isset( $parsedLangdb['languages'][$language] ) ) {
			echo "Unknown language $language for territory $territoryCode\n";
			unset( $parsedLangdb['territories'][$territoryCode][$index] );
			continue;
		}

		// A single-element entry is a redirect: store its target instead.
		$data = $parsedLangdb['languages'][$language];
		if ( count( $data ) === 1 ) {
			echo "Redirect for language $language to {$data[0]} territory $territoryCode\n";
			$parsedLangdb['territories'][$territoryCode][$index] = $data[0];
		}
	}

	// Clean-up to save space
	if ( count( $parsedLangdb['territories'][$territoryCode] ) === 0 ) {
		unset( $parsedLangdb['territories'][$territoryCode] );
		continue;
	}

	// Remove duplicates we might have created
	$parsedLangdb['territories'][$territoryCode] =
		array_unique( $parsedLangdb['territories'][$territoryCode] );

	// We need to renumber or json conversion thinks these are objects
	$parsedLangdb['territories'][$territoryCode] =
		array_values( $parsedLangdb['territories'][$territoryCode] );
}

print "Writing JSON langdb...\n";
// Pretty-printed for making diff review easier.
$jsonVerbose = json_encode( $parsedLangdb, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE );
if ( $jsonVerbose === false ) {
	die( "Failed to encode the language data as JSON: " . json_last_error_msg() . "\n" );
}

// Write next to langdb.yaml, which is where the PHP and JS interfaces read
// the generated file from.
$outputPath = $dataDirectory . '/language-data.json';
if ( file_put_contents( $outputPath, $jsonVerbose ) === false ) {
	die( "Failed to write $outputPath.\n" );
}

print "Done.\n";