blob: d15c61dc8c39d390cb19bdd47d86830fe6575aa1 [file] [log] [blame]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
""" Usage: make_intl_data.py [language-subtag-registry.txt]
This script extracts information about mappings between deprecated and
current BCP 47 language tags from the IANA Language Subtag Registry and
converts it to JavaScript object definitions in IntlData.js. The definitions
are used in Intl.js.
The IANA Language Subtag Registry is imported from
http://www.iana.org/assignments/language-subtag-registry
and uses the syntax specified in
http://tools.ietf.org/html/rfc5646#section-3
"""
def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

    registry is any iterable of text lines, for example an open file.
    Records are separated by lines consisting of "%%". Each record maps
    field names to field values, both stripped of surrounding whitespace.
    If a field name occurs more than once in a record, the last value wins.
    Blank lines are ignored.

    A line that starts with whitespace and follows a field of the same
    record is a folded (continuation) line, as specified in
    http://tools.ietf.org/html/rfc5646#section-3.1.1
    Its text is appended to the value of that field, separated by a single
    space, even if the text contains a colon. A line without any colon is
    treated as a continuation as well.

    Raises ValueError if a continuation line has no field to continue.
    """
    record = {}
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            # A continuation must never leak into the next record.
            key = None
            continue
        # Folding has to be detected on the raw line: once stripped, a folded
        # line containing a colon looks exactly like a new field.
        isFolded = key is not None and rawLine[:1].isspace()
        if ":" in line and not isFolded:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
        elif key is not None:
            # continuation line
            record[key] += " " + line
        else:
            raise ValueError("Continuation line without preceding field: " + line)
    if record:
        yield record
    return
def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

    Information extracted:
    - langTagMappings: mappings from complete language tags to preferred
      complete language tags
    - langSubtagMappings: mappings from subtags to preferred subtags
    - extlangMappings: mappings from extlang subtags to preferred subtags,
      with prefix to be removed

    Returns a dictionary with these three mappings under the keys
    "langTagMappings", "langSubtagMappings", and "extlangMappings", along
    with the registry's file date under the key "fileDate".

    We also check that mappings for language subtags don't affect extlang
    subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
    to separate them for processing. Region codes are separated by case,
    and script codes by length, so they're unproblematic.

    Raises Exception if such a conflict is found, if a non-extlang subtag
    other than heploc has both a preferred value and a prefix, or if the
    registry contains no File-Date record. Raises AssertionError if a record
    has a type that is not handled here.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    # Stays None until the File-Date record has been seen.
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly (rather than via an assert statement) so that
            # the check is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized Type: {0}".format(recordType))

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    # Without a File-Date record there is nothing sensible to report as the
    # file date; fail with a clear message instead of an unbound local.
    if fileDate is None:
        raise Exception("Registry contains no File-Date record.")

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}
def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

    Writes the contents of dictionary dict to file intlData with the given
    variable name and a comment with description, fileDate, and URL.

    Entries are written sorted by key. A string value is written as a quoted
    string; any other value must support lookup of the keys "preferred" and
    "prefix" and is written as an object literal with these two properties.
    Keys and values are written verbatim, without escaping.
    """
    # basestring only exists in Python 2; on Python 3 all text is str.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            preferred = entry["preferred"]
            prefix = entry["prefix"]
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix)
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")
def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

    Emits one variable definition per mapping table, in the order
    langTagMappings, langSubtagMappings, extlangMappings. Each definition
    carries the same fileDate and url in its comment.
    """
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mapping, varName, summary in tables:
        writeMappingsVar(intlData, mapping, varName, summary, fileDate, url)
if __name__ == '__main__':
    # Script entry point: obtain the registry (from the file named by the
    # first command line argument, or else by downloading it), extract the
    # mappings, and write them to IntlData.js.
    import codecs
    import sys
    from contextlib import closing
    try:
        import urllib2
    except ImportError:
        # Python 3 provides urlopen in urllib.request instead.
        import urllib.request as urllib2

    url = "http://www.iana.org/assignments/language-subtag-registry"

    # Downloaded registry text; stays None when a local file is used.
    text = None
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        # closing() releases the connection even if reading or decoding fails.
        with closing(urllib2.urlopen(url)) as reader:
            text = reader.read().decode("utf-8")
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # The with block closes the registry file even if processing raises.
    with registry:
        if text is not None:
            # Keep a local copy of the download, then rewind to read it back.
            registry.write(text)
            registry.seek(0)
        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)