#!/bin/bash
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Remove display names for languages that are not listed in the accept-language
# list of Chromium.
#
# Globals read:
#   scriptdir     directory holding accept_lang.list and
#                 chrome_ui_languages.list ('#' lines are ignored in both).
#   langdatapath  directory holding one <lang>.txt file per UI language.
# Side effects:
#   Rewrites ${langdatapath}/<lang>.txt in place for every entry of
#   chrome_ui_languages.list: the Keys, Types and Variants blocks are deleted
#   and the Languages block keeps only entries matching an accept-language
#   code. Progress messages go to stdout.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data files exactly.
function filter_display_language_names {
  local lang op target
  # Always start from an empty alternation so that a value inherited from the
  # environment (or left over from an earlier call) cannot add languages.
  local accept_lang_pattern=''

  # Word splitting of the grep output is intentional: one code per word.
  for lang in $(grep -v '^#' "${scriptdir}/accept_lang.list"); do
    # Set ${op} to '|' only if the pattern built so far is not empty.
    op=${accept_lang_pattern:+|}
    accept_lang_pattern="${accept_lang_pattern}${op}${lang}"
  done
  # The trailing [^a-z] stops a code from matching a longer code that merely
  # starts with it.
  accept_lang_pattern="(${accept_lang_pattern})[^a-z]"

  echo "Filtering out display names for non-A-L languages ${langdatapath}"
  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list"); do
    target="${langdatapath}/${lang}.txt"
    echo "Overwriting ${target} ..."
    # Inside the Languages block print the opening line, the accepted entries
    # and the closing line, then delete everything so nothing prints twice.
    sed -r -i \
'/^ Keys\{$/,/^ \}$/d
/^ Languages\{$/, /^ \}$/ {
/^ Languages\{$/p
/^ '"${accept_lang_pattern}"'/p
/^ \}$/p
d
}
/^ Types\{$/,/^ \}$/d
/^ Variants\{$/,/^ \}$/d' "${target}"
    # Delete an empty "Languages" block. Otherwise, getting the display
    # name for all the language in a given locale (e.g. en_GB) would fail
    # when the above filtering sed command results in an empty "Languages"
    # block.
    sed -r -i \
'/^ Languages\{$/ {
N
/^ Languages\{\n \}/ d
}' "${target}"
  done
}
# Keep only the minimum locale data for non-UI languages.
#
# A language counts as "non-UI" when it appears in accept_lang.list and its
# line does not match any code from chrome_ui_languages.list.
#
# Globals read:
#   scriptdir       directory holding accept_lang.list and
#                   chrome_ui_languages.list ('#' lines are ignored).
#   localedatapath  directory with per-language locale files (<lang>.txt).
#   langdatapath    directory with per-language display-name files.
# Side effects:
#   Rewrites <lang>.txt in both directories in place; a language without a
#   file is reported as "missing <lang>" and skipped. Messages go to stdout.
# NOTE(review): the UI-language alternation is unanchored, so an
# accept-language entry is also skipped when it merely contains a UI language
# code as a substring -- confirm that this is intended.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data files exactly.
function abridge_locale_data_for_non_ui_languages {
  local lang op target extra_languages
  # Always start from an empty alternation so that a value inherited from the
  # environment (or left over from an earlier call) cannot hide languages.
  local ui_languages=''

  # Word splitting of the grep output is intentional: one code per word.
  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list"); do
    # Set ${op} to '|' only if the pattern built so far is not empty.
    op=${ui_languages:+|}
    ui_languages="${ui_languages}${op}${lang}"
  done
  # grep -E is the supported spelling of the obsolescent egrep.
  extra_languages=$(grep -E -v -e '^#' -e "(${ui_languages})" \
    "${scriptdir}/accept_lang.list")

  echo "Creating minimum locale data in ${localedatapath}"
  for lang in ${extra_languages}; do
    target="${localedatapath}/${lang}.txt"
    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
    echo "Overwriting ${target} ..."
    # Do not include '%%Parent' line on purpose.
    # With -n only the explicitly printed pieces survive: everything up to
    # the "<lang>{" line, the alias, the exemplar character sets,
    # LocaleScript/layout, Version and the closing brace.
    sed -n -r -i \
'1, /^'"${lang}"'\{$/p
/^ "%%ALIAS"\{/p
/^ AuxExemplarCharacters\{.*\}$/p
/^ AuxExemplarCharacters\{$/, /^ \}$/p
/^ ExemplarCharacters\{.*\}$/p
/^ ExemplarCharacters\{$/, /^ \}$/p
/^ (LocaleScript|layout)\{$/, /^ \}$/p
/^ Version\{.*$/p
/^\}$/p' "${target}"
  done

  echo "Creating minimum locale data in ${langdatapath}"
  for lang in ${extra_languages}; do
    target="${langdatapath}/${lang}.txt"
    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
    echo "Overwriting ${target} ..."
    # Do not include '%%Parent' line on purpose.
    # Within the Languages block only the language's own one-line entry is
    # kept, together with the block's opening and closing lines.
    sed -n -r -i \
'1, /^'"${lang}"'\{$/p
/^ "%%ALIAS"\{/p
/^ Languages\{$/, /^ \}$/ {
/^ Languages\{$/p
/^ '"${lang}"'\{.*\}$/p
/^ \}$/p
}
/^\}$/p' "${target}"
  done
}
# Keep only the currencies used by the largest 150 economies in terms of GDP.
# TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies.
# See also http://en.wikipedia.org/wiki/List_of_circulating_currencies
#
# Globals read:
#   scriptdir  directory holding currencies.list ('#' lines are ignored).
#   dataroot   data root; every ${dataroot}/curr/*.txt except
#              supplementalData.txt is rewritten in place.
# Progress messages go to stdout.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data files exactly.
function filter_currency_data {
  local currency op i locale
  # Alternation of the currency codes to keep, e.g. "(USD|EUR)".
  local keeplist=''

  # Word splitting of the grep output is intentional: one code per word.
  for currency in $(grep -v '^#' "${scriptdir}/currencies.list"); do
    # Set ${op} to '|' only if the list built so far is not empty.
    op=${keeplist:+|}
    keeplist="${keeplist}${op}${currency}"
  done
  keeplist="(${keeplist})"

  for i in "${dataroot}"/curr/*.txt; do
    # An unmatched glob stays literal; there is nothing to rewrite then.
    [ -e "$i" ] || continue
    locale=$(basename "$i" .txt)
    [[ "${locale}" == 'supplementalData' ]] && continue
    echo "Overwriting $i for $locale"
    # With -n only the explicitly printed pieces survive: the header up to
    # "<locale>{", alias/parent lines, the kept currencies inside the
    # Currencies, Currencies%narrow and CurrencyPlurals blocks, the
    # currency map/meta/spacing/unit-pattern blocks, Version and the
    # closing brace.
    sed -n -r -i \
'1, /^'"${locale}"'\{$/ p
/^ "%%ALIAS"\{/p
/^ %%Parent\{/p
/^ Currencies\{$/, /^ \}$/ {
/^ Currencies\{$/ p
/^ '"${keeplist}"'\{$/, /^ \}$/ p
/^ \}$/ p
}
/^ Currencies%narrow\{$/, /^ \}$/ {
/^ Currencies%narrow\{$/ p
/^ '"${keeplist}"'\{".*\}$/ p
/^ \}$/ p
}
/^ CurrencyPlurals\{$/, /^ \}$/ {
/^ CurrencyPlurals\{$/ p
/^ '"${keeplist}"'\{$/, /^ \}$/ p
/^ \}$/ p
}
/^ [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^ \}$/ p
/^ Version\{.*\}$/p
/^\}$/p' "$i"
  done
}
# Remove the display names for numeric region codes other than
# 419 (Latin America) because we don't use them.
#
# Deletes, in place, every line of ${dataroot}/region/*.txt that contains a
# three-digit code followed by '{' whose first digit is not 4. The pattern
# therefore spares every code starting with 4, which includes 419.
function filter_region_data {
  sed -i '/[0-35-9][0-9][0-9]{/ d' "${dataroot}"/region/*.txt
}
# Trim the start of the zoneStrings section in every ${dataroot}/zone/*.txt
# except root.txt: all lines between the "zoneStrings" line and the first
# '"meta:' line are deleted in place, while those two lines are kept.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data files exactly.
function remove_exemplar_cities {
  local i
  for i in "${dataroot}"/zone/*.txt; do
    # The loop variable carries the full path, so compare only its last
    # component; comparing the whole path never matches 'root.txt'.
    [[ "${i##*/}" == 'root.txt' ]] && continue
    # Print the two boundary lines explicitly, then delete the whole range
    # so that they are not printed a second time.
    sed -i '/^ zoneStrings/, /^ "meta:/ {
/^ zoneStrings/ p
/^ "meta:/ p
d
}' "$i"
  done
}
# Keep only duration and compound in units* sections.
#
# Rewrites every ${dataroot}/unit/*.txt in place. Inside a units,
# unitsNarrow or unitsShort block only the opening line, the duration and
# compound sub-blocks and the closing line are printed; everything else in
# the block is deleted. Lines outside those blocks are left alone.
# Progress messages go to stdout.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data files exactly.
function filter_unit_data {
  local i
  for i in "${dataroot}"/unit/*.txt; do
    echo "Overwriting $i ..."
    sed -r -i \
'/^ units(|Narrow|Short)\{$/, /^ \}$/ {
/^ units(|Narrow|Short)\{$/ p
/^ (duration|compound)\{$/, /^ \}$/ p
/^ \}$/ p
d
}' "${i}"
  done
}
# big5han and gb2312han collation do not make any sense and nobody uses them.
#
# Deletes, in place, the unihan, big5han and gb2312han blocks from
# ${dataroot}/coll/zh.txt (the pattern covers all three names). Each block
# runs from its opening line to the next closing-brace line matched by the
# range. Progress messages go to stdout.
# NOTE: the sed addresses are whitespace-sensitive; the leading blank in each
# pattern has to match the indentation used by the data file exactly.
function remove_legacy_chinese_codepoint_collation {
  local target="${dataroot}/coll/zh.txt"
  echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale"
  echo "Overwriting ${target}"
  sed -r -i '/^ (uni|big5|gb2312)han\{$/,/^ \}$/ d' "${target}"
}
# Locate the source tree relative to this script; the trimming functions
# read the variables below as globals.
treeroot="$(dirname "$0")/.."
scriptdir="${treeroot}/scripts"
dataroot="${treeroot}/source/data"
langdatapath="${dataroot}/lang"
localedatapath="${dataroot}/locales"

# Run the trimming passes one after another, in this fixed order.
for trim_step in \
    filter_display_language_names \
    abridge_locale_data_for_non_ui_languages \
    filter_currency_data \
    filter_region_data \
    remove_legacy_chinese_codepoint_collation \
    filter_unit_data
do
  "${trim_step}"
done

# Chromium OS needs exemplar cities for timezones, but not Chromium.
# It'll save 400kB (uncompressed), but the size difference in
# 7z compressed installer is <= 100kB.
# TODO(jshin): Make separate data files for CrOS and Chromium.
#remove_exemplar_cities