# src/third_party/icu/scripts/trim_data.sh - cobalt - Git at Google

 #!/bin/bash
 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.


 # Remove display names for languages that are not listed in the accept-language
 # list of Chromium.
 #
 # Globals read:
 #   scriptdir    - directory containing accept_lang.list and
 #                  chrome_ui_languages.list (lines starting with '#' are
 #                  skipped).
 #   langdatapath - directory containing the <lang>.txt files to rewrite.
 # Side effects:
 #   Rewrites ${langdatapath}/<lang>.txt in place (sed -i) for every entry of
 #   chrome_ui_languages.list: the Keys, Types and Variants blocks are dropped,
 #   and so is every Languages entry that is not in accept_lang.list.
 function filter_display_language_names {
   local lang target
   # Always start from an empty pattern so that a same-named variable inherited
   # from the environment cannot add alternatives to the regular expression.
   local accept_lang_pattern=""

   for lang in $(grep -v '^#' "${scriptdir}/accept_lang.list")
   do
     # Prepend "<pattern>|" only once the pattern is non-empty.
     accept_lang_pattern="${accept_lang_pattern:+${accept_lang_pattern}|}${lang}"
   done
   # [^a-z] requires a non-lowercase character right after the code, so "en"
   # matches the key "en{" but not a longer key such as "enm{".
   accept_lang_pattern="(${accept_lang_pattern})[^a-z]"

   echo "Filtering out display names for non-A-L languages ${langdatapath}"
   for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
   do
     target="${langdatapath}/${lang}.txt"
     echo "Overwriting ${target} ..."
     # The pattern is spliced in double-quoted so the shell neither word-splits
     # nor glob-expands it ("[^a-z]" is a valid glob bracket expression).
     sed -r -i \
       '/^    Keys\{$/,/^    \}$/d
        /^    Languages\{$/, /^    \}$/ {
          /^    Languages\{$/p
          /^        '"${accept_lang_pattern}"'/p
          /^    \}$/p
          d
        }
        /^    Types\{$/,/^    \}$/d
        /^    Variants\{$/,/^    \}$/d' -- "${target}"

     # Delete an empty "Languages" block. Otherwise, getting the display
     # name for all the language in a given locale (e.g. en_GB) would fail
     # when the above filtering sed command results in an empty "Languages"
     # block.
     sed -r -i \
       '/^    Languages\{$/ {
          N
          /^    Languages\{\n    \}/ d
        }' -- "${target}"
   done
 }


 # Keep only the minimum locale data for non-UI languages.
 #
 # A "non-UI" language is a line of accept_lang.list that is not a comment and
 # does not match (as an extended regex, anywhere in the line) any entry of
 # chrome_ui_languages.list.
 #
 # Globals read:
 #   scriptdir      - directory containing the two *.list files.
 #   localedatapath - directory of locale <lang>.txt files.
 #   langdatapath   - directory of language-name <lang>.txt files.
 # Side effects:
 #   Rewrites the matching <lang>.txt files in both directories in place and
 #   prints "missing <lang>" for files that do not exist.
 function abridge_locale_data_for_non_ui_languages {
   local lang target extra_languages
   # Start empty so a same-named variable from the environment cannot turn an
   # arbitrary language into a "UI language".
   local ui_languages=""

   for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
   do
     # Prepend "<list>|" only once the list is non-empty.
     ui_languages="${ui_languages:+${ui_languages}|}${lang}"
   done

   # grep -E replaces the obsolescent egrep, which warns on newer GNU grep.
   extra_languages=$(grep -E -v -e '^#' -e "(${ui_languages})" \
                     "${scriptdir}/accept_lang.list")

   echo "Creating minimum locale data in ${localedatapath}"
   for lang in ${extra_languages}
   do
     target="${localedatapath}/${lang}.txt"
     [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
     echo "Overwriting ${target} ..."

     # Do not include '%%Parent' line on purpose.
     # With -n only the explicitly printed lines survive: the header up to
     # "<lang>{", aliases, exemplar character sets, LocaleScript/layout,
     # Version and the closing brace.
     sed -n -r -i \
       '1, /^'"${lang}"'\{$/p
        /^    "%%ALIAS"\{/p
        /^    AuxExemplarCharacters\{.*\}$/p
        /^    AuxExemplarCharacters\{$/, /^    \}$/p
        /^    ExemplarCharacters\{.*\}$/p
        /^    ExemplarCharacters\{$/, /^    \}$/p
        /^    (LocaleScript|layout)\{$/, /^    \}$/p
        /^    Version\{.*$/p
        /^\}$/p' -- "${target}"
   done

   echo "Creating minimum locale data in ${langdatapath}"
   for lang in ${extra_languages}
   do
     target="${langdatapath}/${lang}.txt"
     [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
     echo "Overwriting ${target} ..."

     # Do not include '%%Parent' line on purpose.
     # Inside the Languages block only the language's own name is kept.
     sed -n -r -i \
       '1, /^'"${lang}"'\{$/p
        /^    "%%ALIAS"\{/p
        /^    Languages\{$/, /^    \}$/ {
          /^    Languages\{$/p
          /^        '"${lang}"'\{.*\}$/p
          /^    \}$/p
        }
        /^\}$/p' -- "${target}"
   done
 }

 # Keep only the currencies used by the largest 150 economies in terms of GDP.
 # TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies.
 # See also http://en.wikipedia.org/wiki/List_of_circulating_currencies
 #
 # Globals read:
 #   scriptdir - directory containing currencies.list ('#' lines are skipped).
 #   dataroot  - data directory; every curr/*.txt below it except
 #               supplementalData.txt is rewritten in place.
 function filter_currency_data {
   local currency i locale
   local keeplist=""
   for currency in $(grep -v '^#' "${scriptdir}/currencies.list")
   do
     # Prepend "<list>|" only once the list is non-empty.
     keeplist="${keeplist:+${keeplist}|}${currency}"
   done
   keeplist="(${keeplist})"

   # Quote the directory part so a path with whitespace is not word-split;
   # only the trailing *.txt is meant to glob.
   for i in "${dataroot}"/curr/*.txt
   do
     # An unmatched glob stays literal; skip it instead of handing sed a
     # path that does not exist.
     [ -e "$i" ] || continue
     locale=$(basename "$i" .txt)
     [ "$locale" == 'supplementalData' ] && continue
     echo "Overwriting $i for $locale"
     # With -n only the explicitly printed lines survive: the header, alias
     # and parent lines, the kept currencies inside the three per-currency
     # blocks, the currency map/meta/spacing/pattern blocks, Version and the
     # closing brace.
     sed -n -r -i \
       '1, /^'"${locale}"'\{$/ p
        /^    "%%ALIAS"\{/p
        /^    %%Parent\{/p
        /^    Currencies\{$/, /^    \}$/ {
          /^    Currencies\{$/ p
          /^        '"${keeplist}"'\{$/, /^        \}$/ p
          /^    \}$/ p
        }
        /^    Currencies%narrow\{$/, /^    \}$/ {
          /^    Currencies%narrow\{$/ p
          /^        '"${keeplist}"'\{".*\}$/ p
          /^    \}$/ p
        }
        /^    CurrencyPlurals\{$/, /^    \}$/ {
          /^    CurrencyPlurals\{$/ p
          /^        '"${keeplist}"'\{$/, /^        \}$/ p
          /^    \}$/ p
        }
        /^    [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^    \}$/ p
        /^    Version\{.*\}$/p
        /^\}$/p' -- "$i"
   done
 }

 # Remove the display names for numeric region codes other than
 # 419 (Latin America) because we don't use them.
 #
 # Deletes, in place, every line of ${dataroot}/region/*.txt that contains
 # three digits followed by '{' where the first digit is not 4.
 function filter_region_data {
   # Quote the directory part so a path with whitespace is not word-split;
   # only the trailing *.txt is meant to glob.
   sed -i '/[0-35-9][0-9][0-9]{/ d' -- "${dataroot}"/region/*.txt
 }


 # Strip, in every ${dataroot}/zone/*.txt except root.txt, the lines between
 # the "zoneStrings" line and the first '"meta:' line that follows it. Both
 # boundary lines are kept.
 function remove_exemplar_cities {
   local i
   for i in "${dataroot}"/zone/*.txt
   do
     # An unmatched glob stays literal; there is nothing to rewrite then.
     [ -e "$i" ] || continue
     # $i is a full path, so compare only its last component; comparing the
     # whole path against 'root.txt' never matches and root.txt would be
     # rewritten as well.
     [ "${i##*/}" != 'root.txt' ] || continue
     sed -i '/^    zoneStrings/, /^        "meta:/ {
       /^    zoneStrings/ p
       /^        "meta:/ p
       d
     }' -- "$i"
   done
 }

 # Keep only duration and compound in units* sections.
 #
 # Rewrites every ${dataroot}/unit/*.txt in place: inside the units,
 # unitsNarrow and unitsShort blocks only the "duration" and "compound"
 # sub-blocks survive; lines outside those blocks are left untouched.
 function filter_unit_data {
   local i
   # Quote the directory part so a path with whitespace is not word-split;
   # only the trailing *.txt is meant to glob.
   for i in "${dataroot}"/unit/*.txt
   do
     # An unmatched glob stays literal; there is nothing to rewrite then.
     [ -e "$i" ] || continue
     echo "Overwriting $i ..."
     # Within a units* block: print the header, the kept sub-blocks and the
     # closing brace explicitly, then delete the line so nothing else in the
     # block is auto-printed.
     sed -r -i \
       '/^    units(|Narrow|Short)\{$/, /^    \}$/ {
          /^    units(|Narrow|Short)\{$/ p
          /^        (duration|compound)\{$/, /^        \}$/ p
          /^    \}$/ p
          d
        }' -- "${i}"
   done
 }

 # big5han and gb2312han collation do not make any sense and nobody uses them.
 # The unihan block is removed along with them.
 #
 # Rewrites ${dataroot}/coll/zh.txt in place, deleting each
 # "unihan{", "big5han{" and "gb2312han{" block at 8-space indentation up to
 # its closing brace.
 function remove_legacy_chinese_codepoint_collation {
   local target="${dataroot}/coll/zh.txt"
   echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale"
   echo "Overwriting ${target}"
   # Quoted so that a data root containing whitespace is passed as one path.
   sed -r -i '/^        (uni|big5|gb2312)han\{$/,/^        \}$/ d' -- "${target}"
 }

 # Locate the ICU tree relative to this script: the parent of the directory
 # the script was invoked from.
 treeroot="$(dirname "$0")/.."
 scriptdir="${treeroot}/scripts"
 dataroot="${treeroot}/source/data"
 langdatapath="${dataroot}/lang"
 localedatapath="${dataroot}/locales"


 # Run the trimming passes one after another, in this fixed order.
 for trim_step in \
     filter_display_language_names \
     abridge_locale_data_for_non_ui_languages \
     filter_currency_data \
     filter_region_data \
     remove_legacy_chinese_codepoint_collation \
     filter_unit_data
 do
   "${trim_step}"
 done

 # Chromium OS needs exemplar cities for timezones, but not Chromium.
 # It'll save 400kB (uncompressed), but the size difference in
 # 7z compressed installer is <= 100kB.
 # TODO(jshin): Make separate data files for CrOS and Chromium.
 #remove_exemplar_cities
	#!/bin/bash
	# Copyright (c) 2014 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.


	# Remove display names for languages that are not listed in the accept-language
	# list of Chromium.
	#
	# Globals read:
	#   scriptdir    - directory containing accept_lang.list and
	#                  chrome_ui_languages.list (lines starting with '#' are
	#                  skipped).
	#   langdatapath - directory containing the <lang>.txt files to rewrite.
	# Side effects:
	#   Rewrites ${langdatapath}/<lang>.txt in place (sed -i) for every entry of
	#   chrome_ui_languages.list: the Keys, Types and Variants blocks are dropped,
	#   and so is every Languages entry that is not in accept_lang.list.
	function filter_display_language_names {
	  local lang target
	  # Always start from an empty pattern so that a same-named variable inherited
	  # from the environment cannot add alternatives to the regular expression.
	  local accept_lang_pattern=""

	  for lang in $(grep -v '^#' "${scriptdir}/accept_lang.list")
	  do
	    # Prepend "<pattern>|" only once the pattern is non-empty.
	    accept_lang_pattern="${accept_lang_pattern:+${accept_lang_pattern}|}${lang}"
	  done
	  # [^a-z] requires a non-lowercase character right after the code, so "en"
	  # matches the key "en{" but not a longer key such as "enm{".
	  accept_lang_pattern="(${accept_lang_pattern})[^a-z]"

	  echo "Filtering out display names for non-A-L languages ${langdatapath}"
	  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
	  do
	    target="${langdatapath}/${lang}.txt"
	    echo "Overwriting ${target} ..."
	    # The regexes match on exact indentation: 4 spaces for block headers
	    # and closers, 8 spaces for entries. The pattern is spliced in
	    # double-quoted so the shell neither word-splits nor glob-expands it.
	    sed -r -i \
	      '/^    Keys\{$/,/^    \}$/d
	       /^    Languages\{$/, /^    \}$/ {
	         /^    Languages\{$/p
	         /^        '"${accept_lang_pattern}"'/p
	         /^    \}$/p
	         d
	       }
	       /^    Types\{$/,/^    \}$/d
	       /^    Variants\{$/,/^    \}$/d' -- "${target}"

	    # Delete an empty "Languages" block. Otherwise, getting the display
	    # name for all the language in a given locale (e.g. en_GB) would fail
	    # when the above filtering sed command results in an empty "Languages"
	    # block.
	    sed -r -i \
	      '/^    Languages\{$/ {
	         N
	         /^    Languages\{\n    \}/ d
	       }' -- "${target}"
	  done
	}


	# Keep only the minimum locale data for non-UI languages.
	#
	# A "non-UI" language is a line of accept_lang.list that is not a comment and
	# does not match (as an extended regex, anywhere in the line) any entry of
	# chrome_ui_languages.list.
	#
	# Globals read:
	#   scriptdir      - directory containing the two *.list files.
	#   localedatapath - directory of locale <lang>.txt files.
	#   langdatapath   - directory of language-name <lang>.txt files.
	# Side effects:
	#   Rewrites the matching <lang>.txt files in both directories in place and
	#   prints "missing <lang>" for files that do not exist.
	function abridge_locale_data_for_non_ui_languages {
	  local lang target extra_languages
	  # Start empty so a same-named variable from the environment cannot turn an
	  # arbitrary language into a "UI language".
	  local ui_languages=""

	  for lang in $(grep -v '^#' "${scriptdir}/chrome_ui_languages.list")
	  do
	    # Prepend "<list>|" only once the list is non-empty.
	    ui_languages="${ui_languages:+${ui_languages}|}${lang}"
	  done

	  # grep -E replaces the obsolescent egrep, which warns on newer GNU grep.
	  extra_languages=$(grep -E -v -e '^#' -e "(${ui_languages})" \
	                    "${scriptdir}/accept_lang.list")

	  echo "Creating minimum locale data in ${localedatapath}"
	  for lang in ${extra_languages}
	  do
	    target="${localedatapath}/${lang}.txt"
	    # "||" must be the shell's OR operator here: skip files that are absent.
	    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
	    echo "Overwriting ${target} ..."

	    # Do not include '%%Parent' line on purpose.
	    # The regexes match on exact indentation (4 spaces for top-level keys).
	    sed -n -r -i \
	      '1, /^'"${lang}"'\{$/p
	       /^    "%%ALIAS"\{/p
	       /^    AuxExemplarCharacters\{.*\}$/p
	       /^    AuxExemplarCharacters\{$/, /^    \}$/p
	       /^    ExemplarCharacters\{.*\}$/p
	       /^    ExemplarCharacters\{$/, /^    \}$/p
	       /^    (LocaleScript|layout)\{$/, /^    \}$/p
	       /^    Version\{.*$/p
	       /^\}$/p' -- "${target}"
	  done

	  echo "Creating minimum locale data in ${langdatapath}"
	  for lang in ${extra_languages}
	  do
	    target="${langdatapath}/${lang}.txt"
	    [ -e "${target}" ] || { echo "missing ${lang}"; continue; }
	    echo "Overwriting ${target} ..."

	    # Do not include '%%Parent' line on purpose.
	    # Inside the Languages block only the language's own name is kept.
	    sed -n -r -i \
	      '1, /^'"${lang}"'\{$/p
	       /^    "%%ALIAS"\{/p
	       /^    Languages\{$/, /^    \}$/ {
	         /^    Languages\{$/p
	         /^        '"${lang}"'\{.*\}$/p
	         /^    \}$/p
	       }
	       /^\}$/p' -- "${target}"
	  done
	}

	# Keep only the currencies used by the largest 150 economies in terms of GDP.
	# TODO(jshin): Use ucurr_isAvailable in ICU to drop more currencies.
	# See also http://en.wikipedia.org/wiki/List_of_circulating_currencies
	#
	# Globals read:
	#   scriptdir - directory containing currencies.list ('#' lines are skipped).
	#   dataroot  - data directory; every curr/*.txt below it except
	#               supplementalData.txt is rewritten in place.
	function filter_currency_data {
	  local currency i locale
	  local keeplist=""
	  for currency in $(grep -v '^#' "${scriptdir}/currencies.list")
	  do
	    # Prepend "<list>|" only once the list is non-empty.
	    keeplist="${keeplist:+${keeplist}|}${currency}"
	  done
	  keeplist="(${keeplist})"

	  # Quote the directory part so a path with whitespace is not word-split;
	  # only the trailing *.txt is meant to glob.
	  for i in "${dataroot}"/curr/*.txt
	  do
	    # An unmatched glob stays literal; skip it instead of handing sed a
	    # path that does not exist.
	    [ -e "$i" ] || continue
	    locale=$(basename "$i" .txt)
	    [ "$locale" == 'supplementalData' ] && continue
	    echo "Overwriting $i for $locale"
	    # The regexes match on exact indentation: 4 spaces for block headers
	    # and closers, 8 spaces for per-currency entries.
	    sed -n -r -i \
	      '1, /^'"${locale}"'\{$/ p
	       /^    "%%ALIAS"\{/p
	       /^    %%Parent\{/p
	       /^    Currencies\{$/, /^    \}$/ {
	         /^    Currencies\{$/ p
	         /^        '"${keeplist}"'\{$/, /^        \}$/ p
	         /^    \}$/ p
	       }
	       /^    Currencies%narrow\{$/, /^    \}$/ {
	         /^    Currencies%narrow\{$/ p
	         /^        '"${keeplist}"'\{".*\}$/ p
	         /^    \}$/ p
	       }
	       /^    CurrencyPlurals\{$/, /^    \}$/ {
	         /^    CurrencyPlurals\{$/ p
	         /^        '"${keeplist}"'\{$/, /^        \}$/ p
	         /^    \}$/ p
	       }
	       /^    [cC]urrency(Map|Meta|Spacing|UnitPatterns)\{$/, /^    \}$/ p
	       /^    Version\{.*\}$/p
	       /^\}$/p' -- "$i"
	  done
	}

	# Remove the display names for numeric region codes other than
	# 419 (Latin America) because we don't use them.
	#
	# Deletes, in place, every line of ${dataroot}/region/*.txt that contains
	# three digits followed by '{' where the first digit is not 4.
	function filter_region_data {
	  # Quote the directory part so a path with whitespace is not word-split;
	  # only the trailing *.txt is meant to glob.
	  sed -i '/[0-35-9][0-9][0-9]{/ d' -- "${dataroot}"/region/*.txt
	}



	# Strip, in every ${dataroot}/zone/*.txt except root.txt, the lines between
	# the "zoneStrings" line and the first '"meta:' line that follows it. Both
	# boundary lines are kept.
	function remove_exemplar_cities {
	  local i
	  for i in "${dataroot}"/zone/*.txt
	  do
	    # An unmatched glob stays literal; there is nothing to rewrite then.
	    [ -e "$i" ] || continue
	    # $i is a full path, so compare only its last component; comparing the
	    # whole path against 'root.txt' never matches and root.txt would be
	    # rewritten as well.
	    [ "${i##*/}" != 'root.txt' ] || continue
	    # The regexes match on exact indentation: 4 spaces before zoneStrings,
	    # 8 spaces before the "meta: entries.
	    sed -i '/^    zoneStrings/, /^        "meta:/ {
	      /^    zoneStrings/ p
	      /^        "meta:/ p
	      d
	    }' -- "$i"
	  done
	}

	# Keep only duration and compound in units* sections.
	#
	# Rewrites every ${dataroot}/unit/*.txt in place: inside the units,
	# unitsNarrow and unitsShort blocks only the "duration" and "compound"
	# sub-blocks survive; lines outside those blocks are left untouched.
	function filter_unit_data {
	  local i
	  # Quote the directory part so a path with whitespace is not word-split;
	  # only the trailing *.txt is meant to glob.
	  for i in "${dataroot}"/unit/*.txt
	  do
	    # An unmatched glob stays literal; there is nothing to rewrite then.
	    [ -e "$i" ] || continue
	    echo "Overwriting $i ..."
	    # The regexes match on exact indentation (4 spaces for the units*
	    # blocks, 8 for their sub-blocks) and use "|" as ERE alternation.
	    sed -r -i \
	      '/^    units(|Narrow|Short)\{$/, /^    \}$/ {
	         /^    units(|Narrow|Short)\{$/ p
	         /^        (duration|compound)\{$/, /^        \}$/ p
	         /^    \}$/ p
	         d
	       }' -- "${i}"
	  done
	}

	# big5han and gb2312han collation do not make any sense and nobody uses them.
	# The unihan block is removed along with them.
	#
	# Rewrites ${dataroot}/coll/zh.txt in place, deleting each
	# "unihan{", "big5han{" and "gb2312han{" block at 8-space indentation up to
	# its closing brace.
	function remove_legacy_chinese_codepoint_collation {
	  local target="${dataroot}/coll/zh.txt"
	  echo "Removing Big5 / GB2312 / UniHan collation data from Chinese locale"
	  echo "Overwriting ${target}"
	  # "|" is ERE alternation here; the path is quoted so that a data root
	  # containing whitespace is passed as one argument.
	  sed -r -i '/^        (uni|big5|gb2312)han\{$/,/^        \}$/ d' -- "${target}"
	}

	# Locate the ICU tree relative to this script: the parent of the directory
	# the script was invoked from.
	treeroot="$(dirname "$0")/.."
	scriptdir="${treeroot}/scripts"
	dataroot="${treeroot}/source/data"
	langdatapath="${dataroot}/lang"
	localedatapath="${dataroot}/locales"



	# Run the trimming passes one after another, in this fixed order.
	for trim_step in \
	    filter_display_language_names \
	    abridge_locale_data_for_non_ui_languages \
	    filter_currency_data \
	    filter_region_data \
	    remove_legacy_chinese_codepoint_collation \
	    filter_unit_data
	do
	  "${trim_step}"
	done

	# Chromium OS needs exemplar cities for timezones, but not Chromium.
	# It'll save 400kB (uncompressed), but the size difference in
	# 7z compressed installer is <= 100kB.
	# TODO(jshin): Make separate data files for CrOS and Chromium.
	#remove_exemplar_cities