| # Copyright (c) 2012-2015 International Business Machines |
| # Corporation and others. All Rights Reserved. |
| # |
| # This file should be in UTF-8 with a signature byte sequence ("BOM"). |
| # |
| # collationtest.txt: Collation test data. |
| # |
| # created on: 2012apr13 |
| # created by: Markus W. Scherer |
| |
| # A line with "** test: description" is used for verbose and error output. |
| |
| # A collator can be set with "@ root" or "@ locale language-tag", |
| # for example "@ locale de-u-co-phonebk". |
| # An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". |
| |
| # A collator can be built with "@ rules". |
| # An "@ rules" line is followed by one or more lines with the tailoring rules. |
| |
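| # A collator can be modified with "% attribute=value", |
| # or with "% reorder" followed by reorder codes or "default", |
| # for example "% reorder Hani Zzzz digit" or "% reorder default". |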
| |
| # "* compare" tests the order (= or <) of the following strings. |
| # The relation can be "=" or "<" (the level of the difference is not specified) |
| # or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference; |
| # "<i" is a difference on the identical level). |
| |
| # Test sections ("* compare") are terminated by |
| # definitions of new collators, changing attributes, or new test sections. |
| |
| ** test: simple CEs & expansions |
| # Many types of mappings are tested elsewhere, including via the UCA conformance tests. |
| # Here we mostly cover a few unusual mappings. |
| @ rules |
| &\x01 # most control codes are ignorable |
| <<<\u0300 # tertiary CE |
| &9<\x00 # NUL not ignorable |
| &\uA00A\uA00B=\uA002 # two long-primary CEs |
| &\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits |
| |
| * compare |
| = \x01 |
| = \x02 |
| <3 \u0300 |
| <1 9 |
| <1 \x00 |
| = \x01\x00\x02 |
| <1 a |
| <3 a\u0300 |
| <2 a\u0308 |
| = ä |
| <1 b |
| <1 か # Hiragana Ka (U+304B) |
| <2 か\u3099 # plus voiced sound mark |
| = が # Hiragana Ga (U+304C) |
| <1 \uA00A\uA00B |
| = \uA002 |
| <1 \uA00A\uA00B\u00050004 |
| <1 \uA00A\uA00B\u00050005 |
| = \uA003 |
| <1 \uA00A\uA00B\u00050006 |
| |
| ** test: contractions |
| # Create some interesting mappings, and map some normalization-inert characters |
| # (which are not subject to canonical reordering) |
| # to some of the same CEs to check the sequence of CEs. |
| @ rules |
| |
| # Contractions starting with 'a' should not continue with any character < U+0300 |
| # so that we can test a shortcut for that. |
| &a=ⓐ |
| &b<bz=ⓑ |
| &d<dz\u0301=ⓓ # d+z+acute |
| &z |
| <a\u0301=Ⓐ # a+acute sorts after z |
| <a\u0301\u0301=Ⓑ # a+acute+acute |
| <a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right |
| <a\u030a=Ⓓ # a+ring |
| <a\u0323=Ⓔ # a+dot below |
| <a\u0323\u0358=Ⓕ # a+dot below+dot above right |
| <a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring |
| <a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z |
| |
| &\U0001D158=⁰ # musical notehead black (has a symbol primary) |
| <\U0001D158\U0001D165=¼ # musical quarter note |
| |
| # deliberately missing prefix contractions: |
| # dz |
| # a\u0327 |
| # a\u0327\u0323 |
| # a\u0327\u0323b |
| |
| &\x01 |
| <<<\U0001D165=¹ # musical stem (ccc=216) |
| <<<\U0001D16D=² # musical augmentation dot (ccc=226) |
| <<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226) |
| &\u0301=❶ # acute (ccc=230) |
| &\u030a=❷ # ring (ccc=230) |
| &\u0308=❸ # diaeresis (ccc=230) |
| <<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230) |
| &\u0327=❺ # cedilla (ccc=202) |
| &\u0323=❻ # dot below (ccc=220) |
| &\u0331=❼ # macron below (ccc=220) |
| <<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232) |
| &\u0334=❾ # tilde overlay (ccc=1) |
| &\u0358=❿ # dot above right (ccc=232) |
| |
| &\u0f71=① # tibetan vowel sign aa |
| &\u0f72=② # tibetan vowel sign i |
| # \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73 |
| &\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129) |
| |
| ** test: simple contractions |
| |
| # Some strings are chosen to cause incremental contiguous contraction matching to |
| # go into partial matches for prefixes of contractions |
| # (where the prefixes are deliberately not also contractions). |
| # When there is no complete match, then the matching code must back out of those |
| # so that discontiguous contractions work as specified. |
| |
| * compare |
| # contraction starter with no following text, or mismatch, or blocked |
| <1 a |
| = ⓐ |
| <1 aa |
| = ⓐⓐ |
| <1 ab |
| = ⓐb |
| <1 az |
| = ⓐz |
| |
| * compare |
| <1 a |
| <2 a\u0308\u030a # ring blocked by diaeresis |
| = ⓐ❸❷ |
| <2 a\u0327 |
| = ⓐ❺ |
| |
| * compare |
| <2 \u0308 |
| = ❸ |
| <2 \u0308\u030a\u0301 # acute blocked by ring |
| = ❸❷❶ |
| |
| * compare |
| <1 \U0001D158 |
| = ⁰ |
| <1 \U0001D158\U0001D165 |
| = ¼ |
| |
| # no discontiguous contraction because of missing prefix contraction d+z, |
| # and a starter ('z') after the 'd' |
| * compare |
| <1 dz\u0323\u0301 |
| = dz❻❶ |
| |
| # contiguous contractions |
| * compare |
| <1 abz |
| = ⓐⓑ |
| <1 abzz |
| = ⓐⓑz |
| |
| * compare |
| <1 a |
| <1 z |
| <1 a\u0301 |
| = Ⓐ |
| <1 a\u0301\u0301 |
| = Ⓑ |
| <1 a\u0301\u0301\u0358 |
| = Ⓒ |
| <1 a\u030a |
| = Ⓓ |
| <1 a\u0323\u0358 |
| = Ⓕ |
| <1 a\u0327\u0323\u030a # match despite missing prefix |
| = Ⓖ |
| <1 a\u0327\u0323bz |
| = Ⓗ |
| |
| * compare |
| <2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second |
| = ❸❹ |
| |
| * compare |
| <1 \U0001D158\U0001D165 |
| = ¼ |
| |
| * compare |
| <3 \U0001D165\U0001D16D |
| = ³ |
| |
| ** test: discontiguous contractions |
| * compare |
| <1 a\u0327\u030a # a+ring skips cedilla |
| = Ⓓ❺ |
| <2 a\u0327\u0327\u030a # a+ring skips 2 cedillas |
| = Ⓓ❺❺ |
| <2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas |
| = Ⓓ❺❺❺ |
| <2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas |
| = Ⓓ❾❺❺ |
| <1 a\u0327\u0323 # a+dot below skips cedilla |
| = Ⓔ❺ |
| <1 a\u0323\u0301\u0358 # a+dot below+dot ab.r.: 2-char match, then skips acute |
| = Ⓕ❶ |
| <2 a\u0334\u0323\u0358 # a+dot below skips tilde overlay |
| = Ⓕ❾ |
| |
| * compare |
| <2 \u0331\u0331\u0358 # macron below+dot ab.r. skips the second macron below |
| = ❽❼ |
| |
| * compare |
| <1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron) |
| = Ⓓ❺❼❻ |
| <1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla |
| = Ⓔ❺²❷ |
| <2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas |
| = Ⓔ❺❺❷ |
| <2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla |
| = Ⓔ❺❻❷ |
| <2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla |
| = Ⓔ❾❺❷ |
| |
| * compare |
| <1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla |
| = ¼❺ |
| <1 a\U0001D165\u0323 # a+dot below skips stem |
| = Ⓔ¹ |
| |
| # partial contiguous match, backs up, matches discontiguous contraction |
| <1 a\u0327\u0323b |
| = Ⓔ❺b |
| <1 a\u0327\u0323ba |
| = Ⓔ❺bⓐ |
| |
| # a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks |
| * compare |
| <1 a\u0327\u0301\u0301\u0358 |
| = Ⓒ❺ |
| |
| # FCD but not NFD |
| * compare |
| <1 a\u0f73\u0301 # a+acute skips tibetan ii |
| = Ⓐ③ |
| |
| # FCD but the 0f71 inside the 0f73 must be skipped |
| # to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73 |
| * compare |
| <1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72 |
| = ③① |
| |
| ** test: discontiguous contractions with nested contractions |
| * compare |
| <1 a\u0323\u0308\u0301\u0358 |
| = Ⓕ❹ |
| <2 a\u0323\u0308\u0301\u0308\u0301\u0358 |
| = Ⓕ❹❹ |
| |
| ** test: discontiguous contractions with interleaved contractions |
| * compare |
| # a+ring & cedilla & macron below+dot above right |
| <1 a\u0327\u0331\u030a\u0358 |
| = Ⓓ❺❽ |
| |
| # a+ring & 1x..3x macron below+dot above right |
| <2 a\u0331\u030a\u0358 |
| = Ⓓ❽ |
| <2 a\u0331\u0331\u030a\u0358\u0358 |
| = Ⓓ❽❽ |
| # also skips acute |
| <2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358 |
| = Ⓓ❽❽❽❶ |
| |
| # a+dot below & stem+augmentation dot, followed by contiguous d+z+acute |
| <1 a\U0001D165\u0323\U0001D16Ddz\u0301 |
| = Ⓔ³ⓓ |
| |
| ** test: some simple string comparisons |
| @ root |
| * compare |
| # first string compares against "" |
| = \u0000 |
| < a |
| <1 b |
| <3 B |
| = \u0000B\u0000 |
| |
| ** test: compare with strength=primary |
| % strength=primary |
| * compare |
| <1 a |
| <1 b |
| = B |
| |
| ** test: compare with strength=secondary |
| % strength=secondary |
| * compare |
| <1 a |
| <1 b |
| = B |
| |
| ** test: compare with strength=tertiary |
| % strength=tertiary |
| * compare |
| <1 a |
| <1 b |
| <3 B |
| |
| ** test: compare with strength=quaternary |
| % strength=quaternary |
| * compare |
| <1 a |
| <1 b |
| <3 B |
| |
| ** test: compare with strength=identical |
| % strength=identical |
| * compare |
| <1 a |
| <1 b |
| <3 B |
| |
| ** test: côté with forwards secondary |
| @ root |
| * compare |
| <1 cote |
| <2 coté |
| <2 côte |
| <2 côté |
| |
| ** test: côté with forwards secondary vs. U+FFFE merge separator |
| # Merged sort keys: On each level, any difference in the first segment |
| # must trump any further difference. |
| * compare |
| <1 cote\uFFFEcôté |
| <2 coté\uFFFEcôte |
| <2 côte\uFFFEcoté |
| <2 côté\uFFFEcote |
| |
| ** test: côté with backwards secondary |
| % backwards=on |
| * compare |
| <1 cote |
| <2 côte |
| <2 coté |
| <2 côté |
| |
| ** test: côté with backwards secondary vs. U+FFFE merge separator |
| # Merged sort keys: On each level, any difference in the first segment |
| # must trump any further difference. |
| * compare |
| <1 cote\uFFFEcôté |
| <2 côte\uFFFEcoté |
| <2 coté\uFFFEcôte |
| <2 côté\uFFFEcote |
| |
| ** test: U+FFFE on identical level |
| @ root |
| % strength=identical |
| * compare |
| # All of these control codes are completely-ignorable, so that |
| # their low code points are compared with the merge separator. |
| # The merge separator must compare less than any other character. |
| <1 \uFFFE\u0001\u0002\u0003 |
| <i \u0001\uFFFE\u0002\u0003 |
| <i \u0001\u0002\uFFFE\u0003 |
| <i \u0001\u0002\u0003\uFFFE |
| |
| * compare |
| # The merge separator must even compare less than U+0000. |
| <1 \uFFFE\u0000\u0000 |
| <i \u0000\uFFFE\u0000 |
| <i \u0000\u0000\uFFFE |
| |
| ** test: Hani < surrogates < U+FFFD |
| # Note: compareUTF8() treats unpaired surrogates like U+FFFD, |
| # so with that the strings with surrogates will compare equal to each other |
| # and equal to the string with U+FFFD. |
| @ root |
| % strength=identical |
| * compare |
| <1 abz |
| <1 a\u4e00z |
| <1 a\U00020000z |
| <1 a\ud800z |
| <1 a\udbffz |
| <1 a\udc00z |
| <1 a\udfffz |
| <1 a\ufffdz |
| |
| ** test: script reordering |
| @ root |
| % reorder Hani Zzzz digit |
| * compare |
| <1 ? |
| <1 + |
| <1 丂 |
| <1 a |
| <1 α |
| <1 5 |
| |
| % reorder default |
| * compare |
| <1 ? |
| <1 + |
| <1 5 |
| <1 a |
| <1 α |
| <1 丂 |
| |
| ** test: empty rules |
| @ rules |
| * compare |
| <1 a |
| <2 ä |
| <3 Ä |
| <1 b |
| |
| ** test: very simple rules |
| @ rules |
| &a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z |
| % strength=quaternary |
| * compare |
| <1 a |
| = e |
| <4 q |
| <4 r |
| <1 x |
| <3 X |
| <2 y |
| <3 Y |
| <2 z |
| <3 Z |
| |
| ** test: tailoring twice before a root position: primary |
| @ rules |
| &[before 1]b<p |
| &[before 1]b<q |
| * compare |
| <1 a |
| <1 p |
| <1 q |
| <1 b |
| |
| ** test: tailoring twice before a root position: secondary |
| @ rules |
| &[before 2]ſ<<p |
| &[before 2]ſ<<q |
| * compare |
| <1 s |
| <2 p |
| <2 q |
| <2 ſ |
| |
| # secondary-before common weight |
| @ rules |
| &[before 2]b<<p |
| &[before 2]b<<q |
| * compare |
| <1 a |
| <1 p |
| <2 q |
| <2 b |
| |
| ** test: tailoring twice before a root position: tertiary |
| @ rules |
| &[before 3]B<<<p |
| &[before 3]B<<<q |
| * compare |
| <1 b |
| <3 p |
| <3 q |
| <3 B |
| |
| # tertiary-before common weight |
| @ rules |
| &[before 3]b<<<p |
| &[before 3]b<<<q |
| * compare |
| <1 a |
| <1 p |
| <3 q |
| <3 b |
| |
| @ rules |
| &[before 2]b<<s |
| &[before 3]s<<<p |
| &[before 3]s<<<q |
| * compare |
| <1 a |
| <1 p |
| <3 q |
| <3 s |
| <2 b |
| |
| ** test: tailor after completely ignorable |
| @ rules |
| &\x00<<<x<<y |
| * compare |
| = \x00 |
| = \x1F |
| <3 x |
| <2 y |
| |
| ** test: secondary tailoring gaps, ICU ticket 9362 |
| @ rules |
| &[before 2]s<<'_' |
| &s<<r # secondary between s and ſ (long s) |
| &ſ<<*a-q # more than 15 between ſ and secondary CE boundary |
| &[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE |
| &[last primary ignorable]<<y<<z |
| |
| * compare |
| <2 u |
| <2 v |
| <2 \u0332 # lowest secondary CE |
| <2 \u0308 |
| <2 y |
| <2 z |
| <1 s_ |
| <2 ss |
| <2 sr |
| <2 sſ |
| <2 sa |
| <2 sb |
| <2 sp |
| <2 sq |
| <2 sus |
| <2 svs |
| <2 rs |
| |
| ** test: tertiary tailoring gaps, ICU ticket 9362 |
| @ rules |
| &[before 3]t<<<'_' |
| &t<<<r # tertiary between t and fullwidth t |
| &ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary |
| &[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE |
| &[last secondary ignorable]<<<y<<<z |
| |
| * compare |
| <3 u |
| <3 v |
| # Note: The root collator currently does not map any characters to tertiary CEs. |
| <3 y |
| <3 z |
| <1 t_ |
| <3 tt |
| <3 tr |
| <3 t\uFF54 # t + fullwidth t (U+FF54) |
| <3 tᵀ |
| <3 ta |
| <3 tb |
| <3 tp |
| <3 tq |
| <3 tut |
| <3 tvt |
| <3 rt |
| |
| ** test: secondary & tertiary around root character |
| @ rules |
| &[before 2]m<<r |
| &m<<s |
| &[before 3]m<<<u |
| &m<<<v |
| * compare |
| <1 l |
| <1 r |
| <2 u |
| <3 m |
| <3 v |
| <2 s |
| <1 n |
| |
| ** test: secondary & tertiary around tailored item |
| @ rules |
| &m<x |
| &[before 2]x<<r |
| &x<<s |
| &[before 3]x<<<u |
| &x<<<v |
| * compare |
| <1 m |
| <1 r |
| <2 u |
| <3 x |
| <3 v |
| <2 s |
| <1 n |
| |
| ** test: more nesting of secondary & tertiary before |
| @ rules |
| &[before 3]m<<<u |
| &[before 2]m<<r |
| &[before 3]r<<<q |
| &m<<<w |
| &m<<t |
| &[before 3]w<<<v |
| &w<<<x |
| &w<<s |
| * compare |
| <1 l |
| <1 q |
| <3 r |
| <2 u |
| <3 m |
| <3 v |
| <3 w |
| <3 x |
| <2 s |
| <2 t |
| <1 n |
| |
| ** test: case bits |
| @ rules |
| &w<x # tailored CE getting case bits |
| =uv=uV=Uv=UV # 2 chars -> 1 CE |
| &ae=ch=cH=Ch=CH # 2 chars -> 2 CEs |
| &rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs |
| % caseFirst=lower |
| * compare |
| <1 ae |
| = ch |
| <3 cH |
| <3 Ch |
| <3 CH |
| <1 rst |
| = yz |
| <3 yZ |
| <3 Yz |
| <3 YZ |
| <1 w |
| <1 x |
| = uv |
| <3 uV |
| = Uv # mixed case on single CE cannot distinguish variations |
| <3 UV |
| |
| ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower |
| @ rules |
| &\u0001<<<t<<<T # tertiary CEs |
| % caseFirst=lower |
| * compare |
| <1 aa |
| <3 aat |
| <3 aaT |
| <3 aA |
| <3 aAt |
| <3 ata |
| <3 aTa |
| |
| ** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper |
| % caseFirst=upper |
| * compare |
| <1 aA |
| <3 aAt |
| <3 aa |
| <3 aat |
| <3 aaT |
| <3 ata |
| <3 aTa |
| |
| ** test: reset on expansion, ICU tickets 9415 & 9593 |
| @ rules |
| &æ<x # tailor the last primary CE so that x sorts between ae and af |
| &æb=bæ # copy all reset CEs to make bæ sort the same |
| &각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂 |
| &⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference |
| &l·=z # handle the pre-context for · when fetching reset CEs |
| <<u # copy/tailor 2 CEs |
| |
| * compare |
| <1 ae |
| <2 æ |
| <1 x |
| <1 af |
| |
| * compare |
| <1 aeb |
| <2 æb |
| = bæ |
| |
| * compare |
| <1 각 |
| <1 h |
| <1 갂 |
| <1 갃 |
| |
| * compare |
| <1 · # by itself: primary CE |
| <1 l |
| <2 l· # l+middle dot has only a secondary difference from l |
| = z |
| <2 u |
| |
| * compare |
| <1 (13) |
| <3 ⒀ # DUCET sets special tertiary weights in all CEs |
| <2 y |
| <1 (13[ |
| |
| % alternate=shifted |
| * compare |
| <1 (13) |
| = 13 |
| <3 ⒀ |
| = y # alternate=shifted removes the tailoring difference on the last CE |
| <1 14 |
| |
| ** test: contraction inside extension, ICU ticket 9378 |
| @ rules |
| &а<<х/й # all letters are Cyrillic |
| * compare |
| <1 ай |
| <2 х |
| |
| ** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104 |
| @ rules |
| &t<x &ᵀ<y # same primary weights |
| &q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent |
| * compare |
| <1 q |
| <1 u |
| <1 v |
| <1 ꝗ |
| <1 t |
| <3 ᵀ |
| <1 y |
| <1 x |
| |
| # Principle: Each rule builds on the state of preceding rules and ignores following rules. |
| |
| ** test: later rule does not affect earlier reset position, ICU ticket 10105 |
| @ rules |
| &a < u < v < w &ov < x &b < v |
| * compare |
| <1 oa |
| <1 ou |
| <1 x # CE(o) followed by CE between u and w |
| <1 ow |
| <1 ob |
| <1 ov |
| |
| ** test: later rule does not affect earlier extension (1), ICU ticket 10105 |
| @ rules |
| &a=x/b &v=b |
| % strength=secondary |
| * compare |
| <1 B |
| <1 c |
| <1 v |
| = b |
| * compare |
| <1 AB |
| = x |
| <1 ac |
| <1 av |
| = ab |
| |
| ** test: later rule does not affect earlier extension (2), ICU ticket 10105 |
| @ rules |
| &a <<< c / e &g <<< e / l |
| % strength=secondary |
| * compare |
| <1 AE |
| = c |
| <2 æ |
| <1 agl |
| = ae |
| |
| ** test: later rule does not affect earlier extension (3), ICU ticket 10105 |
| @ rules |
| &a = b / c &d = c / e |
| % strength=secondary |
| * compare |
| <1 AC # C is still only tertiary different from the original c |
| = b |
| <1 ade |
| = ac |
| |
| ** test: extension contains tailored character, ICU ticket 10105 |
| @ rules |
| &a=e &b=u/e |
| * compare |
| <1 a |
| = e |
| <1 ba |
| = be |
| = u |
| |
| ** test: add simple mappings for characters with root context |
| @ rules |
| &z=· # middle dot has a prefix mapping in the CLDR root |
| &n=и # и (U+0438) has contractions in the root |
| * compare |
| <1 l |
| <2 l· # root mapping for l|· still works |
| <1 z |
| = · |
| * compare |
| <1 n |
| = и |
| <1 И |
| <1 и\u0306 # root mapping for й=и\u0306 still works |
| = й |
| <3 Й |
| |
| ** test: add context mappings around characters with root context |
| @ rules |
| &z=·h # middle dot has a prefix mapping in the CLDR root |
| &n=ә|и # и (U+0438) has contractions in the root |
| * compare |
| <1 l |
| <2 l· # root mapping for l|· still works |
| <1 z |
| = ·h |
| * compare |
| <1 и |
| <3 И |
| <1 и\u0306 # root mapping for й=и\u0306 still works |
| = й |
| * compare |
| <1 әn |
| = әи |
| <1 әo |
| |
| ** test: many secondary CEs at the top of their range |
| @ rules |
| &[last primary ignorable]<<*\u2801-\u28ff |
| * compare |
| <2 \u0308 |
| <2 \u2801 |
| <2 \u2802 |
| <2 \u2803 |
| <2 \u2804 |
| <2 \u28fd |
| <2 \u28fe |
| <2 \u28ff |
| <1 \x20 |
| |
| ** test: many tertiary CEs at the top of their range |
| @ rules |
| &[last secondary ignorable]<<<*a-z |
| * compare |
| <3 a |
| <3 b |
| <3 c |
| <3 d |
| # e..w |
| <3 x |
| <3 y |
| <3 z |
| <2 \u0308 |
| |
| ** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101 |
| @ rules |
| &a=p|x &b=px &c=op |
| * compare |
| <1 b |
| = px |
| <3 B |
| <1 c |
| = op |
| <3 C |
| * compare |
| <1 ca |
| = opx # first contraction op, then prefix p|x |
| <3 cA |
| <3 Ca |
| |
| ** test: reset position with prefix (pre-context), ICU ticket 10102 |
| @ rules |
| &a=p|x &px=y |
| * compare |
| <1 pa |
| = px |
| = y |
| <3 pA |
| <1 q |
| <1 x |
| |
| ** test: prefix+contraction together (1), ICU ticket 10071 |
| @ rules |
| &x=a|bc |
| * compare |
| <1 ab |
| <1 Abc |
| <1 abd |
| <1 ac |
| <1 aw |
| <1 ax |
| = abc |
| <3 aX |
| <3 Ax |
| <1 b |
| <1 bb |
| <1 bc |
| <3 bC |
| <3 Bc |
| <1 bd |
| |
| ** test: prefix+contraction together (2), ICU ticket 10071 |
| @ rules |
| &w=bc &x=a|b |
| * compare |
| <1 w |
| = bc |
| <3 W |
| * compare |
| <1 aw |
| <1 ax |
| = ab |
| <3 aX |
| <1 axb |
| <1 axc |
| = abc # prefix match a|b takes precedence over contraction match bc |
| <3 abC |
| <1 abd |
| <1 ay |
| |
| ** test: prefix+contraction together (3), ICU ticket 10071 |
| @ rules |
| &x=a|b &w=bc # same rules as in the previous test but in reverse order; the order should not matter here |
| * compare # same "compare" sequences as previous test |
| <1 w |
| = bc |
| <3 W |
| * compare |
| <1 aw |
| <1 ax |
| = ab |
| <3 aX |
| <1 axb |
| <1 axc |
| = abc # prefix match a|b takes precedence over contraction match bc |
| <3 abC |
| <1 abd |
| <1 ay |
| |
| ** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962 |
| @ rules |
| &d=ch &v=p|ci |
| * compare |
| <1 pc |
| <3 pC |
| <1 pcH |
| <1 pcI |
| <1 pd |
| = pch # no-prefix contraction ch matches |
| <3 pD |
| <1 pv |
| = pci # prefix+contraction p|ci matches |
| <3 pV |
| |
| ** test: tailor in & around compact ranges of root primaries |
| # The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs |
| # which should be reliably encoded as one range in the root elements data. |
| @ rules |
| &[before 1]ᚁ<a |
| &ᚁ<b |
| &[before 1]ᚂ<c |
| &ᚂ<d |
| &[before 1]ᚚ<y |
| &ᚚ<z |
| &[before 2]ᚁ<<r |
| &ᚁ<<s |
| &[before 3]ᚚ<<<t |
| &ᚚ<<<u |
| * compare |
| <1 ᣵ # U+18F5 last Canadian Aboriginal |
| <1 a |
| <1 r |
| <2 ᚁ |
| <2 s |
| <1 b |
| <1 c |
| <1 ᚂ |
| <1 d |
| <1 ᚃ |
| <1 ᚙ |
| <1 y |
| <1 t |
| <3 ᚚ |
| <3 u |
| <1 z |
| <1 ᚠ # U+16A0 first Runic |
| |
| ** test: suppressContractions |
| @ rules |
| &z<ch<әж [suppressContractions [·cә]] |
| * compare |
| <1 ch |
| <3 cH # ch was suppressed |
| <1 l |
| <1 l· # primary difference, not secondary, because l|· was suppressed |
| <1 ә |
| <2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed |
| <1 әж |
| <3 әЖ |
| |
| ** test: Hangul & Jamo |
| @ rules |
| &L=\u1100 # first Jamo L |
| &V=\u1161 # first Jamo V |
| &T=\u11A8 # first Jamo T |
| &\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs |
| * compare |
| <1 Lv |
| <3 LV |
| = \u1100\u1161 |
| = \uAC00 |
| <1 LVt |
| <3 LVT |
| = \u1100\u1161\u11A8 |
| = \uAC00\u11A8 |
| = \uAC01 |
| <2 LVT\u0308 |
| <2 \u4E00 |
| <2 \u4E01 |
| <2 \u4E80 |
| <2 \u4EFF |
| <2 LV\u0308T |
| <1 \uAC02 |
| |
| ** test: adjust special reset positions according to previous rules, CLDR ticket 6070 |
| @ rules |
| &[last variable]<x |
| [maxVariable space] # has effect only after building, no effect on following rules |
| &[last variable]<y |
| &[before 1][first regular]<z |
| * compare |
| <1 ? # some punctuation |
| <1 x |
| <1 y |
| <1 z |
| <1 $ # some symbol |
| |
| @ rules |
| &[last primary ignorable]<<x<<<y |
| &[last primary ignorable]<<z |
| * compare |
| <2 \u0358 |
| <2 x |
| <3 y |
| <2 z |
| <1 \x20 |
| |
| @ rules |
| &[last secondary ignorable]<<<x |
| &[last secondary ignorable]<<<y |
| * compare |
| <3 x |
| <3 y |
| <2 \u0358 |
| |
| @ rules |
| &[before 2][first variable]<<z |
| &[before 2][first variable]<<y |
| &[before 3][first variable]<<<x |
| &[before 3][first variable]<<<w |
| &[before 1][first variable]<v |
| &[before 2][first variable]<<u |
| &[before 3][first variable]<<<t |
| &[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary |
| * compare |
| <2 \u0358 |
| <1 s |
| <2 \uFDD1\xA0 |
| <1 t |
| <3 u |
| <2 v |
| <1 w |
| <3 x |
| <3 y |
| <2 z |
| <2 \t |
| |
| @ rules |
| &[before 2][first regular]<<z |
| &[before 3][first regular]<<<y |
| &[before 1][first regular]<x |
| &[before 3][first regular]<<<w |
| &[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary |
| &[before 3][first regular]<<<u |
| &[before 1][first regular]<p # primary before the boundary: becomes variable |
| &[before 3][first regular]<<<t # not affected by p |
| &[last variable]<q # after p! |
| * compare |
| <1 ? |
| <1 p |
| <1 q |
| <1 t |
| <3 u |
| <3 v |
| <1 w |
| <3 x |
| <1 y |
| <3 z |
| <1 $ |
| |
| # check that p & q are indeed variable |
| % alternate=shifted |
| * compare |
| = ? |
| = p |
| = q |
| <1 t |
| <3 u |
| <3 v |
| <1 w |
| <3 x |
| <1 y |
| <3 z |
| <1 $ |
| |
| @ rules |
| &[before 2][first trailing]<<z |
| &[before 1][first trailing]<y |
| &[before 3][first trailing]<<<x |
| * compare |
| <1 \u4E00 # first Han, first implicit |
| <1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary |
| # Note: The root collator currently does not map any characters to the trailing first boundary primary. |
| <1 x |
| <3 y |
| <1 z |
| <2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary. |
| |
| @ rules |
| &[before 2][first primary ignorable]<<z |
| &[before 2][first primary ignorable]<<y |
| &[before 3][first primary ignorable]<<<x |
| &[before 3][first primary ignorable]<<<w |
| * compare |
| = \x01 |
| <2 w |
| <3 x |
| <3 y |
| <2 z |
| <2 \u0301 |
| |
| @ rules |
| &[before 3][first secondary ignorable]<<<y |
| &[before 3][first secondary ignorable]<<<x |
| * compare |
| = \x01 |
| <3 x |
| <3 y |
| <2 \u0301 |
| |
| ** test: canonical closure |
| @ rules |
| &X=A &U=Â |
| * compare |
| <1 U |
| = Â |
| = A\u0302 |
| <2 Ú # U with acute |
| = U\u0301 |
| = Ấ # A with circumflex & acute |
| = Â\u0301 |
| = A\u0302\u0301 |
| <1 X |
| = A |
| <2 X\u030A # with ring above |
| = Å |
| = A\u030A |
| = \u212B # Angstrom sign |
| |
| @ rules |
| &x=\u5140\u55C0 |
| * compare |
| <1 x |
| = \u5140\u55C0 |
| = \u5140\uFA0D |
| = \uFA0C\u55C0 |
| = \uFA0C\uFA0D # CJK compatibility characters |
| <3 X |
| |
| # canonical closure on prefix rules, ICU ticket 9444 |
| @ rules |
| &x=ä|ŝ |
| * compare |
| <1 äs # not tailored |
| <1 äx |
| = äŝ |
| = a\u0308s\u0302 |
| = a\u0308ŝ |
| = äs\u0302 |
| <3 äX |
| |
| ** test: conjoining Jamo map to expansions |
| @ rules |
| &gg=\u1101 # Jamo Lead consonant GG |
| &nj=\u11AC # Jamo Trail consonant NJ |
| * compare |
| <1 gg\u1161nj |
| = \u1101\u1161\u11AC |
| = \uAE4C\u11AC |
| = \uAE51 |
| <3 gg\u1161nJ |
| <1 \u1100\u1100 |
| |
| ** test: canonical tail closure, ICU ticket 5913 |
| @ rules |
| &a<â |
| * compare |
| <1 a |
| <1 â # tailored |
| = a\u0302 |
| <2 a\u0323\u0302 # discontiguous contraction |
| = ạ\u0302 # equivalent |
| = ậ # equivalent |
| <1 b |
| |
| @ rules |
| &a<ạ |
| * compare |
| <1 a |
| <1 ạ # tailored |
| = a\u0323 |
| <2 a\u0323\u0302 # contiguous contraction plus extra diacritic |
| = ạ\u0302 # equivalent |
| = ậ # equivalent |
| <1 b |
| |
| # Tail closure should work even if there is a prefix and/or contraction. |
| @ rules |
| &a<\u5140|câ |
| # In order to find discontiguous contractions for \u5140|câ |
| # there must exist a mapping for \u5140|ca, regardless of what it maps to. |
| # (This follows from the UCA spec.) |
| &x=\u5140|ca |
| * compare |
| <1 \u5140a |
| = \uFA0Ca |
| <1 \u5140câ # tailored |
| = \uFA0Ccâ |
| = \u5140ca\u0302 |
| = \uFA0Cca\u0302 |
| <2 \u5140ca\u0323\u0302 # discontiguous contraction |
| = \uFA0Cca\u0323\u0302 |
| = \u5140cạ\u0302 |
| = \uFA0Ccạ\u0302 |
| = \u5140cậ |
| = \uFA0Ccậ |
| <1 \u5140b |
| = \uFA0Cb |
| <1 \u5140x |
| = \u5140ca |
| |
| # Double-check that without the extra mapping there will be no discontiguous match. |
| @ rules |
| &a<\u5140|câ |
| * compare |
| <1 \u5140a |
| = \uFA0Ca |
| <1 \u5140câ # tailored |
| = \uFA0Ccâ |
| = \u5140ca\u0302 |
| = \uFA0Cca\u0302 |
| <1 \u5140b |
| = \uFA0Cb |
| <1 \u5140ca\u0323\u0302 # no discontiguous contraction |
| = \uFA0Cca\u0323\u0302 |
| = \u5140cạ\u0302 |
| = \uFA0Ccạ\u0302 |
| = \u5140cậ |
| = \uFA0Ccậ |
| |
| @ rules |
| &a<cạ |
| * compare |
| <1 a |
| <1 cạ # tailored |
| = ca\u0323 |
| <2 ca\u0323\u0302 # contiguous contraction plus extra diacritic |
| = cạ\u0302 # equivalent |
| = cậ # equivalent |
| <1 b |
| |
| # ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI |
| # = 03C9 0313 0300 0345 |
| # ccc = 0, 230, 230, 240 |
| @ rules |
| &δ=αῳ |
| # In order to find discontiguous contractions for αῳ |
| # there must exist a mapping for αω, regardless of what it maps to. |
| # (This follows from the UCA spec.) |
| &ε=αω |
| * compare |
| <1 δ |
| = αῳ |
| = αω\u0345 |
| <2 αω\u0313\u0300\u0345 # discontiguous contraction |
| = αὠ\u0300\u0345 |
| = αὢ\u0345 |
| = αᾢ |
| <2 αω\u0300\u0313\u0345 |
| = αὼ\u0313\u0345 |
| = αῲ\u0313 # not FCD |
| <1 ε |
| = αω |
| |
| # Double-check that without the extra mapping there will be no discontiguous match. |
| @ rules |
| &δ=αῳ |
| * compare |
| <1 αω\u0313\u0300\u0345 # no discontiguous contraction |
| = αὠ\u0300\u0345 |
| = αὢ\u0345 |
| = αᾢ |
| <2 αω\u0300\u0313\u0345 |
| = αὼ\u0313\u0345 |
| = αῲ\u0313 # not FCD |
| <1 δ |
| = αῳ |
| = αω\u0345 |
| |
| # Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232. |
| # Tests code paths where the tailored string has a combining mark |
| # that does not occur in any composite's decomposition. |
| @ rules |
| &δ=αὼ\u0315 |
| * compare |
| <1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above. |
| = αὠ\u0300\u0315 |
| = αὢ\u0315 |
| <1 δ |
| = αὼ\u0315 |
| = αω\u0300\u0315 |
| <2 αω\u0300\u0315\u0345 |
| = αὼ\u0315\u0345 |
| = αῲ\u0315 # not FCD |
| |
| ** test: danish a+a vs. a-umlaut, ICU ticket 9319 |
| @ rules |
| &z<aa |
| * compare |
| <1 z |
| <1 aa |
| <2 aa\u0308 |
| = aä |
| |
| ** test: Jamo L with and in prefix |
| # Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L). |
| @ rules |
| # Jamo Lead consonant G after G or GG |
| &[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100 |
| # Jamo Lead consonant GG sorts like G+G |
| &\u1100\u1100=\u1101 |
| # Note: Making G|GG and GG|GG sort the same as G|G+G |
| # would require the ability to reset on G|G+G, |
| # or we could make G-after-G equal to some secondary-CE character, |
| # and reset on a pair of those. |
| # (It does not matter much if there are at most two G in a row in real text.) |
| * compare |
| <1 \u1100 |
| <2 \u1100\u1100 # only one primary from a sequence of G lead consonants |
| = \u1101 |
| <2 \u1100\u1100\u1100 |
| = \u1101\u1100 |
| # but not = \u1100\u1101, see above |
| <1 \u1100\u1161 |
| = \uAC00 |
| <2 \u1100\u1100\u1161 |
| = \u1100\uAC00 # prefix match from the L of the LV syllable |
| = \u1101\u1161 |
| = \uAE4C |
| |
| ** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546 |
| @ rules |
| # Low secondary CEs for Jamo V & T. |
| # Note: T should sort before V for proper syllable order. |
| &\u0332 # COMBINING LOW LINE (first primary ignorable) |
| <<\u1161<<\u1162 |
| |
| # Korean Jamo lead consonant search rules, part 2: |
| # Make modern compound L jamo primary equivalent to non-compound forms. |
| |
| # Secondary CEs for Jamo L-after-L, greater than Jamo V & T. |
| &\u0313 # COMBINING COMMA ABOVE (second primary ignorable) |
| =\u1100|\u1100 |
| =\u1103|\u1103 |
| =\u1107|\u1107 |
| =\u1109|\u1109 |
| =\u110C|\u110C |
| |
| # Compound L Jamo map to equivalent expansions of primary+secondary CE. |
| &\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK |
| &\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT |
| &\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP |
| &\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS |
| &\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC |
| |
| * compare |
| <1 \u1100\u1161 |
| = \uAC00 |
| <2 \u1100\u1162 |
| = \uAC1C |
| <2 \u1100\u1100\u1161 |
| = \u1100\uAC00 |
| = \u1101\u1161 |
| = \uAE4C |
| <3 \u3132\u1161 |
| |
| ** test: Hangul syllables in prefix & in the interior of a contraction |
| @ rules |
| &x=\u1100\u1161|a\u1102\u1162z |
| * compare |
| <1 \u1100\u1161x |
| = \u1100\u1161a\u1102\u1162z |
| = \u1100\u1161a\uB0B4z |
| = \uAC00a\u1102\u1162z |
| = \uAC00a\uB0B4z |
| |
| ** test: digits are unsafe-backwards when numeric=on |
| @ root |
| % numeric=on |
| * compare |
| # If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a". |
| # We need to back up before the identical prefix "1" and compare the full numbers. |
| <1 11b |
| <1 101a |
| |
| ** test: simple locale data test |
| @ locale de |
| * compare |
| <1 a |
| <2 ä |
| <1 ae |
| <2 æ |
| |
| @ locale de-u-co-phonebk |
| * compare |
| <1 a |
| <1 ae |
| <2 ä |
| <2 æ |
| |
| # The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt. |
| |
| ** test: DataDrivenCollationTest/TestMorePinyin |
| # Testing the primary strength. |
| @ locale zh |
| % strength=primary |
| * compare |
| < lā |
| = lĀ |
| = Lā |
| = LĀ |
| < lān |
| = lĀn |
| < lē |
| = lĒ |
| = Lē |
| = LĒ |
| < lēn |
| = lĒn |
| |
| ** test: DataDrivenCollationTest/TestLithuanian |
| # Lithuanian sort order. |
| @ locale lt |
| * compare |
| < cz |
| < č |
| < d |
| < iz |
| < j |
| < sz |
| < š |
| < t |
| < zz |
| < ž |
| |
| ** test: DataDrivenCollationTest/TestLatvian |
| # Latvian sort order. |
| @ locale lv |
| * compare |
| < cz |
| < č |
| < d |
| < gz |
| < ģ |
| < h |
| < iz |
| < j |
| < kz |
| < ķ |
| < l |
| < lz |
| < ļ |
| < m |
| < nz |
| < ņ |
| < o |
| < rz |
| < ŗ |
| < s |
| < sz |
| < š |
| < t |
| < zz |
| < ž |
| |
| ** test: DataDrivenCollationTest/TestEstonian |
| # Estonian sort order. |
| @ locale et |
| * compare |
| < sy |
| < š |
| < šy |
| < z |
| < zy |
| < ž |
| < v |
| < va |
| < w |
| < õ |
| < õy |
| < ä |
| < äy |
| < ö |
| < öy |
| < ü |
| < üy |
| < x |
| |
| ** test: DataDrivenCollationTest/TestAlbanian |
| # Albanian sort order. |
| @ locale sq |
| * compare |
| < cz |
| < ç |
| < d |
| < dz |
| < dh |
| < e |
| < ez |
| < ë |
| < f |
| < gz |
| < gj |
| < h |
| < lz |
| < ll |
| < m |
| < nz |
| < nj |
| < o |
| < rz |
| < rr |
| < s |
| < sz |
| < sh |
| < t |
| < tz |
| < th |
| < u |
| < xz |
| < xh |
| < y |
| < zz |
| < zh |
| |
| ** test: DataDrivenCollationTest/TestSimplifiedChineseOrder |
| # Sorted file has different order. |
| @ root |
| # normalization=on turned on & off automatically. |
| * compare |
| < \u5F20 |
| < \u5F20\u4E00\u8E3F |
| |
| ** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash |
| # This pretty much crashes. |
| @ root |
| * compare |
| < \u0f71\u0f72\u0f80\u0f71\u0f72 |
| < \u0f80 |
| |
| ** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems |
| # These are examples of strings that caused trouble in partial sort key testing. |
| @ locale th-TH |
| * compare |
| < \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C |
| < \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18 |
| * compare |
| < \u0E01\u0E07\u0E01\u0E32\u0E23 |
| < \u0E01\u0E07\u0E42\u0E01\u0E49 |
| * compare |
| < \u0E01\u0E23\u0E19\u0E17\u0E32 |
| < \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32 |
| * compare |
| < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27 |
| < \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27 |
| * compare |
| < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D |
| < \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32 |
| |
| ** test: DataDrivenCollationTest/TestJavaStyleRule |
| # java.text allows rules to start with a relation, as in '<<<x<<<y...'. |
| # We emulate this by assuming a &[first tertiary ignorable] in this case. |
| @ rules |
| &\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b |
| * compare |
| = a |
| = equal |
| < z |
| < x |
| = b # x had become the new first primary ignorable |
| < w |
| |
| ** test: DataDrivenCollationTest/TestShiftedIgnorable |
| # The UCA states that primary ignorables should be completely |
| # ignorable when following a shifted code point. |
| @ root |
| % alternate=shifted |
| % strength=quaternary |
| * compare |
| < a\u0020b |
| = a\u0020\u0300b |
| = a\u0020\u0301b |
| < a_b |
| = a_\u0300b |
| = a_\u0301b |
| < A\u0020b |
| = A\u0020\u0300b |
| = A\u0020\u0301b |
| < A_b |
| = A_\u0300b |
| = A_\u0301b |
| < a\u0301b |
| < A\u0301b |
| < a\u0300b |
| < A\u0300b |
| |
| ** test: DataDrivenCollationTest/TestNShiftedIgnorable |
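| # The UCA states that primary ignorables should be completely |
| # ignorable when following a shifted code point. |
| # Here alternate=non-ignorable, so nothing is shifted and the combining marks |
| # that follow a space or underscore must still make a difference. |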
| @ root |
| % alternate=non-ignorable |
| % strength=tertiary |
| * compare |
| < a\u0020b |
| < A\u0020b |
| < a\u0020\u0301b |
| < A\u0020\u0301b |
| < a\u0020\u0300b |
| < A\u0020\u0300b |
| < a_b |
| < A_b |
| < a_\u0301b |
| < A_\u0301b |
| < a_\u0300b |
| < A_\u0300b |
| < a\u0301b |
| < A\u0301b |
| < a\u0300b |
| < A\u0300b |
| |
| ** test: DataDrivenCollationTest/TestSafeSurrogates |
| # It turned out that surrogates were not skipped properly |
| # when iterating backwards if they were in the middle of a |
| # contraction. This test ensures that this is fixed. |
| @ rules |
| &a < x\ud800\udc00b |
| * compare |
| < a |
| < x\ud800\udc00b |
| |
| ** test: DataDrivenCollationTest/da_TestPrimary |
| # This test goes through primary strength cases |
| @ locale da |
| % strength=primary |
| * compare |
| < Lvi |
| < Lwi |
| * compare |
| < L\u00e4vi |
| < L\u00f6wi |
| * compare |
| < L\u00fcbeck |
| = Lybeck |
| |
| ** test: DataDrivenCollationTest/da_TestTertiary |
| # This test goes through tertiary strength cases |
| @ locale da |
| % strength=tertiary |
| * compare |
| < Luc |
| < luck |
| * compare |
| < luck |
| < L\u00fcbeck |
| * compare |
| < lybeck |
| < L\u00fcbeck |
| * compare |
| < L\u00e4vi |
| < L\u00f6we |
| * compare |
| < L\u00f6ww |
| < mast |
| |
| * compare |
| < A/S |
| < ANDRE |
| < ANDR\u00c9 |
| < ANDREAS |
| < AS |
| < CA |
| < \u00c7A |
| < CB |
| < \u00c7C |
| < D.S.B. |
| < DA |
| < \u00d0A |
| < DB |
| < \u00d0C |
| < DSB |
| < DSC |
| < EKSTRA_ARBEJDE |
| < EKSTRABUD0 |
| < H\u00d8ST |
| < HAAG |
| < H\u00c5NDBOG |
| < HAANDV\u00c6RKSBANKEN |
| < Karl |
| < karl |
| < NIELS\u0020J\u00d8RGEN |
| < NIELS-J\u00d8RGEN |
| < NIELSEN |
| < R\u00c9E,\u0020A |
| < REE,\u0020B |
| < R\u00c9E,\u0020L |
| < REE,\u0020V |
| < SCHYTT,\u0020B |
| < SCHYTT,\u0020H |
| < SCH\u00dcTT,\u0020H |
| < SCHYTT,\u0020L |
| < SCH\u00dcTT,\u0020M |
| < SS |
| < \u00df |
| < SSA |
| < STORE\u0020VILDMOSE |
| < STOREK\u00c6R0 |
| < STORM\u0020PETERSEN |
| < STORMLY |
| < THORVALD |
| < THORVARDUR |
| < \u00feORVAR\u00d0UR |
| < THYGESEN |
| < VESTERG\u00c5RD,\u0020A |
| < VESTERGAARD,\u0020A |
| < VESTERG\u00c5RD,\u0020B |
| < \u00c6BLE |
| < \u00c4BLE |
| < \u00d8BERG |
| < \u00d6BERG |
| |
| * compare |
| < andere |
| < chaque |
| < chemin |
| < cote |
| < cot\u00e9 |
| < c\u00f4te |
| < c\u00f4t\u00e9 |
| < \u010du\u010d\u0113t |
| < Czech |
| < hi\u0161a |
| < irdisch |
| < lie |
| < lire |
| < llama |
| < l\u00f5ug |
| < l\u00f2za |
| < lu\u010d |
| < luck |
| < L\u00fcbeck |
| < lye |
| < l\u00e4vi |
| < L\u00f6wen |
| < m\u00e0\u0161ta |
| < m\u00eer |
| < myndig |
| < M\u00e4nner |
| < m\u00f6chten |
| < pi\u00f1a |
| < pint |
| < pylon |
| < \u0161\u00e0ran |
| < savoir |
| < \u0160erb\u016bra |
| < Sietla |
| < \u015blub |
| < subtle |
| < symbol |
| < s\u00e4mtlich |
| < verkehrt |
| < vox |
| < v\u00e4ga |
| < waffle |
| < wood |
| < yen |
| < yuan |
| < yucca |
| < \u017eal |
| < \u017eena |
| < \u017den\u0113va |
| < zoo0 |
| < Zviedrija |
| < Z\u00fcrich |
| < zysk0 |
| < \u00e4ndere |
| |
| ** test: DataDrivenCollationTest/hi_TestNewRules |
| # This test goes through new rules and tests against old rules |
| @ locale hi |
| * compare |
| < कॐ |
| < कं |
| < कँ |
| < कः |
| |
| ** test: DataDrivenCollationTest/ro_TestNewRules |
| # This test goes through new rules and tests against old rules |
| # Romanian sort order: ă and â sort (in that order) between a and b, î between i and j, |
| # s-with-comma/cedilla between s and t, and t-with-comma/cedilla between t and u. |
| # The comma-below and cedilla variants of s and t compare equal ("="). |
| # Each tailored letter is tested in lowercase and uppercase, after x and X, and with a trailing x. |
| @ locale ro |
| * compare |
| < xAx |
| < xă |
| < xĂ |
| < Xă |
| < XĂ |
| < xăx |
| < xĂx |
| < xâ |
| < xÂ |
| < Xâ |
| < XÂ |
| < xâx |
| < xÂx |
| < xb |
| < xIx |
| < xî |
| < xÎ |
| < Xî |
| < XÎ |
| < xîx |
| < xÎx |
| < xj |
| < xSx |
| < xș |
| = xş |
| < xȘ |
| = xŞ |
| < Xș |
| = Xş |
| < XȘ |
| = XŞ |
| < xșx |
| = xşx |
| < xȘx |
| = xŞx |
| < xT |
| < xTx |
| < xț |
| = xţ |
| < xȚ |
| = xŢ |
| < Xț |
| = Xţ |
| < XȚ |
| = XŢ |
| < xțx |
| = xţx |
| < xȚx |
| = xŢx |
| < xU |
| |
| ** test: DataDrivenCollationTest/testOffsets |
| # This tests cases where forwards and backwards iteration get different offsets |
| @ locale en |
| % strength=tertiary |
| * compare |
| < a\uD800\uDC00\uDC00 |
| < b\uD800\uDC00\uDC00 |
| * compare |
| < \u0301A\u0301\u0301 |
| < \u0301B\u0301\u0301 |
| * compare |
| < abcd\r\u0301 |
| < abce\r\u0301 |
| # TODO: test offsets in new CollationTest |
| |
| # End of test cases moved here from ICU 52's DataDrivenCollationTest.txt. |
| |
| ** test: was ICU 52 cmsccoll/TestRedundantRules |
| @ rules |
| & a < b < c < d& [before 1] c < m |
| * compare |
| <1 a |
| <1 b |
| <1 m |
| <1 c |
| <1 d |
| |
| @ rules |
| & a < b <<< c << d <<< e& [before 3] e <<< x |
| * compare |
| <1 a |
| <1 b |
| <3 c |
| <2 d |
| <3 x |
| <3 e |
| |
| @ rules |
| & a < b <<< c << d <<< e <<< f < g& [before 1] g < x |
| * compare |
| <1 a |
| <1 b |
| <3 c |
| <2 d |
| <3 e |
| <3 f |
| <1 x |
| <1 g |
| |
| @ rules |
| & a <<< b << c < d& a < m |
| * compare |
| <1 a |
| <3 b |
| <2 c |
| <1 m |
| <1 d |
| |
| @ rules |
| &a<b<<b\u0301 &z<b |
| * compare |
| <1 a |
| <1 b\u0301 |
| <1 z |
| <1 b |
| |
| @ rules |
| &z<m<<<q<<<m |
| * compare |
| <1 z |
| <1 q |
| <3 m |
| |
| @ rules |
| &z<<<m<q<<<m |
| * compare |
| <1 z |
| <1 q |
| <3 m |
| |
| @ rules |
| & a < b < c < d& r < c |
| * compare |
| <1 a |
| <1 b |
| <1 d |
| <1 r |
| <1 c |
| |
| @ rules |
| & a < b < c < d& c < m |
| * compare |
| <1 a |
| <1 b |
| <1 c |
| <1 m |
| <1 d |
| |
| @ rules |
| & a < b < c < d& a < m |
| * compare |
| <1 a |
| <1 m |
| <1 b |
| <1 c |
| <1 d |
| |
| ** test: was ICU 52 cmsccoll/TestExpansionSyntax |
| # The following two rules should sort the particular list of strings the same. |
| @ rules |
| &AE <<< a << b <<< c &d <<< f |
| * compare |
| <1 AE |
| <3 a |
| <2 b |
| <3 c |
| <1 d |
| <3 f |
| |
| @ rules |
| &A <<< a / E << b / E <<< c /E &d <<< f |
| * compare |
| <1 AE |
| <3 a |
| <2 b |
| <3 c |
| <1 d |
| <3 f |
| |
| # The following two rules should sort the particular list of strings the same. |
| @ rules |
| &AE <<< a <<< b << c << d < e < f <<< g |
| * compare |
| <1 AE |
| <3 a |
| <3 b |
| <2 c |
| <2 d |
| <1 e |
| <1 f |
| <3 g |
| |
| @ rules |
| &A <<< a / E <<< b / E << c / E << d / E < e < f <<< g |
| * compare |
| <1 AE |
| <3 a |
| <3 b |
| <2 c |
| <2 d |
| <1 e |
| <1 f |
| <3 g |
| |
| # The following two rules should sort the particular list of strings the same. |
| @ rules |
| &AE <<< B <<< C / D <<< F |
| * compare |
| <1 AE |
| <3 B |
| <3 F |
| <1 AED |
| <3 C |
| |
| @ rules |
| &A <<< B / E <<< C / ED <<< F / E |
| * compare |
| <1 AE |
| <3 B |
| <3 F |
| <1 AED |
| <3 C |
| |
| ** test: never reorder trailing primaries |
| @ root |
| % reorder Zzzz Grek |
| * compare |
| <1 L |
| <1 字 |
| <1 Ω |
| <1 \uFFFD |
| <1 \uFFFF |
| |
| ** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes |
| @ rules |
| &u=ab|cd |
| &v=b|ce |
| * compare |
| <1 abc |
| <1 abcc |
| <1 abcf |
| <1 abcd |
| = abu |
| <1 abce |
| = abv |
| |
| # With the following rules, there is only one prefix per composite ĉ or ç, |
| # but both prefixes apply to just c in NFD form. |
| # We would get different results for composed vs. NFD input |
| # if we fell back directly from longest-prefix mappings to no-prefix mappings. |
| @ rules |
| &x=op|ĉ |
| &y=p|ç |
| * compare |
| <1 opc |
| <2 opć |
| <1 opcz |
| <1 opd |
| <1 opĉ |
| = opc\u0302 |
| = opx |
| <1 opç |
| = opc\u0327 |
| = opy |
| |
| # The mapping that is used is the one with the longest matching prefix for which |
| # there is also a suffix match; among several suffix matches for that prefix, the longest one wins. |
| @ rules |
| &❶=d |
| &❷=de |
| &❸=def |
| &①=c|d |
| &②=c|de |
| &③=c|def |
| &④=bc|d |
| &⑤=bc|de |
| &⑥=bc|def |
| &⑦=abc|d |
| &⑧=abc|de |
| &⑨=abc|def |
| * compare |
| <1 9aadzz |
| = 9aa❶zz |
| <1 9aadez |
| = 9aa❷z |
| <1 9aadef |
| = 9aa❸ |
| <1 9acdzz |
| = 9ac①zz |
| <1 9acdez |
| = 9ac②z |
| <1 9acdef |
| = 9ac③ |
| <1 9bcdzz |
| = 9bc④zz |
| <1 9bcdez |
| = 9bc⑤z |
| <1 9bcdef |
| = 9bc⑥ |
| <1 abcdzz |
| = abc⑦zz |
| <1 abcdez |
| = abc⑧z |
| <1 abcdef |
| = abc⑨ |
| |
| ** test: prefix + discontiguous contraction with missing prefix contraction |
| # Unfortunate terminology: The first "prefix" here is the pre-context, |
| # the second "prefix" refers to the contraction/relation string that is |
| # one shorter than the one being tested. |
| @ rules |
| &x=p|e |
| &y=p|ê |
| &z=op|ê |
| # No mapping for op|e: |
| # Discontiguous contraction matching should not match op|ê in opệ |
| # because it would have to skip the dot below and extend a match on op|e by the circumflex, |
| # but there is no match on op|e. |
| * compare |
| <1 oPe |
| <1 ope |
| = opx |
| <1 opệ |
| = opy\u0323 # y not z |
| <1 opê |
| = opz |
| |
| # We cannot test for fallback by whether the contraction default CE32 |
| # is for another contraction. With the following rules, there is no mapping for op|e, |
| # and the fallback to prefix p has no contractions. |
| @ rules |
| &x=p|e |
| &z=op|ê |
| * compare |
| <1 oPe |
| <1 ope |
| = opx |
| <2 opệ |
| = opx\u0323\u0302 # x not z |
| <1 opê |
| = opz |
| |
| # One more variation: Fallback to the simple code point, no shorter non-empty prefix. |
| @ rules |
| &x=e |
| &z=op|ê |
| * compare |
| <1 ope |
| = opx |
| <3 oPe |
| = oPx |
| <2 opệ |
| = opx\u0323\u0302 # x not z |
| <1 opê |
| = opz |
| |
| ** test: maxVariable via rules |
| @ rules |
| [maxVariable space][alternate shifted] |
| * compare |
| = \u0020 |
| = \u000A |
| <1 . |
| <1 ° # degree sign |
| <1 $ |
| <1 0 |
| |
| ** test: maxVariable via setting |
| @ root |
| % maxVariable=currency |
| % alternate=shifted |
| * compare |
| = \u0020 |
| = \u000A |
| = . |
| = ° # degree sign |
| = $ |
| <1 0 |
| |
| ** test: ICU4J CollationMiscTest/TestContractionClosure (ää) |
| # This tests canonical closure, but it also tests that CollationFastLatin |
| # bails out properly for contractions with combining marks. |
| # For that we need pairs of strings that remain in the Latin fastpath |
| # long enough, hence the extra "= b" lines. |
| @ rules |
| &b=\u00e4\u00e4 |
| * compare |
| <1 b |
| = \u00e4\u00e4 |
| = b |
| = a\u0308a\u0308 |
| = b |
| = \u00e4a\u0308 |
| = b |
| = a\u0308\u00e4 |
| |
| ** test: ICU4J CollationMiscTest/TestContractionClosure (Å) |
| @ rules |
| &b=\u00C5 |
| * compare |
| <1 b |
| = \u00C5 |
| = b |
| = A\u030A |
| = b |
| = \u212B |
| |
| ** test: reset-before on already-tailored characters, ICU ticket 10108 |
| @ rules |
| &a<w<<x &[before 2]x<<y |
| * compare |
| <1 a |
| <1 w |
| <2 y |
| <2 x |
| |
| @ rules |
| &a<<w<<<x &[before 2]x<<y |
| * compare |
| <1 a |
| <2 y |
| <2 w |
| <3 x |
| |
| @ rules |
| &a<w<x &[before 2]x<<y |
| * compare |
| <1 a |
| <1 w |
| <1 y |
| <2 x |
| |
| @ rules |
| &a<w<<<x &[before 2]x<<y |
| * compare |
| <1 a |
| <1 y |
| <2 w |
| <3 x |
| |
| ** test: numeric collation with other settings, ICU ticket 9092 |
| @ root |
| % strength=identical |
| % caseFirst=upper |
| % numeric=on |
| * compare |
| <1 100\u0020a |
| <1 101 |
| |
| ** test: collation type fallback from unsupported type, ICU ticket 10149 |
| @ locale fr-CA-u-co-phonebk |
| # Expect the same result as with fr-CA, using backwards-secondary order. |
| # That is, we should fall back from the unsupported collation type |
| # to the locale's default collation type. |
| * compare |
| <1 cote |
| <2 côte |
| <2 coté |
| <2 côté |
| |
| ** test: @ is equivalent to [backwards 2], ICU ticket 9956 |
| @ rules |
| &b<a @ &v<<w |
| * compare |
| <1 b |
| <1 a |
| <1 cote |
| <2 côte |
| <2 coté |
| <2 côté |
| <1 v |
| <2 w |
| <1 x |
| |
| ** test: shifted+reordering, ICU ticket 9507 |
| @ root |
| % reorder Grek punct space |
| % alternate=shifted |
| % strength=quaternary |
| # Which primaries are "variable" should be determined without script reordering, |
| # and then primaries should be reordered whether they are shifted to quaternary or not. |
| * compare |
| <4 ( # punctuation |
| <4 ) |
| <4 \u0020 # space |
| <1 ` # symbol |
| <1 ^ |
| <1 $ # currency symbol |
| <1 € |
| <1 0 # numbers |
| <1 ε # Greek |
| <1 e # Latin |
| <1 e(e |
| <4 e)e |
| <4 e\u0020e |
| <4 ee |
| <3 e(E |
| <4 e)E |
| <4 e\u0020E |
| <4 eE |
| |
| ** test: "uppercase first" could sort a string before its prefix, ICU ticket 9351 |
| @ rules |
| &\u0001<<<b<<<B |
| % caseFirst=upper |
| * compare |
| <1 aaa |
| <3 aaaB |
| |
| ** test: secondary+case ignores secondary ignorables, ICU ticket 9355 |
| @ rules |
| &\u0001<<<b<<<B |
| % strength=secondary |
| % caseLevel=on |
| * compare |
| <1 a |
| = ab |
| = aB |
| |
| ** test: custom collation rules involving tail of a contraction in Malayalam, ICU ticket 6328 |
| @ rules |
| &[before 2] ൌ << ൗ # U+0D57 << U+0D4C == 0D46+0D57 |
| * compare |
| <1 ൗx |
| <2 ൌx |
| <1 ൗy |
| <2 ൌy |
| |
| ** test: quoted apostrophe in compact syntax, ICU ticket 8204 |
| @ rules |
| &q<<*a''c |
| * compare |
| <1 d |
| <1 p |
| <1 q |
| <2 a |
| <2 \u0027 |
| <2 c |
| <1 r |
| |
| # ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" |
| ** test: locale -u- with collation keywords, ICU ticket 8260 |
| @ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 |
| * compare |
| <4 \u0020 # space is shifted, strength=quaternary |
| <1 ! # punctuation is regular |
| <1 2 |
| <1 12 # numeric sorting |
| <1 B |
| <c b # uppercase first on case level |
| <1 x\u0301\u0308 |
| <2 x\u0308\u0301 # normalization off |
| |
| ** test: locale @ with collation keywords, ICU ticket 8260 |
| @ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted |
| * compare |
| <4 $ # currency symbols are shifted, strength=quaternary |
| <1 àla |
| <2 alà # backwards secondary level |
| |
| ** test: locale -u- with script reordering, ICU ticket 8260 |
| @ locale el-u-kr-kana-SYMBOL-Grek-hani-cyrl-latn-digit-armn-deva-ethi-thai |
| * compare |
| <1 \u0020 |
| <1 あ |
| <1 ☂ |
| <1 Ω |
| <1 丂 |
| <1 ж |
| <1 L |
| <1 4 |
| <1 Ձ |
| <1 अ |
| <1 ሄ |
| <1 ฉ |
| |
| ** test: locale @collation=type should be case-insensitive |
| @ locale de@coLLation=PhoneBook |
| * compare |
| <1 ae |
| <2 ä |
| <3 Ä |
| |
| ** test: import root search rules plus German phonebook rules, ICU ticket 8962 |
| @ locale de-u-co-search |
| * compare |
| <1 = |
| <1 ≠ |
| <1 a |
| <1 ae |
| <2 ä |
| |
| # Once more, but with runtime builder. |
| @ rules |
| [import und-u-co-search][import de-u-co-phonebk] |
| * compare |
| <1 = |
| <1 ≠ |
| <1 a |
| <1 ae |
| <2 ä |
| |
| # Once again, with import from "root" not "und" (as in a proper language tag). |
| @ rules |
| [import root-u-co-search][import de-u-co-phonebk] |
| * compare |
| <1 = |
| <1 ≠ |
| <1 a |
| <1 ae |
| <2 ä |
| |
| ** test: import rules from a language with non-Latin native script, and reset the reordering, ICU ticket 10998 |
| # Greek should sort Greek first. |
| @ rules |
| [import el] |
| * compare |
| <1 4 |
| <1 Ω |
| <1 L |
| |
| # Import Greek, and then reset the reordering. |
| @ rules |
| [import el][reorder Zzzz] |
| * compare |
| <1 4 |
| <1 L |
| <1 Ω |
| |
| # "others" is a synonym for Zzzz. |
| @ rules |
| [import el][reorder others] |
| * compare |
| <1 4 |
| <1 L |
| <1 Ω |
| |
| ** test: regression test for CollationFastLatinBuilder, ICU ticket 11388 |
| @ rules |
| &x<<aa<<<Aa<<<AA |
| % strength=secondary |
| * compare |
| <1 AA |
| <2 Aẩ |
| <2 aą |
| * compare |
| <1 AA |
| <2 aą |
| |
| ** test: tailor tertiary-after a common tertiary where there is a lower one |
| # Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one. |
| # See ICU ticket 11448 & CLDR ticket 7222. |
| @ rules |
| &あ<<<x<<<y<<<z |
| * compare |
| <1 ぁ |
| <3 あ |
| <3 x |
| <3 y |
| <3 z |
| <3 ァ |
| <1 い |
| |
| ** test: tailor tertiary-after a below-common tertiary |
| @ rules |
| &ぁ<<<x<<<y<<<z |
| * compare |
| <1 ぁ |
| <3 x |
| <3 y |
| <3 z |
| <3 あ |
| <3 ァ |
| <1 い |
| |
| ** test: tailor tertiary-before a common tertiary where there is a lower one |
| @ rules |
| &[before 3]あ<<<x<<<y<<<z |
| * compare |
| <1 ぁ |
| <3 x |
| <3 y |
| <3 z |
| <3 あ |
| <3 ァ |
| <1 い |
| |
| ** test: tailor tertiary-before a below-common tertiary |
| @ rules |
| &[before 3]ぁ<<<x<<<y<<<z |
| * compare |
| <1 x |
| <3 y |
| <3 z |
| <3 ぁ |
| <3 あ |
| <3 ァ |
| <1 い |
| |
| ** test: reorder single scripts not groups, ICU ticket 11449 |
| @ root |
| % reorder Goth Latn |
| * compare |
| <1 4 |
| <1 𐌰 # Gothic |
| <1 L |
| <1 Ω |
| # Before ICU 55, the following reordered together with Gothic. |
| <1 𐌈 # Old Italic |
| <1 𐑐 # Shavian |