| # © 2016 and later: Unicode, Inc. and others. |
| # License & terms of use: http://www.unicode.org/copyright.html |
| # Generated using tools/cldr/cldr-to-icu/build-icu-data.xml |
| # |
| # File: Grek_Latn.txt |
| # Generated from CLDR |
| # |
| |
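| # Rules are predicated on running NFD first and NFC afterwards: see the ":: NFD (NFC)" step before the rules and the "::NFC (NFD)" step after them (the parenthesised form is the one used in the reverse direction) |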
| # :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ; |
| # MINIMAL FILTER GENERATED FOR: Greek-Latin |
| :: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ; |
| :: NFD (NFC) ; |
| # TEST CASES |
| # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος |
| # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ |
| # ᾳ ῃ ῳ ὃ ὄ |
| # ὠς ὡς ὢς ὣς |
| # Ὠς Ὡς Ὢς Ὣς |
| # ὨΣ ὩΣ ὪΣ ὫΣ |
| # Ạ, ạ, Ẹ, ẹ, Ọ, ọ |
| # Useful variables: character sets and combining marks referenced by the rules below ($underbar and $under are the same mark, \u0331) |
| $lower = [[:latin:][:greek:] & [:Ll:]]; |
| $glower = [[:greek:] & [:Ll:]]; |
| $upper = [[:latin:][:greek:] & [:Lu:]] ; |
| $accent = [:M:] ; |
| # NOTE: $accentMinus is restricted to the marks in \u0300-\u0345 (minus \u0338), i.e. just the Greek & Latin accents that we care about |
| # TODO: broaden out once interation is fixed |
| $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; |
| $macron = \u0304 ; |
| $ddot = \u0308 ; |
| $ddotmac = [$ddot$macron]; |
| $lcgvowel = [αεηιουω] ; |
| $ucgvowel = [ΑΕΗΙΟΥΩ] ; |
| $gvowel = [$lcgvowel $ucgvowel] ; |
| $lcgvowelC = [$lcgvowel $accent] ; |
| $evowel = [aeiouyAEIOUY]; |
| $evowel2 = [iuyIUY]; |
| $vowel = [ $evowel $gvowel] ; |
| $gammaLike = [ΓΚΞΧγκξχϰ] ; |
| $egammaLike = [GKXCgkxc] ; |
| $smooth = \u0313 ; |
| $rough = \u0314 ; |
| $iotasub = \u0345 ; |
| $evowel_i = [$evowel-[iI]] ; |
| $evowel2_i = [uyUY]; |
| $underbar = \u0331; |
| $afterLetter = [:L:] [[:M:]\']* ; |
| $beforeLetter = [[:M:]\']* [:L:] ; |
| $beforeLower = $accent * $lower ; |
| $notLetter = [^[:L:][:M:]] ; |
| $under = \u0331; |
| # Fix punctuation: the Greek question mark (written as a semicolon) becomes ? and the middle dot becomes a colon |
| # preserve original: a literal colon or question mark already in the text is tagged with $under so the reverse direction can restore it |
| \: ↔ \: $under ; |
| \? ↔ \? $under ; |
| \; ↔ \? ; |
| · ↔ \: ; |
| # CIRCUMFLEX: convert the Greek circumflex (\u0342) to the normal combining circumflex (\u0302). Could use tilde or inverted breve |
| \u0342 ↔ \u0302 ; |
| # IOTA: convert iota subscript ($iotasub) to a full letter i |
| # first make the preceding alpha long: add a macron and leave the cursor before the alpha so it is processed again ($accent_minus excludes the macron, so the rule cannot fire twice on the same alpha) |
| $accent_minus = [[$accent]-[$iotasub$macron]]; |
| Α } $accent_minus * $iotasub → | Α $macron ; |
| α } $accent_minus * $iotasub → | α $macron ; |
| # now convert to uppercase I if after an uppercase letter, otherwise to lowercase i |
| $upper $accent * { $iotasub → I ; |
| $iotasub → i ; |
| | $1 $iotasub ← ($evowel $macron $accentMinus *) i ; |
| | $1 $iotasub ← ($evowel $macron $accentMinus *) I ; |
| # BREATHING |
| # Convert rough breathing to h, and move before letters. NOTE(review): below, the optional $ddot is allowed after Ι and Ω but not after Υ; confirm that Ω (rather than Υ) is intended |
| # Titlecase: a capital vowel with rough breathing, when a lowercase letter follows, becomes H plus the lowercased vowel (A ` x → H a x) |
| Α ($macron?) $rough } $beforeLower → H | α $1; |
| Ε $rough } $beforeLower → H | ε; |
| Η $rough } $beforeLower → H | η ; |
| Ι ($ddot?) $rough } $beforeLower → H | ι $1; |
| Ο $rough } $beforeLower → H | ο ; |
| Υ $rough } $beforeLower → H | υ ; |
| Ω ($ddot?) $rough } $beforeLower → H | ω $1; |
| # Same result when the rough breathing follows a second, lowercase Greek letter (A x ` → H a x) |
| Α ($glower $macron?) $rough → H | α $1 ; |
| Ε ($glower) $rough → H | ε $1 ; |
| Η ($glower) $rough → H | η $1 ; |
| Ι ($glower $ddot?) $rough → H | ι $1 ; |
| Ο ($glower) $rough → H | ο $1 ; |
| Υ ($glower) $rough → H | υ $1 ; |
| Ω ($glower $ddot?) $rough → H | ω $1 ; |
| # Otherwise, make x ` into h x and X ` into H X (the all-lowercase rule is listed first, so H is only produced when an uppercase vowel is involved) |
| ($lcgvowel + $ddotmac? ) $rough → h | $1 ; |
| ($gvowel + $ddotmac? ) $rough → H | $1 ; |
| # Go backwards with H (Latin → Greek): drop the h/H and put a rough breathing after the following vowel or vowel pair; the longer patterns are listed first |
| | $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; |
| | $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; |
| | $1 $rough ← h ($evowel $macron? $ddot?) ; |
| | $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; |
| | $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ; |
| | $1 $rough ← H ([AEIOUY] $macron? $ddot?) ; |
| # titlecase (H followed by a lowercase vowel): the vowel has to be uppercased by hand, one rule per vowel |
| # in the future, we should add &uppercase() to make this easier |
| | A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ; |
| | E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ; |
| | I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ; |
| | O $1 $rough ← H o ($macron $ddot? $evowel2_i $macron?) ; |
| | U $1 $rough ← H u ($macron $ddot? $evowel2_i $macron?) ; |
| | Y $1 $rough ← H y ($macron $ddot? $evowel2_i $macron?) ; |
| | A $1 $rough ← H a ($ddot? $evowel2 $macron?) ; |
| | E $1 $rough ← H e ($ddot? $evowel2 $macron?) ; |
| | I $1 $rough ← H i ($ddot? $evowel2 $macron?) ; |
| | O $1 $rough ← H o ($ddot? $evowel2 $macron?) ; |
| | U $1 $rough ← H u ($ddot? $evowel2 $macron?) ; |
| | Y $1 $rough ← H y ($ddot? $evowel2 $macron?) ; |
| | A $1 $rough ← H a ($macron? $ddot? ) ; |
| | E $1 $rough ← H e ($macron? $ddot? ) ; |
| | I $1 $rough ← H i ($macron? $ddot? ) ; |
| | O $1 $rough ← H o ($macron? $ddot? ) ; |
| | U $1 $rough ← H u ($macron? $ddot? ) ; |
| | Y $1 $rough ← H y ($macron? $ddot? ) ; |
| # Now do smooth |
| # delete smooth breathing when going to Latin |
| $smooth → ; |
| # insert it when going to Greek: after an r/R (not followed by h/H), a vowel pair, or a single vowel, when that is preceded by $notLetter and not already followed by a breathing |
| # the assumption is that all Marks are on letters. |
| | $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; |
| | $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; |
| | $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; |
| # TODO: preserve smooth/rough breathing if not |
| # on initial vowel sequence |
| # need to have these up here so the shorter rules in the NORMAL section don't mask them |
| # remove now superfluous macron when returning (Latin → Greek): A/a followed by a macron becomes plain alpha |
| Α ← A $macron ; |
| α ← a $macron ; |
| η ↔ e $macron ; |
| Η ↔ E $macron ; |
| φ ↔ ph ; |
| Ψ } $beforeLower ↔ Ps ; |
| Ψ ↔ PS ; |
| Φ } $beforeLower ↔ Ph ; |
| Φ ↔ PH ; |
| ψ ↔ ps ; |
| ω ↔ o $macron ; |
| Ω ↔ O $macron; |
| # NORMAL: letter-by-letter mappings, plus the digraphs (th, rh, ch) and the context-dependent forms of gamma, nu, sigma and upsilon |
| α ↔ a ; |
| Α ↔ A ; |
| β ↔ b ; |
| Β ↔ B ; |
| γ } $gammaLike ↔ n } $egammaLike ; |
| γ ↔ g ; |
| Γ } $gammaLike ↔ N } $egammaLike ; |
| Γ ↔ G ; |
| δ ↔ d ; |
| Δ ↔ D ; |
| ε ↔ e ; |
| Ε ↔ E ; |
| ζ ↔ z ; |
| Ζ ↔ Z ; |
| θ ↔ th ; |
| Θ } $beforeLower ↔ Th ; |
| Θ ↔ TH ; |
| ι ↔ i ; |
| Ι ↔ I ; |
| κ ↔ k ; |
| Κ ↔ K ; |
| λ ↔ l ; |
| Λ ↔ L ; |
| μ ↔ m ; |
| Μ ↔ M ; |
| ν } $gammaLike → n\' ; |
| ν ↔ n ; |
| Ν } $gammaLike ↔ N\' ; |
| Ν ↔ N ; |
| ξ ↔ x ; |
| Ξ ↔ X ; |
| ο ↔ o ; |
| Ο ↔ O ; |
| π ↔ p ; |
| Π ↔ P ; |
| ρ $rough ↔ rh; |
| Ρ $rough } $beforeLower ↔ Rh ; |
| Ρ $rough ↔ RH ; |
| ρ ↔ r ; |
| Ρ ↔ R ; |
| # insert a separator (an apostrophe) between an already converted P/p and anything that turns into s; presumably this keeps the result distinct from the ps produced for psi |
| [Pp] { } [ςσΣϷϸϺϻ] → \' ; |
| # special S variants (sho and san), written as S/s with \u030C or \u0302 |
| Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L |
| ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L |
| Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L |
| ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L |
| # underbar means exception: $underbar marks a final-form sigma found in non-final position, or a non-final sigma found in final position |
| # before a letter = initial or medial position |
| ς } $beforeLetter ↔ s $underbar } $beforeLetter; |
| σ } $beforeLetter ↔ s } $beforeLetter; |
| # otherwise, after a letter = final |
| $afterLetter { σ ↔ $afterLetter { s $underbar; |
| $afterLetter { ς ↔ $afterLetter { s ; |
| # otherwise (isolated) = initial |
| ς ↔ s $underbar; |
| σ ↔ s ; |
| # [Pp] { Σ ↔ \'S ; |
| Σ ↔ S ; |
| τ ↔ t ; |
| Τ ↔ T ; |
| $vowel {υ } ↔ u ; |
| υ ↔ y ; |
| $vowel { Υ ↔ U ; |
| Υ ↔ Y ; |
| χ ↔ ch ; |
| Χ } $beforeLower ↔ Ch ; |
| Χ ↔ CH ; |
| # Completeness for ASCII (Latin → Greek): c, f, j, q, v, w and their capitals are first rewritten to Latin letters that have a Greek mapping, with the cursor left before the replacement so it is converted in turn; the final rules map a remaining H/h to or from $rough |
| $ignore = [[:Mark:]''] * ; |
| | k ← c ; |
| | ph ← f ; |
| | i ← j ; |
| | k ← q ; |
| | b ← v } $vowel ; |
| | b ← w } $vowel; |
| | u ← v ; |
| | u ← w; |
| | K ← C ; |
| | Ph ← F ; |
| | I ← J ; |
| | K ← Q ; |
| | B ← V } $vowel ; |
| | B ← W } $vowel ; |
| | U ← V ; |
| | U ← W ; |
| $rough } $ignore [:UppercaseLetter:] → H ; |
| $ignore [:UppercaseLetter:] { $rough → H ; |
| $rough ← H ; |
| $rough ↔ h ; |
| # Completeness for Greek (Greek → Latin only): symbol and variant forms are replaced by the regular letter with the cursor left before it, so the rules above then convert it; two entries map straight to a Latin letter (j and i) |
| ϐ → | β ; |
| ϑ → | θ ; |
| ϒ → | Υ ; |
| ϕ → | φ ; |
| ϖ → | π ; |
| ϰ → | κ ; |
| ϱ → | ρ ; |
| ϲ → | σ ; |
| Ϲ → | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL |
| ϳ → j ; |
| ϴ → | Θ ; |
| ϵ → | ε ; |
| µ → | μ ; |
| ͺ → i; |
| # (Latin → Greek) delete any trailing ' marks used for roundtripping: after a Greek pi when S/s follows, and after a Greek nu when a letter in $egammaLike follows |
| ← [Ππ] { \' } [Ss] ; |
| ← [Νν] { \' } $egammaLike ; |
| ::NFC (NFD) ; |
| # ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; |
| # ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ; |
| # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD |
| :: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ; |
| |