| /* |
| ******************************************************************************* |
| * Copyright (C) 2013-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * collationdatareader.h |
| * |
| * created on: 2013feb07 |
| * created by: Markus W. Scherer |
| */ |
| |
| #ifndef __COLLATIONDATAREADER_H__ |
| #define __COLLATIONDATAREADER_H__ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/udata.h" |
| |
| struct UDataMemory; |
| |
| U_NAMESPACE_BEGIN |
| |
| struct CollationTailoring; |
| |
| /** |
| * Collation binary data reader. |
| */ |
| struct U_I18N_API CollationDataReader /* all static */ { |
| // The following constants are also copied into source/common/ucol_swp.cpp. |
| // Keep them in sync! |
| enum { |
| /** |
| * Number of int32_t indexes. |
| * |
| * Can be 2 if there are only options. |
| * Can be 7 or 8 if there are only options and a script reordering. |
| * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. |
| */ |
| IX_INDEXES_LENGTH, // 0 |
| /** |
| * Bits 31..24: numericPrimary, for numeric collation |
| * 23..16: fast Latin format version (0 = no fast Latin table) |
| * 15.. 0: options bit set |
| */ |
| IX_OPTIONS, |
| IX_RESERVED2, |
| IX_RESERVED3, |
| |
| /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ |
| IX_JAMO_CE32S_START, // 4 |
| |
| // Byte offsets from the start of the data, after the generic header. |
| // The indexes[] are at byte offset 0, other data follows. |
| // Each data item is aligned properly. |
| // The data items should be in descending order of unit size, |
| // to minimize the need for padding. |
| // Each item's byte length is given by the difference between its offset and |
| // the next index/offset value. |
| /** Byte offset to int32_t reorderCodes[]. */ |
| IX_REORDER_CODES_OFFSET, |
| /** |
| * Byte offset to uint8_t reorderTable[]. |
| * Empty table if <256 bytes (padding only). |
| * Otherwise 256 bytes or more (with padding). |
| */ |
| IX_REORDER_TABLE_OFFSET, |
| /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ |
| IX_TRIE_OFFSET, |
| |
| IX_RESERVED8_OFFSET, // 8 |
| /** Byte offset to int64_t ces[]. */ |
| IX_CES_OFFSET, |
| IX_RESERVED10_OFFSET, |
| /** Byte offset to uint32_t ce32s[]. */ |
| IX_CE32S_OFFSET, |
| |
| /** Byte offset to uint32_t rootElements[]. */ |
| IX_ROOT_ELEMENTS_OFFSET, // 12 |
| /** Byte offset to UChar *contexts[]. */ |
| IX_CONTEXTS_OFFSET, |
| /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ |
| IX_UNSAFE_BWD_OFFSET, |
| /** Byte offset to uint16_t fastLatinTable[]. */ |
| IX_FAST_LATIN_TABLE_OFFSET, |
| |
| /** Byte offset to uint16_t scripts[]. */ |
| IX_SCRIPTS_OFFSET, // 16 |
| /** |
| * Byte offset to UBool compressibleBytes[]. |
| * Empty table if <256 bytes (padding only). |
| * Otherwise 256 bytes or more (with padding). |
| */ |
| IX_COMPRESSIBLE_BYTES_OFFSET, |
| IX_RESERVED18_OFFSET, |
| IX_TOTAL_SIZE |
| }; |
| |
| static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, |
| CollationTailoring &tailoring, UErrorCode &errorCode); |
| |
| static UBool U_CALLCONV |
| isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); |
| |
| private: |
| CollationDataReader(); // no constructor |
| }; |
| |
| /* |
| * Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
| * Format version 5. |
| * |
| * The root collation data is stored in the ucadata.icu file. |
| * Tailorings are stored inside .res resource bundle files, with a complete file header. |
| * |
| * Collation data begins with a standard ICU data file header |
| * (DataHeader, see ucmndata.h and unicode/udata.h). |
| * The UDataInfo.dataVersion field contains the UCA and other version numbers, |
| * see the comments for CollationTailoring.version. |
| * |
| * After the header, the file contains the following parts. |
| * Constants are defined as enum values of the CollationDataReader class. |
| * See also the Collation class. |
| * |
| * int32_t indexes[indexesLength]; |
| * The indexes array has variable length. |
| * Some tailorings only need the length and the options, |
| * others only add reorderCodes and the reorderTable, |
| * some need to store mappings. |
| * Only as many indexes are stored as needed to read all of the data. |
| * |
| * Index 0: indexesLength |
| * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS |
| * Index 2..3: Unused/reserved/0. |
| * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo |
| * are stored in a short, contiguous part of the ce32s array. |
| * |
| * Indexes 5..19 are byte offsets in ascending order. |
| * Each byte offset marks the start of the next part in the data file, |
| * and the end of the previous one. |
| * When two consecutive byte offsets are the same (or too short), |
| * then the corresponding part is empty. |
| * Byte offsets are offsets from after the header, |
| * that is, from the beginning of the indexes[]. |
| * Each part starts at an offset with proper alignment for its data. |
| * If necessary, the previous part may include padding bytes to achieve this alignment. |
| * The last byte offset that is stored in the indexes indicates the total size of the data |
| * (starting with the indexes). |
| * |
| * int32_t reorderCodes[]; -- empty in root |
| * The list of script and reordering codes. |
| * |
| * Beginning with format version 5, this array may optionally |
| * have trailing entries with a full list of reorder ranges |
| * as described for CollationSettings::reorderRanges. |
| * |
| * Script or reorder codes are first and do not exceed 16-bit values. |
| * Range limits are stored in the upper 16 bits, and are never 0. |
| * Split this array into reorder codes and ranges at the first entry |
| * with non-zero upper 16 bits. |
| * |
| * If the ranges are missing but needed for split-reordered primary lead bytes, |
| * then they are regenerated at load time. |
| * |
| * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes |
| * Primary-weight lead byte permutation table. |
| * Normally present when the reorderCodes are, but can be built at load time. |
| * |
| * Beginning with format version 5, a 0 entry at a non-zero index |
| * (which is otherwise an illegal value) |
| * means that the primary lead byte is "split" |
| * (there are different offsets for primaries that share that lead byte) |
| * and the reordering offset must be determined via the reorder ranges |
| * that are either stored as part of the reorderCodes array |
| * or regenerated at load time. |
| * |
| * UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
| * The trie holds the main collation data. Each code point is mapped to a 32-bit value. |
| * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, |
| * in which case it is a special CE32 and contains a 4-bit tag and further data. |
| * See the Collation class for details. |
| * |
| * The trie has a value for each lead surrogate code unit with some bits encoding |
| * collective properties of the 1024 supplementary characters whose UTF-16 form starts with |
| * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. |
| * |
| * int64_t ces[]; |
| * 64-bit CEs and expansions that cannot be stored in a more compact form. |
| * |
| * uint32_t ce32s[]; |
| * CE32s for expansions in compact form, and for characters whose trie values |
| * contain special data. |
| * |
| * uint32_t rootElements[]; -- empty in all tailorings |
| * Compact storage for all of the CEs that occur in the root collation. |
| * See the CollationRootElements class. |
| * |
| * UChar *contexts[]; |
| * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. |
| * |
| * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() |
| * Serialized form of characters that are unsafe when iterating backwards, |
| * and at the end of an identical string prefix. |
| * Back up to a safe character. |
| * Lead surrogates are "unsafe" when any of their corresponding supplementary |
| * code points are unsafe. |
| * Does not include [:^lccc=0:][:^tccc=0:]. |
| * For each tailoring, the root unsafeBackwardSet is subtracted. |
| * (As a result, in many tailorings no set needs to be stored.) |
| * |
| * uint16_t fastLatinTable[]; |
| * Optional optimization for Latin text. |
| * See the CollationFastLatin class. |
| * |
| * uint16_t scripts[]; -- empty in all tailorings |
| * Format version 5: |
| * uint16_t numScripts; |
| * uint16_t scriptsIndex[numScripts+16]; |
| * uint16_t scriptStarts[]; |
| * See CollationData::numScripts etc. |
| * |
| * Format version 4: |
| * Table of the reordering groups with their first and last lead bytes, |
| * and their script and reordering codes. |
| * See CollationData::scripts. |
| * |
| * UBool compressibleBytes[]; -- empty in all tailorings |
| * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. |
| * |
| * ----------------- |
| * Changes for formatVersion 5 (ICU 55) |
| * |
| * Reordering moves single scripts, not groups of scripts. |
| * Reorder ranges are optionally appended to the reorderCodes, |
| * and a 0 entry in the reorderTable indicates a split lead byte. |
| * The scripts data has a new format. |
| * |
| * The rootElements may contain secondary and tertiary weights below common=05. |
| * (Used for small Hiragana letters.) |
| * Where is occurs, there is also an explicit unit with common secondary & tertiary weights. |
| * There are no other data structure changes, but builder code needs to be able to handle such data. |
| * |
| * The collation element for the merge separator code point U+FFFE |
| * does not necessarily have special, unique secondary/tertiary weights any more. |
| */ |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |
| #endif // __COLLATIONDATAREADER_H__ |