| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2011-2013, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * file name: ppucd.h |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2011dec11 |
| * created by: Markus W. Scherer |
| */ |
| |
| #ifndef __PPUCD_H__ |
| #define __PPUCD_H__ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| |
| #include <stdio.h> |
| |
| /** Additions to the uchar.h enum UProperty. */ |
| enum { |
| /** Name_Alias */ |
| PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, |
| PPUCD_CONDITIONAL_CASE_MAPPINGS, |
| PPUCD_TURKIC_CASE_FOLDING |
| }; |
| |
| U_NAMESPACE_BEGIN |
| |
| class U_TOOLUTIL_API PropertyNames { |
| public: |
| virtual ~PropertyNames(); |
| virtual int32_t getPropertyEnum(const char *name) const; |
| virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; |
| }; |
| |
| struct U_TOOLUTIL_API UniProps { |
| UniProps(); |
| ~UniProps(); |
| |
| int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } |
| |
| UChar32 start, end; |
| UBool binProps[UCHAR_BINARY_LIMIT]; |
| int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; |
| UVersionInfo age; |
| UChar32 bmg, bpb; |
| UChar32 scf, slc, stc, suc; |
| int32_t digitValue; |
| const char *numericValue; |
| const char *name; |
| const char *nameAlias; |
| UnicodeString cf, lc, tc, uc; |
| UnicodeSet scx; |
| }; |
| |
| class U_TOOLUTIL_API PreparsedUCD { |
| public: |
| enum LineType { |
| /** No line, end of file. */ |
| NO_LINE, |
| /** Empty line. (Might contain a comment.) */ |
| EMPTY_LINE, |
| |
| /** ucd;6.1.0 */ |
| UNICODE_VERSION_LINE, |
| |
| /** property;Binary;Alpha;Alphabetic */ |
| PROPERTY_LINE, |
| /** binary;N;No;F;False */ |
| BINARY_LINE, |
| /** value;gc;Zs;Space_Separator */ |
| VALUE_LINE, |
| |
| /** defaults;0000..10FFFF;age=NA;bc=L;... */ |
| DEFAULTS_LINE, |
| /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ |
| BLOCK_LINE, |
| /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ |
| CP_LINE, |
| /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */ |
| UNASSIGNED_LINE, |
| |
| /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ |
| ALG_NAMES_RANGE_LINE, |
| |
| LINE_TYPE_COUNT |
| }; |
| |
| /** |
| * Constructor. |
| * Prepare this object for a new, empty package. |
| */ |
| PreparsedUCD(const char *filename, UErrorCode &errorCode); |
| |
| /** Destructor. */ |
| ~PreparsedUCD(); |
| |
| /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ |
| void setPropertyNames(const PropertyNames *pn) { pnames=pn; } |
| |
| /** |
| * Reads a line from the preparsed UCD file. |
| * Splits the line by replacing each ';' with a NUL. |
| */ |
| LineType readLine(UErrorCode &errorCode); |
| |
| /** Returns the number of the line read by readLine(). */ |
| int32_t getLineNumber() const { return lineNumber; } |
| |
| /** Returns the line's next field, or NULL. */ |
| const char *nextField(); |
| |
| /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ |
| const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } |
| |
| /** Returns TRUE if the current line has property values. */ |
| UBool lineHasPropertyValues() const { |
| return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE; |
| } |
| |
| /** |
| * Parses properties from the current line. |
| * Clears newValues and sets UProperty codes for property values mentioned |
| * on the current line (as opposed to being inherited). |
| * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. |
| * The returned UniProps are usable until the next line of the same type is read. |
| */ |
| const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); |
| |
| /** |
| * Returns the code point range for the current algnamesrange line. |
| * Calls & parses nextField(). |
| * Further nextField() calls will yield the range's type & prefix string. |
| * Returns U_SUCCESS(errorCode). |
| */ |
| UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); |
| |
| private: |
| UBool isLineBufferAvailable(int32_t i) { |
| return defaultLineIndex!=i && blockLineIndex!=i; |
| } |
| |
| /** Resets the field iterator and returns the line's first field (the line type field). */ |
| const char *firstField(); |
| |
| UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, |
| UErrorCode &errorCode); |
| UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); |
| UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); |
| void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); |
| void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); |
| |
| static const int32_t kNumLineBuffers=3; |
| |
| PropertyNames *icuPnames; // owned |
| const PropertyNames *pnames; // aliased |
| FILE *file; |
| int32_t defaultLineIndex, blockLineIndex, lineIndex; |
| int32_t lineNumber; |
| LineType lineType; |
| char *fieldLimit; |
| char *lineLimit; |
| |
| UVersionInfo ucdVersion; |
| UniProps defaultProps, blockProps, cpProps; |
| UnicodeSet blockValues; |
| // Multiple lines so that default and block properties can maintain pointers |
| // into their line buffers. |
| char lines[kNumLineBuffers][4096]; |
| }; |
| |
| U_NAMESPACE_END |
| |
| #endif // __PPUCD_H__ |