| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 1999-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * file name: unames.c |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 1999oct04 |
| * created by: Markus W. Scherer |
| */ |
| |
| #if defined(STARBOARD) |
| #include "starboard/client_porting/poem/assert_poem.h" |
| #include "starboard/client_porting/poem/string_poem.h" |
| #endif // defined(STARBOARD) |
| #include "unicode/utypes.h" |
| #include "unicode/putil.h" |
| #include "unicode/uchar.h" |
| #include "unicode/udata.h" |
| #include "unicode/utf.h" |
| #include "unicode/utf16.h" |
| #include "uassert.h" |
| #include "ustr_imp.h" |
| #include "umutex.h" |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "ucln_cmn.h" |
| #include "udataswp.h" |
| #include "uprops.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| /* prototypes ------------------------------------------------------------- */ |
| |
/* Name and type of the data item passed to udata_openChoice() in loadCharNames(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of consecutive code points:
 * code>>GROUP_SHIFT identifies the group (compared with GROUP_MSB),
 * code&GROUP_MASK is the line number inside the group.
 */
#define GROUP_SHIFT      5
#define LINES_PER_GROUP  (1L<<GROUP_SHIFT)
#define GROUP_MASK       (LINES_PER_GROUP-1)
| |
| /* |
| * This struct was replaced by explicitly accessing equivalent |
| * fields from triples of uint16_t. |
| * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, |
| * which broke the assumption that sizeof(Group)==6 and that the ++ operator |
| * would advance by 6 bytes (3 uint16_t). |
| * |
| * We can't just change the data structure because it's loaded from a data file, |
| * and we don't want to make it less compact, so we changed the access code. |
| * |
| * For details see ICU tickets 6331 and 6008. |
| typedef struct { |
| uint16_t groupMSB, |
| offsetHigh, offsetLow; / * avoid padding * / |
| } Group; |
| */ |
/*
 * Indexes of the fields inside one group, which is stored as a triple of
 * uint16_t. GROUP_LENGTH is the number of uint16_t units per group.
 */
enum {
    GROUP_MSB,
    GROUP_OFFSET_HIGH,
    GROUP_OFFSET_LOW,
    GROUP_LENGTH
};

/*
 * Get the 32-bit group offset.
 * The two 16-bit halves are combined as unsigned values and only then
 * converted to int32_t, so that a high half >= 0x8000 does not cause a
 * signed left-shift overflow.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(uint32_t)(group)[GROUP_OFFSET_LOW]))

/* Step forward/backward by one group triple. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
| |
/*
 * Header of one algorithmic-name range. The range-specific data (prefix
 * string, and for type 1 the factors) follows directly after this struct.
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range (inclusive) */
    uint8_t type;       /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;    /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;      /* not interpreted in this part of the file */
} AlgorithmicRange;

/*
 * Indexes at the start of the names data. Each value is a byte offset that
 * is added to the address of this struct to reach the respective table.
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
| |
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with arbitrary pointer expressions (for example &obj) and
 * can be subscripted or dereferenced directly.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
| |
| typedef struct { |
| const char *otherName; |
| UChar32 code; |
| } FindName; |
| |
| #define DO_FIND_NAME NULL |
| |
| static UDataMemory *uCharNamesData=NULL; |
| static UCharNames *uCharNames=NULL; |
| static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; |
| |
| /* |
| * Maximum length of character names (regular & 1.0). |
| */ |
| static int32_t gMaxNameLength=0; |
| |
| /* |
| * Set of chars used in character names (regular & 1.0). |
| * Chars are platform-dependent (can be EBCDIC). |
| */ |
| static uint32_t gNameSet[8]={ 0 }; |
| |
/*
 * Additional, file-internal category values that follow the regular general
 * categories. They index the last three entries of charCatNames[].
 * Each expansion is fully parenthesized so that the values can be used
 * safely inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
| |
| static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { |
| "unassigned", |
| "uppercase letter", |
| "lowercase letter", |
| "titlecase letter", |
| "modifier letter", |
| "other letter", |
| "non spacing mark", |
| "enclosing mark", |
| "combining spacing mark", |
| "decimal digit number", |
| "letter number", |
| "other number", |
| "space separator", |
| "line separator", |
| "paragraph separator", |
| "control", |
| "format", |
| "private use area", |
| "surrogate", |
| "dash punctuation", |
| "start punctuation", |
| "end punctuation", |
| "connector punctuation", |
| "other punctuation", |
| "math symbol", |
| "currency symbol", |
| "modifier symbol", |
| "other symbol", |
| "initial punctuation", |
| "final punctuation", |
| "noncharacter", |
| "lead surrogate", |
| "trail surrogate" |
| }; |
| |
| /* implementation ----------------------------------------------------------- */ |
| |
| static UBool U_CALLCONV unames_cleanup(void) |
| { |
| if(uCharNamesData) { |
| udata_close(uCharNamesData); |
| uCharNamesData = NULL; |
| } |
| if(uCharNames) { |
| uCharNames = NULL; |
| } |
| gCharNamesInitOnce.reset(); |
| gMaxNameLength=0; |
| return TRUE; |
| } |
| |
| static UBool U_CALLCONV |
| isAcceptable(void * /*context*/, |
| const char * /*type*/, const char * /*name*/, |
| const UDataInfo *pInfo) { |
| return (UBool)( |
| pInfo->size>=20 && |
| pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
| pInfo->charsetFamily==U_CHARSET_FAMILY && |
| pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
| pInfo->dataFormat[1]==0x6e && |
| pInfo->dataFormat[2]==0x61 && |
| pInfo->dataFormat[3]==0x6d && |
| pInfo->formatVersion[0]==1); |
| } |
| |
| static void U_CALLCONV |
| loadCharNames(UErrorCode &status) { |
| U_ASSERT(uCharNamesData == NULL); |
| U_ASSERT(uCharNames == NULL); |
| |
| uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); |
| if(U_FAILURE(status)) { |
| uCharNamesData = NULL; |
| } else { |
| uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); |
| } |
| ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); |
| } |
| |
| |
| static UBool |
| isDataLoaded(UErrorCode *pErrorCode) { |
| umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); |
| return U_SUCCESS(*pErrorCode); |
| } |
| |
/*
 * Append one char to an output buffer with truncation:
 * the char is stored (and buffer/bufferLength are advanced) only while there
 * is room left, but bufferPos is always incremented so that it ends up as
 * the full, untruncated output length.
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} UPRV_BLOCK_MACRO_END

/*
 * File-internal name choice value one past the regular UCharNameChoice
 * values; expandName() and compareName() map it to field index 2.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
| |
| /* |
| * Important: expandName() and compareName() are almost the same - |
| * apply fixes to both. |
| * |
| * UnicodeData.txt uses ';' as a field separator, so no |
| * field can contain ';' as part of its contents. |
| * In unames.dat, it is marked as token[';']==-1 only if the |
| * semicolon is used in the data file - which is iff we |
| * have Unicode 1.0 names or ISO comments or aliases. |
| * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases |
| * although we know that it will never be part of a name. |
| */ |
| static uint16_t |
| expandName(UCharNames *names, |
| const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, |
| char *buffer, uint16_t bufferLength) { |
| uint16_t *tokens=(uint16_t *)names+8; |
| uint16_t token, tokenCount=*tokens++, bufferPos=0; |
| uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
| uint8_t c; |
| |
| if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| /* |
| * skip the modern name if it is not requested _and_ |
| * if the semicolon byte value is a character, not a token number |
| */ |
| if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
| do { |
| while(nameLength>0) { |
| --nameLength; |
| if(*name++==';') { |
| break; |
| } |
| } |
| } while(--fieldIndex>0); |
| } else { |
| /* |
| * the semicolon byte value is a token number, therefore |
| * only modern names are stored in unames.dat and there is no |
| * such requested alternate name here |
| */ |
| nameLength=0; |
| } |
| } |
| |
| /* write each letter directly, and write a token word per token */ |
| while(nameLength>0) { |
| --nameLength; |
| c=*name++; |
| |
| if(c>=tokenCount) { |
| if(c!=';') { |
| /* implicit letter */ |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } else { |
| /* finished */ |
| break; |
| } |
| } else { |
| token=tokens[c]; |
| if(token==(uint16_t)(-2)) { |
| /* this is a lead byte for a double-byte token */ |
| token=tokens[c<<8|*name++]; |
| --nameLength; |
| } |
| if(token==(uint16_t)(-1)) { |
| if(c!=';') { |
| /* explicit letter */ |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } else { |
| /* stop, but skip the semicolon if we are seeking |
| extended names and there was no 2.0 name but there |
| is a 1.0 name. */ |
| if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { |
| if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| continue; |
| } |
| } |
| /* finished */ |
| break; |
| } |
| } else { |
| /* write token word */ |
| uint8_t *tokenString=tokenStrings+token; |
| while((c=*tokenString++)!=0) { |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } |
| } |
| } |
| } |
| |
| /* zero-terminate */ |
| if(bufferLength>0) { |
| *buffer=0; |
| } |
| |
| return bufferPos; |
| } |
| |
| /* |
| * compareName() is almost the same as expandName() except that it compares |
| * the currently expanded name to an input name. |
| * It returns the match/no match result as soon as possible. |
| */ |
| static UBool |
| compareName(UCharNames *names, |
| const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, |
| const char *otherName) { |
| uint16_t *tokens=(uint16_t *)names+8; |
| uint16_t token, tokenCount=*tokens++; |
| uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
| uint8_t c; |
| const char *origOtherName = otherName; |
| |
| if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| /* |
| * skip the modern name if it is not requested _and_ |
| * if the semicolon byte value is a character, not a token number |
| */ |
| if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
| do { |
| while(nameLength>0) { |
| --nameLength; |
| if(*name++==';') { |
| break; |
| } |
| } |
| } while(--fieldIndex>0); |
| } else { |
| /* |
| * the semicolon byte value is a token number, therefore |
| * only modern names are stored in unames.dat and there is no |
| * such requested alternate name here |
| */ |
| nameLength=0; |
| } |
| } |
| |
| /* compare each letter directly, and compare a token word per token */ |
| while(nameLength>0) { |
| --nameLength; |
| c=*name++; |
| |
| if(c>=tokenCount) { |
| if(c!=';') { |
| /* implicit letter */ |
| if((char)c!=*otherName++) { |
| return FALSE; |
| } |
| } else { |
| /* finished */ |
| break; |
| } |
| } else { |
| token=tokens[c]; |
| if(token==(uint16_t)(-2)) { |
| /* this is a lead byte for a double-byte token */ |
| token=tokens[c<<8|*name++]; |
| --nameLength; |
| } |
| if(token==(uint16_t)(-1)) { |
| if(c!=';') { |
| /* explicit letter */ |
| if((char)c!=*otherName++) { |
| return FALSE; |
| } |
| } else { |
| /* stop, but skip the semicolon if we are seeking |
| extended names and there was no 2.0 name but there |
| is a 1.0 name. */ |
| if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { |
| if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| continue; |
| } |
| } |
| /* finished */ |
| break; |
| } |
| } else { |
| /* write token word */ |
| uint8_t *tokenString=tokenStrings+token; |
| while((c=*tokenString++)!=0) { |
| if((char)c!=*otherName++) { |
| return FALSE; |
| } |
| } |
| } |
| } |
| } |
| |
| /* complete match? */ |
| return (UBool)(*otherName==0); |
| } |
| |
| static uint8_t getCharCat(UChar32 cp) { |
| uint8_t cat; |
| |
| if (U_IS_UNICODE_NONCHAR(cp)) { |
| return U_NONCHARACTER_CODE_POINT; |
| } |
| |
| if ((cat = u_charType(cp)) == U_SURROGATE) { |
| cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; |
| } |
| |
| return cat; |
| } |
| |
| static const char *getCharCatName(UChar32 cp) { |
| uint8_t cat = getCharCat(cp); |
| |
| /* Return unknown if the table of names above is not up to |
| date. */ |
| |
| if (cat >= UPRV_LENGTHOF(charCatNames)) { |
| return "unknown"; |
| } else { |
| return charCatNames[cat]; |
| } |
| } |
| |
| static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { |
| const char *catname = getCharCatName(code); |
| uint16_t length = 0; |
| |
| UChar32 cp; |
| int ndigits, i; |
| |
| WRITE_CHAR(buffer, bufferLength, length, '<'); |
| while (catname[length - 1]) { |
| WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); |
| } |
| WRITE_CHAR(buffer, bufferLength, length, '-'); |
| for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) |
| ; |
| if (ndigits < 4) |
| ndigits = 4; |
| for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { |
| uint8_t v = (uint8_t)(cp & 0xf); |
| buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); |
| } |
| buffer += ndigits; |
| length += static_cast<uint16_t>(ndigits); |
| WRITE_CHAR(buffer, bufferLength, length, '>'); |
| |
| return length; |
| } |
| |
| /* |
| * getGroup() does a binary search for the group that contains the |
| * Unicode code point "code". |
| * The return value is always a valid Group* that may contain "code" |
| * or else is the highest group before "code". |
| * If the lowest group is after "code", then that one is returned. |
| */ |
| static const uint16_t * |
| getGroup(UCharNames *names, uint32_t code) { |
| const uint16_t *groups=GET_GROUPS(names); |
| uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), |
| start=0, |
| limit=*groups++, |
| number; |
| |
| /* binary search for the group of names that contains the one for code */ |
| while(start<limit-1) { |
| number=(uint16_t)((start+limit)/2); |
| if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { |
| limit=number; |
| } else { |
| start=number; |
| } |
| } |
| |
| /* return this regardless of whether it is an exact match */ |
| return groups+start*GROUP_LENGTH; |
| } |
| |
| /* |
| * expandGroupLengths() reads a block of compressed lengths of 32 strings and |
| * expands them into offsets and lengths for each string. |
| * Lengths are stored with a variable-width encoding in consecutive nibbles: |
| * If a nibble<0xc, then it is the length itself (0=empty string). |
| * If a nibble>=0xc, then it forms a length value with the following nibble. |
| * Calculation see below. |
| * The offsets and lengths arrays must be at least 33 (one more) long because |
| * there is no check here at the end if the last nibble is still used. |
| */ |
| static const uint8_t * |
| expandGroupLengths(const uint8_t *s, |
| uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { |
| /* read the lengths of the 32 strings in this group and get each string's offset */ |
| uint16_t i=0, offset=0, length=0; |
| uint8_t lengthByte; |
| |
| /* all 32 lengths must be read to get the offset of the first group string */ |
| while(i<LINES_PER_GROUP) { |
| lengthByte=*s++; |
| |
| /* read even nibble - MSBs of lengthByte */ |
| if(length>=12) { |
| /* double-nibble length spread across two bytes */ |
| length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); |
| lengthByte&=0xf; |
| } else if((lengthByte /* &0xf0 */)>=0xc0) { |
| /* double-nibble length spread across this one byte */ |
| length=(uint16_t)((lengthByte&0x3f)+12); |
| } else { |
| /* single-nibble length in MSBs */ |
| length=(uint16_t)(lengthByte>>4); |
| lengthByte&=0xf; |
| } |
| |
| *offsets++=offset; |
| *lengths++=length; |
| |
| offset+=length; |
| ++i; |
| |
| /* read odd nibble - LSBs of lengthByte */ |
| if((lengthByte&0xf0)==0) { |
| /* this nibble was not consumed for a double-nibble length above */ |
| length=lengthByte; |
| if(length<12) { |
| /* single-nibble length in LSBs */ |
| *offsets++=offset; |
| *lengths++=length; |
| |
| offset+=length; |
| ++i; |
| } |
| } else { |
| length=0; /* prevent double-nibble detection in the next iteration */ |
| } |
| } |
| |
| /* now, s is at the first group string */ |
| return s; |
| } |
| |
| static uint16_t |
| expandGroupName(UCharNames *names, const uint16_t *group, |
| uint16_t lineNumber, UCharNameChoice nameChoice, |
| char *buffer, uint16_t bufferLength) { |
| uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); |
| s=expandGroupLengths(s, offsets, lengths); |
| return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, |
| buffer, bufferLength); |
| } |
| |
| static uint16_t |
| getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, |
| char *buffer, uint16_t bufferLength) { |
| const uint16_t *group=getGroup(names, code); |
| if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { |
| return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, |
| buffer, bufferLength); |
| } else { |
| /* group not found */ |
| /* zero-terminate */ |
| if(bufferLength>0) { |
| *buffer=0; |
| } |
| return 0; |
| } |
| } |
| |
| /* |
| * enumGroupNames() enumerates all the names in a 32-group |
| * and either calls the enumerator function or finds a given input name. |
| */ |
| static UBool |
| enumGroupNames(UCharNames *names, const uint16_t *group, |
| UChar32 start, UChar32 end, |
| UEnumCharNamesFn *fn, void *context, |
| UCharNameChoice nameChoice) { |
| uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); |
| |
| s=expandGroupLengths(s, offsets, lengths); |
| if(fn!=DO_FIND_NAME) { |
| char buffer[200]; |
| uint16_t length; |
| |
| while(start<=end) { |
| length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); |
| if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { |
| buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
| } |
| /* here, we assume that the buffer is large enough */ |
| if(length>0) { |
| if(!fn(context, start, nameChoice, buffer, length)) { |
| return FALSE; |
| } |
| } |
| ++start; |
| } |
| } else { |
| const char *otherName=((FindName *)context)->otherName; |
| while(start<=end) { |
| if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { |
| ((FindName *)context)->code=start; |
| return FALSE; |
| } |
| ++start; |
| } |
| } |
| return TRUE; |
| } |
| |
| /* |
| * enumExtNames enumerate extended names. |
| * It only needs to do it if it is called with a real function and not |
| * with the dummy DO_FIND_NAME, because u_charFromName() does a check |
| * for extended names by itself. |
| */ |
| static UBool |
| enumExtNames(UChar32 start, UChar32 end, |
| UEnumCharNamesFn *fn, void *context) |
| { |
| if(fn!=DO_FIND_NAME) { |
| char buffer[200]; |
| uint16_t length; |
| |
| while(start<=end) { |
| buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
| /* here, we assume that the buffer is large enough */ |
| if(length>0) { |
| if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { |
| return FALSE; |
| } |
| } |
| ++start; |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| static UBool |
| enumNames(UCharNames *names, |
| UChar32 start, UChar32 limit, |
| UEnumCharNamesFn *fn, void *context, |
| UCharNameChoice nameChoice) { |
| uint16_t startGroupMSB, endGroupMSB, groupCount; |
| const uint16_t *group, *groupLimit; |
| |
| startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); |
| endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); |
| |
| /* find the group that contains start, or the highest before it */ |
| group=getGroup(names, start); |
| |
| if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { |
| /* enumerate synthetic names between start and the group start */ |
| UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); |
| if(extLimit>limit) { |
| extLimit=limit; |
| } |
| if(!enumExtNames(start, extLimit-1, fn, context)) { |
| return FALSE; |
| } |
| start=extLimit; |
| } |
| |
| if(startGroupMSB==endGroupMSB) { |
| if(startGroupMSB==group[GROUP_MSB]) { |
| /* if start and limit-1 are in the same group, then enumerate only in that one */ |
| return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); |
| } |
| } else { |
| const uint16_t *groups=GET_GROUPS(names); |
| groupCount=*groups++; |
| groupLimit=groups+groupCount*GROUP_LENGTH; |
| |
| if(startGroupMSB==group[GROUP_MSB]) { |
| /* enumerate characters in the partial start group */ |
| if((start&GROUP_MASK)!=0) { |
| if(!enumGroupNames(names, group, |
| start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, |
| fn, context, nameChoice)) { |
| return FALSE; |
| } |
| group=NEXT_GROUP(group); /* continue with the next group */ |
| } |
| } else if(startGroupMSB>group[GROUP_MSB]) { |
| /* make sure that we start enumerating with the first group after start */ |
| const uint16_t *nextGroup=NEXT_GROUP(group); |
| if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { |
| UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; |
| if (end > limit) { |
| end = limit; |
| } |
| if (!enumExtNames(start, end - 1, fn, context)) { |
| return FALSE; |
| } |
| } |
| group=nextGroup; |
| } |
| |
| /* enumerate entire groups between the start- and end-groups */ |
| while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { |
| const uint16_t *nextGroup; |
| start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; |
| if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { |
| return FALSE; |
| } |
| nextGroup=NEXT_GROUP(group); |
| if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { |
| UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; |
| if (end > limit) { |
| end = limit; |
| } |
| if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { |
| return FALSE; |
| } |
| } |
| group=nextGroup; |
| } |
| |
| /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ |
| if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { |
| return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); |
| } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { |
| UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; |
| if (next > start) { |
| start = next; |
| } |
| } else { |
| return TRUE; |
| } |
| } |
| |
| /* we have not found a group, which means everything is made of |
| extended names. */ |
| if (nameChoice == U_EXTENDED_CHAR_NAME) { |
| if (limit > UCHAR_MAX_VALUE + 1) { |
| limit = UCHAR_MAX_VALUE + 1; |
| } |
| return enumExtNames(start, limit - 1, fn, context); |
| } |
| |
| return TRUE; |
| } |
| |
| static uint16_t |
| writeFactorSuffix(const uint16_t *factors, uint16_t count, |
| const char *s, /* suffix elements */ |
| uint32_t code, |
| uint16_t indexes[8], /* output fields from here */ |
| const char *elementBases[8], const char *elements[8], |
| char *buffer, uint16_t bufferLength) { |
| uint16_t i, factor, bufferPos=0; |
| char c; |
| |
| /* write elements according to the factors */ |
| |
| /* |
| * the factorized elements are determined by modulo arithmetic |
| * with the factors of this algorithm |
| * |
| * note that for fewer operations, count is decremented here |
| */ |
| --count; |
| for(i=count; i>0; --i) { |
| factor=factors[i]; |
| indexes[i]=(uint16_t)(code%factor); |
| code/=factor; |
| } |
| /* |
| * we don't need to calculate the last modulus because start<=code<=end |
| * guarantees here that code<=factors[0] |
| */ |
| indexes[0]=(uint16_t)code; |
| |
| /* write each element */ |
| for(;;) { |
| if(elementBases!=NULL) { |
| *elementBases++=s; |
| } |
| |
| /* skip indexes[i] strings */ |
| factor=indexes[i]; |
| while(factor>0) { |
| while(*s++!=0) {} |
| --factor; |
| } |
| if(elements!=NULL) { |
| *elements++=s; |
| } |
| |
| /* write element */ |
| while((c=*s++)!=0) { |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } |
| |
| /* we do not need to perform the rest of this loop for i==count - break here */ |
| if(i>=count) { |
| break; |
| } |
| |
| /* skip the rest of the strings for this factors[i] */ |
| factor=(uint16_t)(factors[i]-indexes[i]-1); |
| while(factor>0) { |
| while(*s++!=0) {} |
| --factor; |
| } |
| |
| ++i; |
| } |
| |
| /* zero-terminate */ |
| if(bufferLength>0) { |
| *buffer=0; |
| } |
| |
| return bufferPos; |
| } |
| |
| /* |
| * Important: |
| * Parts of findAlgName() are almost the same as some of getAlgName(). |
| * Fixes must be applied to both. |
| */ |
| static uint16_t |
| getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, |
| char *buffer, uint16_t bufferLength) { |
| uint16_t bufferPos=0; |
| |
| /* Only the normative character name can be algorithmic. */ |
| if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| /* zero-terminate */ |
| if(bufferLength>0) { |
| *buffer=0; |
| } |
| return 0; |
| } |
| |
| switch(range->type) { |
| case 0: { |
| /* name = prefix hex-digits */ |
| const char *s=(const char *)(range+1); |
| char c; |
| |
| uint16_t i, count; |
| |
| /* copy prefix */ |
| while((c=*s++)!=0) { |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } |
| |
| /* write hexadecimal code point value */ |
| count=range->variant; |
| |
| /* zero-terminate */ |
| if(count<bufferLength) { |
| buffer[count]=0; |
| } |
| |
| for(i=count; i>0;) { |
| if(--i<bufferLength) { |
| c=(char)(code&0xf); |
| if(c<10) { |
| c+='0'; |
| } else { |
| c+='A'-10; |
| } |
| buffer[i]=c; |
| } |
| code>>=4; |
| } |
| |
| bufferPos+=count; |
| break; |
| } |
| case 1: { |
| /* name = prefix factorized-elements */ |
| uint16_t indexes[8]; |
| const uint16_t *factors=(const uint16_t *)(range+1); |
| uint16_t count=range->variant; |
| const char *s=(const char *)(factors+count); |
| char c; |
| |
| /* copy prefix */ |
| while((c=*s++)!=0) { |
| WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| } |
| |
| bufferPos+=writeFactorSuffix(factors, count, |
| s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); |
| break; |
| } |
| default: |
| /* undefined type */ |
| /* zero-terminate */ |
| if(bufferLength>0) { |
| *buffer=0; |
| } |
| break; |
| } |
| |
| return bufferPos; |
| } |
| |
| /* |
| * Important: enumAlgNames() and findAlgName() are almost the same. |
| * Any fix must be applied to both. |
| */ |
| static UBool |
| enumAlgNames(AlgorithmicRange *range, |
| UChar32 start, UChar32 limit, |
| UEnumCharNamesFn *fn, void *context, |
| UCharNameChoice nameChoice) { |
| char buffer[200]; |
| uint16_t length; |
| |
| if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| return TRUE; |
| } |
| |
| switch(range->type) { |
| case 0: { |
| char *s, *end; |
| char c; |
| |
| /* get the full name of the start character */ |
| length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); |
| if(length<=0) { |
| return TRUE; |
| } |
| |
| /* call the enumerator function with this first character */ |
| if(!fn(context, start, nameChoice, buffer, length)) { |
| return FALSE; |
| } |
| |
| /* go to the end of the name; all these names have the same length */ |
| end=buffer; |
| while(*end!=0) { |
| ++end; |
| } |
| |
| /* enumerate the rest of the names */ |
| while(++start<limit) { |
| /* increment the hexadecimal number on a character-basis */ |
| s=end; |
| for (;;) { |
| c=*--s; |
| if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { |
| *s=(char)(c+1); |
| break; |
| } else if(c=='9') { |
| *s='A'; |
| break; |
| } else if(c=='F') { |
| *s='0'; |
| } |
| } |
| |
| if(!fn(context, start, nameChoice, buffer, length)) { |
| return FALSE; |
| } |
| } |
| break; |
| } |
| case 1: { |
| uint16_t indexes[8]; |
| const char *elementBases[8], *elements[8]; |
| const uint16_t *factors=(const uint16_t *)(range+1); |
| uint16_t count=range->variant; |
| const char *s=(const char *)(factors+count); |
| char *suffix, *t; |
| uint16_t prefixLength, i, idx; |
| |
| char c; |
| |
| /* name = prefix factorized-elements */ |
| |
| /* copy prefix */ |
| suffix=buffer; |
| prefixLength=0; |
| while((c=*s++)!=0) { |
| *suffix++=c; |
| ++prefixLength; |
| } |
| |
| /* append the suffix of the start character */ |
| length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, |
| s, (uint32_t)start-range->start, |
| indexes, elementBases, elements, |
| suffix, (uint16_t)(sizeof(buffer)-prefixLength))); |
| |
| /* call the enumerator function with this first character */ |
| if(!fn(context, start, nameChoice, buffer, length)) { |
| return FALSE; |
| } |
| |
| /* enumerate the rest of the names */ |
| while(++start<limit) { |
| /* increment the indexes in lexical order bound by the factors */ |
| i=count; |
| for (;;) { |
| idx=(uint16_t)(indexes[--i]+1); |
| if(idx<factors[i]) { |
| /* skip one index and its element string */ |
| indexes[i]=idx; |
| s=elements[i]; |
| while(*s++!=0) { |
| } |
| elements[i]=s; |
| break; |
| } else { |
| /* reset this index to 0 and its element string to the first one */ |
| indexes[i]=0; |
| elements[i]=elementBases[i]; |
| } |
| } |
| |
| /* to make matters a little easier, just append all elements to the suffix */ |
| t=suffix; |
| length=prefixLength; |
| for(i=0; i<count; ++i) { |
| s=elements[i]; |
| while((c=*s++)!=0) { |
| *t++=c; |
| ++length; |
| } |
| } |
| /* zero-terminate */ |
| *t=0; |
| |
| if(!fn(context, start, nameChoice, buffer, length)) { |
| return FALSE; |
| } |
| } |
| break; |
| } |
| default: |
| /* undefined type */ |
| break; |
| } |
| |
| return TRUE; |
| } |
| |
| /* |
| * findAlgName() is almost the same as enumAlgNames() except that it |
| * returns the code point for a name if it fits into the range. |
| * It returns 0xffff otherwise. |
| */ |
| static UChar32 |
| findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { |
| UChar32 code; |
| |
| if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| return 0xffff; |
| } |
| |
| switch(range->type) { |
| case 0: { |
| /* name = prefix hex-digits */ |
| const char *s=(const char *)(range+1); |
| char c; |
| |
| uint16_t i, count; |
| |
| /* compare prefix */ |
| while((c=*s++)!=0) { |
| if((char)c!=*otherName++) { |
| return 0xffff; |
| } |
| } |
| |
| /* read hexadecimal code point value */ |
| count=range->variant; |
| code=0; |
| for(i=0; i<count; ++i) { |
| c=*otherName++; |
| if('0'<=c && c<='9') { |
| code=(code<<4)|(c-'0'); |
| } else if('A'<=c && c<='F') { |
| code=(code<<4)|(c-'A'+10); |
| } else { |
| return 0xffff; |
| } |
| } |
| |
| /* does it fit into the range? */ |
| if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { |
| return code; |
| } |
| break; |
| } |
| case 1: { |
| char buffer[64]; |
| uint16_t indexes[8]; |
| const char *elementBases[8], *elements[8]; |
| const uint16_t *factors=(const uint16_t *)(range+1); |
| uint16_t count=range->variant; |
| const char *s=(const char *)(factors+count), *t; |
| UChar32 start, limit; |
| uint16_t i, idx; |
| |
| char c; |
| |
| /* name = prefix factorized-elements */ |
| |
| /* compare prefix */ |
| while((c=*s++)!=0) { |
| if((char)c!=*otherName++) { |
| return 0xffff; |
| } |
| } |
| |
| start=(UChar32)range->start; |
| limit=(UChar32)(range->end+1); |
| |
| /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ |
| writeFactorSuffix(factors, count, s, 0, |
| indexes, elementBases, elements, buffer, sizeof(buffer)); |
| |
| /* compare the first suffix */ |
| if(0==uprv_strcmp(otherName, buffer)) { |
| return start; |
| } |
| |
| /* enumerate and compare the rest of the suffixes */ |
| while(++start<limit) { |
| /* increment the indexes in lexical order bound by the factors */ |
| i=count; |
| for (;;) { |
| idx=(uint16_t)(indexes[--i]+1); |
| if(idx<factors[i]) { |
| /* skip one index and its element string */ |
| indexes[i]=idx; |
| s=elements[i]; |
| while(*s++!=0) {} |
| elements[i]=s; |
| break; |
| } else { |
| /* reset this index to 0 and its element string to the first one */ |
| indexes[i]=0; |
| elements[i]=elementBases[i]; |
| } |
| } |
| |
| /* to make matters a little easier, just compare all elements of the suffix */ |
| t=otherName; |
| for(i=0; i<count; ++i) { |
| s=elements[i]; |
| while((c=*s++)!=0) { |
| if(c!=*t++) { |
| s=""; /* does not match */ |
| i=99; |
| } |
| } |
| } |
| if(i<99 && *t==0) { |
| return start; |
| } |
| } |
| break; |
| } |
| default: |
| /* undefined type */ |
| break; |
| } |
| |
| return 0xffff; |
| } |
| |
| /* sets of name characters, maximum name lengths ---------------------------- */ |
| |
/*
 * Bit-set helpers for sets of byte values.
 * A set is an array of 8 uint32_t words (256 bits); the value c is reduced
 * to uint8_t, its upper 3 bits select the word and its lower 5 bits the bit.
 *
 * The argument c is parenthesized so that arbitrary expressions (e.g. a|b)
 * are converted as a whole; note that c is still evaluated twice, so it
 * must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
| |
| static int32_t |
| calcStringSetLength(uint32_t set[8], const char *s) { |
| int32_t length=0; |
| char c; |
| |
| while((c=*s++)!=0) { |
| SET_ADD(set, c); |
| ++length; |
| } |
| return length; |
| } |
| |
| static int32_t |
| calcAlgNameSetsLengths(int32_t maxNameLength) { |
| AlgorithmicRange *range; |
| uint32_t *p; |
| uint32_t rangeCount; |
| int32_t length; |
| |
| /* enumerate algorithmic ranges */ |
| p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| rangeCount=*p; |
| range=(AlgorithmicRange *)(p+1); |
| while(rangeCount>0) { |
| switch(range->type) { |
| case 0: |
| /* name = prefix + (range->variant times) hex-digits */ |
| /* prefix */ |
| length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; |
| if(length>maxNameLength) { |
| maxNameLength=length; |
| } |
| break; |
| case 1: { |
| /* name = prefix factorized-elements */ |
| const uint16_t *factors=(const uint16_t *)(range+1); |
| const char *s; |
| int32_t i, count=range->variant, factor, factorLength, maxFactorLength; |
| |
| /* prefix length */ |
| s=(const char *)(factors+count); |
| length=calcStringSetLength(gNameSet, s); |
| s+=length+1; /* start of factor suffixes */ |
| |
| /* get the set and maximum factor suffix length for each factor */ |
| for(i=0; i<count; ++i) { |
| maxFactorLength=0; |
| for(factor=factors[i]; factor>0; --factor) { |
| factorLength=calcStringSetLength(gNameSet, s); |
| s+=factorLength+1; |
| if(factorLength>maxFactorLength) { |
| maxFactorLength=factorLength; |
| } |
| } |
| length+=maxFactorLength; |
| } |
| |
| if(length>maxNameLength) { |
| maxNameLength=length; |
| } |
| break; |
| } |
| default: |
| /* unknown type */ |
| break; |
| } |
| |
| range=(AlgorithmicRange *)((uint8_t *)range+range->size); |
| --rangeCount; |
| } |
| return maxNameLength; |
| } |
| |
| static int32_t |
| calcExtNameSetsLengths(int32_t maxNameLength) { |
| int32_t i, length; |
| |
| for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) { |
| /* |
| * for each category, count the length of the category name |
| * plus 9= |
| * 2 for <> |
| * 1 for - |
| * 6 for most hex digits per code point |
| */ |
| length=9+calcStringSetLength(gNameSet, charCatNames[i]); |
| if(length>maxNameLength) { |
| maxNameLength=length; |
| } |
| } |
| return maxNameLength; |
| } |
| |
| static int32_t |
| calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, |
| uint32_t set[8], |
| const uint8_t **pLine, const uint8_t *lineLimit) { |
| const uint8_t *line=*pLine; |
| int32_t length=0, tokenLength; |
| uint16_t c, token; |
| |
| while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { |
| if(c>=tokenCount) { |
| /* implicit letter */ |
| SET_ADD(set, c); |
| ++length; |
| } else { |
| token=tokens[c]; |
| if(token==(uint16_t)(-2)) { |
| /* this is a lead byte for a double-byte token */ |
| c=c<<8|*line++; |
| token=tokens[c]; |
| } |
| if(token==(uint16_t)(-1)) { |
| /* explicit letter */ |
| SET_ADD(set, c); |
| ++length; |
| } else { |
| /* count token word */ |
| if(tokenLengths!=NULL) { |
| /* use cached token length */ |
| tokenLength=tokenLengths[c]; |
| if(tokenLength==0) { |
| tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); |
| tokenLengths[c]=(int8_t)tokenLength; |
| } |
| } else { |
| tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); |
| } |
| length+=tokenLength; |
| } |
| } |
| } |
| |
| *pLine=line; |
| return length; |
| } |
| |
| static void |
| calcGroupNameSetsLengths(int32_t maxNameLength) { |
| uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| |
| uint16_t *tokens=(uint16_t *)uCharNames+8; |
| uint16_t tokenCount=*tokens++; |
| uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; |
| |
| int8_t *tokenLengths; |
| |
| const uint16_t *group; |
| const uint8_t *s, *line, *lineLimit; |
| |
| int32_t groupCount, lineNumber, length; |
| |
| tokenLengths=(int8_t *)uprv_malloc(tokenCount); |
| if(tokenLengths!=NULL) { |
| uprv_memset(tokenLengths, 0, tokenCount); |
| } |
| |
| group=GET_GROUPS(uCharNames); |
| groupCount=*group++; |
| |
| /* enumerate all groups */ |
| while(groupCount>0) { |
| s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); |
| s=expandGroupLengths(s, offsets, lengths); |
| |
| /* enumerate all lines in each group */ |
| for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { |
| line=s+offsets[lineNumber]; |
| length=lengths[lineNumber]; |
| if(length==0) { |
| continue; |
| } |
| |
| lineLimit=line+length; |
| |
| /* read regular name */ |
| length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); |
| if(length>maxNameLength) { |
| maxNameLength=length; |
| } |
| if(line==lineLimit) { |
| continue; |
| } |
| |
| /* read Unicode 1.0 name */ |
| length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); |
| if(length>maxNameLength) { |
| maxNameLength=length; |
| } |
| if(line==lineLimit) { |
| continue; |
| } |
| |
| /* read ISO comment */ |
| /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ |
| } |
| |
| group=NEXT_GROUP(group); |
| --groupCount; |
| } |
| |
| if(tokenLengths!=NULL) { |
| uprv_free(tokenLengths); |
| } |
| |
| /* set gMax... - name length last for threading */ |
| gMaxNameLength=maxNameLength; |
| } |
| |
| static UBool |
| calcNameSetsLengths(UErrorCode *pErrorCode) { |
| static const char extChars[]="0123456789ABCDEF<>-"; |
| int32_t i, maxNameLength; |
| |
| if(gMaxNameLength!=0) { |
| return TRUE; |
| } |
| |
| if(!isDataLoaded(pErrorCode)) { |
| return FALSE; |
| } |
| |
| /* set hex digits, used in various names, and <>-, used in extended names */ |
| for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { |
| SET_ADD(gNameSet, extChars[i]); |
| } |
| |
| /* set sets and lengths from algorithmic names */ |
| maxNameLength=calcAlgNameSetsLengths(0); |
| |
| /* set sets and lengths from extended names */ |
| maxNameLength=calcExtNameSetsLengths(maxNameLength); |
| |
| /* set sets and lengths from group names, set global maximum values */ |
| calcGroupNameSetsLengths(maxNameLength); |
| |
| return TRUE; |
| } |
| |
| U_NAMESPACE_END |
| |
| /* public API --------------------------------------------------------------- */ |
| |
| U_NAMESPACE_USE |
| |
| U_CAPI int32_t U_EXPORT2 |
| u_charName(UChar32 code, UCharNameChoice nameChoice, |
| char *buffer, int32_t bufferLength, |
| UErrorCode *pErrorCode) { |
| AlgorithmicRange *algRange; |
| uint32_t *p; |
| uint32_t i; |
| int32_t length; |
| |
| /* check the argument values */ |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return 0; |
| } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || |
| bufferLength<0 || (bufferLength>0 && buffer==NULL) |
| ) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { |
| return u_terminateChars(buffer, bufferLength, 0, pErrorCode); |
| } |
| |
| length=0; |
| |
| /* try algorithmic names first */ |
| p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| i=*p; |
| algRange=(AlgorithmicRange *)(p+1); |
| while(i>0) { |
| if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { |
| length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); |
| break; |
| } |
| algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| --i; |
| } |
| |
| if(i==0) { |
| if (nameChoice == U_EXTENDED_CHAR_NAME) { |
| length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); |
| if (!length) { |
| /* extended character name */ |
| length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); |
| } |
| } else { |
| /* normal character name */ |
| length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); |
| } |
| } |
| |
| return u_terminateChars(buffer, bufferLength, length, pErrorCode); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| u_getISOComment(UChar32 /*c*/, |
| char *dest, int32_t destCapacity, |
| UErrorCode *pErrorCode) { |
| /* check the argument values */ |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return 0; |
| } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| return u_terminateChars(dest, destCapacity, 0, pErrorCode); |
| } |
| |
| U_CAPI UChar32 U_EXPORT2 |
| u_charFromName(UCharNameChoice nameChoice, |
| const char *name, |
| UErrorCode *pErrorCode) { |
| char upper[120] = {0}; |
| char lower[120] = {0}; |
| FindName findName; |
| AlgorithmicRange *algRange; |
| uint32_t *p; |
| uint32_t i; |
| UChar32 cp = 0; |
| char c0; |
| static constexpr UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return error; |
| } |
| |
| if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return error; |
| } |
| |
| if(!isDataLoaded(pErrorCode)) { |
| return error; |
| } |
| |
| /* construct the uppercase and lowercase of the name first */ |
| for(i=0; i<sizeof(upper); ++i) { |
| if((c0=*name++)!=0) { |
| upper[i]=uprv_toupper(c0); |
| lower[i]=uprv_tolower(c0); |
| } else { |
| upper[i]=lower[i]=0; |
| break; |
| } |
| } |
| if(i==sizeof(upper)) { |
| /* name too long, there is no such character */ |
| *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| return error; |
| } |
| // i==strlen(name)==strlen(lower)==strlen(upper) |
| |
| /* try extended names first */ |
| if (lower[0] == '<') { |
| if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') { |
| // Parse a string like "<category-HHHH>" where HHHH is a hex code point. |
| uint32_t limit = i; |
| while (i >= 3 && lower[--i] != '-') {} |
| |
| // There should be 1 to 8 hex digits. |
| int32_t hexLength = limit - (i + 1); |
| if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) { |
| uint32_t cIdx; |
| |
| lower[i] = 0; |
| |
| for (++i; i < limit; ++i) { |
| if (lower[i] >= '0' && lower[i] <= '9') { |
| cp = (cp << 4) + lower[i] - '0'; |
| } else if (lower[i] >= 'a' && lower[i] <= 'f') { |
| cp = (cp << 4) + lower[i] - 'a' + 10; |
| } else { |
| *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| return error; |
| } |
| // Prevent signed-integer overflow and out-of-range code points. |
| if (cp > UCHAR_MAX_VALUE) { |
| *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| return error; |
| } |
| } |
| |
| /* Now validate the category name. |
| We could use a binary search, or a trie, if |
| we really wanted to. */ |
| uint8_t cat = getCharCat(cp); |
| for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) { |
| |
| if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { |
| if (cat == cIdx) { |
| return cp; |
| } |
| break; |
| } |
| } |
| } |
| } |
| |
| *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| return error; |
| } |
| |
| /* try algorithmic names now */ |
| p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| i=*p; |
| algRange=(AlgorithmicRange *)(p+1); |
| while(i>0) { |
| if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { |
| return cp; |
| } |
| algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| --i; |
| } |
| |
| /* normal character name */ |
| findName.otherName=upper; |
| findName.code=error; |
| enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); |
| if (findName.code == error) { |
| *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| } |
| return findName.code; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| u_enumCharNames(UChar32 start, UChar32 limit, |
| UEnumCharNamesFn *fn, |
| void *context, |
| UCharNameChoice nameChoice, |
| UErrorCode *pErrorCode) { |
| AlgorithmicRange *algRange; |
| uint32_t *p; |
| uint32_t i; |
| |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { |
| limit = UCHAR_MAX_VALUE + 1; |
| } |
| if((uint32_t)start>=(uint32_t)limit) { |
| return; |
| } |
| |
| if(!isDataLoaded(pErrorCode)) { |
| return; |
| } |
| |
| /* interleave the data-driven ones with the algorithmic ones */ |
| /* iterate over all algorithmic ranges; assume that they are in ascending order */ |
| p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| i=*p; |
| algRange=(AlgorithmicRange *)(p+1); |
| while(i>0) { |
| /* enumerate the character names before the current algorithmic range */ |
| /* here: start<limit */ |
| if((uint32_t)start<algRange->start) { |
| if((uint32_t)limit<=algRange->start) { |
| enumNames(uCharNames, start, limit, fn, context, nameChoice); |
| return; |
| } |
| if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { |
| return; |
| } |
| start=(UChar32)algRange->start; |
| } |
| /* enumerate the character names in the current algorithmic range */ |
| /* here: algRange->start<=start<limit */ |
| if((uint32_t)start<=algRange->end) { |
| if((uint32_t)limit<=(algRange->end+1)) { |
| enumAlgNames(algRange, start, limit, fn, context, nameChoice); |
| return; |
| } |
| if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { |
| return; |
| } |
| start=(UChar32)algRange->end+1; |
| } |
| /* continue to the next algorithmic range (here: start<limit) */ |
| algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| --i; |
| } |
| /* enumerate the character names after the last algorithmic range */ |
| enumNames(uCharNames, start, limit, fn, context, nameChoice); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| uprv_getMaxCharNameLength() { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| if(calcNameSetsLengths(&errorCode)) { |
| return gMaxNameLength; |
| } else { |
| return 0; |
| } |
| } |
| |
| /** |
| * Converts the char set cset into a Unicode set uset. |
| * @param cset Set of 256 bit flags corresponding to a set of chars. |
| * @param uset USet to receive characters. Existing contents are deleted. |
| */ |
| static void |
| charSetToUSet(uint32_t cset[8], const USetAdder *sa) { |
| UChar us[256]; |
| char cs[256]; |
| |
| int32_t i, length; |
| UErrorCode errorCode; |
| |
| errorCode=U_ZERO_ERROR; |
| |
| if(!calcNameSetsLengths(&errorCode)) { |
| return; |
| } |
| |
| /* build a char string with all chars that are used in character names */ |
| length=0; |
| for(i=0; i<256; ++i) { |
| if(SET_CONTAINS(cset, i)) { |
| cs[length++]=(char)i; |
| } |
| } |
| |
| /* convert the char string to a UChar string */ |
| u_charsToUChars(cs, us, length); |
| |
| /* add each UChar to the USet */ |
| for(i=0; i<length; ++i) { |
| if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ |
| sa->add(sa->set, us[i]); |
| } |
| } |
| } |
| |
| /** |
| * Fills set with characters that are used in Unicode character names. |
| * @param set USet to receive characters. |
| */ |
| U_CAPI void U_EXPORT2 |
| uprv_getCharNameCharacters(const USetAdder *sa) { |
| charSetToUSet(gNameSet, sa); |
| } |
| |
| /* data swapping ------------------------------------------------------------ */ |
| |
| /* |
| * The token table contains non-negative entries for token bytes, |
| * and -1 for bytes that represent themselves in the data file's charset. |
| * -2 entries are used for lead bytes. |
| * |
| * Direct bytes (-1 entries) must be translated from the input charset family |
| * to the output charset family. |
| * makeTokenMap() writes a permutation mapping for this. |
| * Use it once for single-/lead-byte tokens and once more for all trail byte |
| * tokens. (';' is an unused trail byte marked with -1.) |
| */ |
| static void |
| makeTokenMap(const UDataSwapper *ds, |
| int16_t tokens[], uint16_t tokenCount, |
| uint8_t map[256], |
| UErrorCode *pErrorCode) { |
| UBool usedOutChar[256]; |
| uint16_t i, j; |
| uint8_t c1, c2; |
| |
| if(U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| if(ds->inCharset==ds->outCharset) { |
| /* Same charset family: identity permutation */ |
| for(i=0; i<256; ++i) { |
| map[i]=(uint8_t)i; |
| } |
| } else { |
| uprv_memset(map, 0, 256); |
| uprv_memset(usedOutChar, 0, 256); |
| |
| if(tokenCount>256) { |
| tokenCount=256; |
| } |
| |
| /* set the direct bytes (byte 0 always maps to itself) */ |
| for(i=1; i<tokenCount; ++i) { |
| if(tokens[i]==-1) { |
| /* convert the direct byte character */ |
| c1=(uint8_t)i; |
| ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", |
| i, ds->inCharset); |
| return; |
| } |
| |
| /* enter the converted character into the map and mark it used */ |
| map[c1]=c2; |
| usedOutChar[c2]=TRUE; |
| } |
| } |
| |
| /* set the mappings for the rest of the permutation */ |
| for(i=j=1; i<tokenCount; ++i) { |
| /* set mappings that were not set for direct bytes */ |
| if(map[i]==0) { |
| /* set an output byte value that was not used as an output byte above */ |
| while(usedOutChar[j]) { |
| ++j; |
| } |
| map[i]=(uint8_t)j++; |
| } |
| } |
| |
| /* |
| * leave mappings at tokenCount and above unset if tokenCount<256 |
| * because they won't be used |
| */ |
| } |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| uchar_swapNames(const UDataSwapper *ds, |
| const void *inData, int32_t length, void *outData, |
| UErrorCode *pErrorCode) { |
| const UDataInfo *pInfo; |
| int32_t headerSize; |
| |
| const uint8_t *inBytes; |
| uint8_t *outBytes; |
| |
| uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, |
| offset, i, count, stringsCount; |
| |
| const AlgorithmicRange *inRange; |
| AlgorithmicRange *outRange; |
| |
| /* udata_swapDataHeader checks the arguments */ |
| headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
| if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| |
| /* check data format and format version */ |
| pInfo=(const UDataInfo *)((const char *)inData+4); |
| if(!( |
| pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
| pInfo->dataFormat[1]==0x6e && |
| pInfo->dataFormat[2]==0x61 && |
| pInfo->dataFormat[3]==0x6d && |
| pInfo->formatVersion[0]==1 |
| )) { |
| udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", |
| pInfo->dataFormat[0], pInfo->dataFormat[1], |
| pInfo->dataFormat[2], pInfo->dataFormat[3], |
| pInfo->formatVersion[0]); |
| *pErrorCode=U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| |
| inBytes=(const uint8_t *)inData+headerSize; |
| outBytes=(uint8_t *)outData+headerSize; |
| if(length<0) { |
| algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); |
| } else { |
| length-=headerSize; |
| if( length<20 || |
| (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) |
| ) { |
| udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", |
| length); |
| *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| } |
| |
| if(length<0) { |
| /* preflighting: iterate through algorithmic ranges */ |
| offset=algNamesOffset; |
| count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
| offset+=4; |
| |
| for(i=0; i<count; ++i) { |
| inRange=(const AlgorithmicRange *)(inBytes+offset); |
| offset+=ds->readUInt16(inRange->size); |
| } |
| } else { |
| /* swap data */ |
| const uint16_t *p; |
| uint16_t *q, *temp; |
| |
| int16_t tokens[512]; |
| uint16_t tokenCount; |
| |
| uint8_t map[256], trailMap[256]; |
| |
| /* copy the data for inaccessible bytes */ |
| if(inBytes!=outBytes) { |
| uprv_memcpy(outBytes, inBytes, length); |
| } |
| |
| /* the initial 4 offsets first */ |
| tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); |
| groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); |
| groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); |
| ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); |
| |
| /* |
| * now the tokens table |
| * it needs to be permutated along with the compressed name strings |
| */ |
| p=(const uint16_t *)(inBytes+16); |
| q=(uint16_t *)(outBytes+16); |
| |
| /* read and swap the tokenCount */ |
| tokenCount=ds->readUInt16(*p); |
| ds->swapArray16(ds, p, 2, q, pErrorCode); |
| ++p; |
| ++q; |
| |
| /* read the first 512 tokens and make the token maps */ |
| if(tokenCount<=512) { |
| count=tokenCount; |
| } else { |
| count=512; |
| } |
| for(i=0; i<count; ++i) { |
| tokens[i]=udata_readInt16(ds, p[i]); |
| } |
| for(; i<512; ++i) { |
| tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ |
| } |
| makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); |
| makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| |
| /* |
| * swap and permutate the tokens |
| * go through a temporary array to support in-place swapping |
| */ |
| temp=(uint16_t *)uprv_malloc(tokenCount*2); |
| if(temp==NULL) { |
| udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", |
| tokenCount); |
| *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| |
| /* swap and permutate single-/lead-byte tokens */ |
| for(i=0; i<tokenCount && i<256; ++i) { |
| ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); |
| } |
| |
| /* swap and permutate trail-byte tokens */ |
| for(; i<tokenCount; ++i) { |
| ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); |
| } |
| |
| /* copy the result into the output and free the temporary array */ |
| uprv_memcpy(q, temp, tokenCount*2); |
| uprv_free(temp); |
| |
| /* |
| * swap the token strings but not a possible padding byte after |
| * the terminating NUL of the last string |
| */ |
| udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), |
| outBytes+tokenStringOffset, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| udata_printError(ds, "uchar_swapNames(token strings) failed\n"); |
| return 0; |
| } |
| |
| /* swap the group table */ |
| count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); |
| ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), |
| outBytes+groupsOffset, pErrorCode); |
| |
| /* |
| * swap the group strings |
| * swap the string bytes but not the nibble-encoded string lengths |
| */ |
| if(ds->inCharset!=ds->outCharset) { |
| uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; |
| |
| const uint8_t *inStrings, *nextInStrings; |
| uint8_t *outStrings; |
| |
| uint8_t c; |
| |
| inStrings=inBytes+groupStringOffset; |
| outStrings=outBytes+groupStringOffset; |
| |
| stringsCount=algNamesOffset-groupStringOffset; |
| |
| /* iterate through string groups until only a few padding bytes are left */ |
| while(stringsCount>32) { |
| nextInStrings=expandGroupLengths(inStrings, offsets, lengths); |
| |
| /* move past the length bytes */ |
| stringsCount-=(uint32_t)(nextInStrings-inStrings); |
| outStrings+=nextInStrings-inStrings; |
| inStrings=nextInStrings; |
| |
| count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ |
| stringsCount-=count; |
| |
| /* swap the string bytes using map[] and trailMap[] */ |
| while(count>0) { |
| c=*inStrings++; |
| *outStrings++=map[c]; |
| if(tokens[c]!=-2) { |
| --count; |
| } else { |
| /* token lead byte: swap the trail byte, too */ |
| *outStrings++=trailMap[*inStrings++]; |
| count-=2; |
| } |
| } |
| } |
| } |
| |
| /* swap the algorithmic ranges */ |
| offset=algNamesOffset; |
| count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
| ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); |
| offset+=4; |
| |
| for(i=0; i<count; ++i) { |
| if(offset>(uint32_t)length) { |
| udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", |
| length, i); |
| *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| } |
| |
| inRange=(const AlgorithmicRange *)(inBytes+offset); |
| outRange=(AlgorithmicRange *)(outBytes+offset); |
| offset+=ds->readUInt16(inRange->size); |
| |
| ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); |
| ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); |
| switch(inRange->type) { |
| case 0: |
| /* swap prefix string */ |
| ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), |
| outRange+1, pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", |
| i); |
| return 0; |
| } |
| break; |
| case 1: |
| { |
| /* swap factors and the prefix and factor strings */ |
| uint32_t factorsCount; |
| |
| factorsCount=inRange->variant; |
| p=(const uint16_t *)(inRange+1); |
| q=(uint16_t *)(outRange+1); |
| ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); |
| |
| /* swap the strings, up to the last terminating NUL */ |
| p+=factorsCount; |
| q+=factorsCount; |
| stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); |
| while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { |
| --stringsCount; |
| } |
| ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); |
| } |
| break; |
| default: |
| udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", |
| inRange->type, i); |
| *pErrorCode=U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| } |
| } |
| |
| return headerSize+(int32_t)offset; |
| } |
| |
| /* |
| * Hey, Emacs, please set the following: |
| * |
| * Local Variables: |
| * indent-tabs-mode: nil |
| * End: |
| * |
| */ |