| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2003-2013, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: ucm.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2003jun20 |
| * created by: Markus W. Scherer |
| * |
| * This file reads a .ucm file, stores its mappings and sorts them. |
| * It implements handling of Unicode conversion mappings from .ucm files |
| * for makeconv, canonucm, rptp2ucm, etc. |
| * |
| * Unicode code point sequences with a length of more than 1, |
| * as well as byte sequences with more than 4 bytes or more than one complete |
| * character sequence are handled to support m:n mappings. |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ustring.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "filestrm.h" |
| #include "uarrsort.h" |
| #include "ucnvmbcs.h" |
| #include "ucnv_bld.h" |
| #include "ucnv_ext.h" |
| #include "uparse.h" |
| #include "ucm.h" |
| #include <stdio.h> |
| |
| #if !UCONFIG_NO_CONVERSION |
| |
| /* -------------------------------------------------------------------------- */ |
| |
| static void |
| printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { |
| int32_t j; |
| |
| for(j=0; j<m->uLen; ++j) { |
| fprintf(f, "<U%04lX>", (long)codePoints[j]); |
| } |
| |
| fputc(' ', f); |
| |
| for(j=0; j<m->bLen; ++j) { |
| fprintf(f, "\\x%02X", bytes[j]); |
| } |
| |
| if(m->f>=0) { |
| fprintf(f, " |%u\n", m->f); |
| } else { |
| fputs("\n", f); |
| } |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { |
| printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { |
| UCMapping *m; |
| int32_t i, length; |
| |
| m=table->mappings; |
| length=table->mappingsLength; |
| if(byUnicode) { |
| for(i=0; i<length; ++m, ++i) { |
| ucm_printMapping(table, m, f); |
| } |
| } else { |
| const int32_t *map=table->reverseMap; |
| for(i=0; i<length; ++i) { |
| ucm_printMapping(table, m+map[i], f); |
| } |
| } |
| } |
| |
| /* mapping comparisons ------------------------------------------------------ */ |
| |
| static int32_t |
| compareUnicode(UCMTable *lTable, const UCMapping *l, |
| UCMTable *rTable, const UCMapping *r) { |
| const UChar32 *lu, *ru; |
| int32_t result, i, length; |
| |
| if(l->uLen==1 && r->uLen==1) { |
| /* compare two single code points */ |
| return l->u-r->u; |
| } |
| |
| /* get pointers to the code point sequences */ |
| lu=UCM_GET_CODE_POINTS(lTable, l); |
| ru=UCM_GET_CODE_POINTS(rTable, r); |
| |
| /* get the minimum length */ |
| if(l->uLen<=r->uLen) { |
| length=l->uLen; |
| } else { |
| length=r->uLen; |
| } |
| |
| /* compare the code points */ |
| for(i=0; i<length; ++i) { |
| result=lu[i]-ru[i]; |
| if(result!=0) { |
| return result; |
| } |
| } |
| |
| /* compare the lengths */ |
| return l->uLen-r->uLen; |
| } |
| |
| static int32_t |
| compareBytes(UCMTable *lTable, const UCMapping *l, |
| UCMTable *rTable, const UCMapping *r, |
| UBool lexical) { |
| const uint8_t *lb, *rb; |
| int32_t result, i, length; |
| |
| /* |
| * A lexical comparison is used for sorting in the builder, to allow |
| * an efficient search for a byte sequence that could be a prefix |
| * of a previously entered byte sequence. |
| * |
| * Comparing by lengths first is for compatibility with old .ucm tools |
| * like canonucm and rptp2ucm. |
| */ |
| if(lexical) { |
| /* get the minimum length and continue */ |
| if(l->bLen<=r->bLen) { |
| length=l->bLen; |
| } else { |
| length=r->bLen; |
| } |
| } else { |
| /* compare lengths first */ |
| result=l->bLen-r->bLen; |
| if(result!=0) { |
| return result; |
| } else { |
| length=l->bLen; |
| } |
| } |
| |
| /* get pointers to the byte sequences */ |
| lb=UCM_GET_BYTES(lTable, l); |
| rb=UCM_GET_BYTES(rTable, r); |
| |
| /* compare the bytes */ |
| for(i=0; i<length; ++i) { |
| result=lb[i]-rb[i]; |
| if(result!=0) { |
| return result; |
| } |
| } |
| |
| /* compare the lengths */ |
| return l->bLen-r->bLen; |
| } |
| |
| /* compare UCMappings for sorting */ |
| static int32_t |
| compareMappings(UCMTable *lTable, const UCMapping *l, |
| UCMTable *rTable, const UCMapping *r, |
| UBool uFirst) { |
| int32_t result; |
| |
| /* choose which side to compare first */ |
| if(uFirst) { |
| /* Unicode then bytes */ |
| result=compareUnicode(lTable, l, rTable, r); |
| if(result==0) { |
| result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */ |
| } |
| } else { |
| /* bytes then Unicode */ |
| result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */ |
| if(result==0) { |
| result=compareUnicode(lTable, l, rTable, r); |
| } |
| } |
| |
| if(result!=0) { |
| return result; |
| } |
| |
| /* compare the flags */ |
| return l->f-r->f; |
| } |
| |
| /* sorting by Unicode first sorts mappings directly */ |
| static int32_t |
| compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { |
| return compareMappings( |
| (UCMTable *)context, (const UCMapping *)left, |
| (UCMTable *)context, (const UCMapping *)right, TRUE); |
| } |
| |
| /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ |
| static int32_t |
| compareMappingsBytesFirst(const void *context, const void *left, const void *right) { |
| UCMTable *table=(UCMTable *)context; |
| int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; |
| return compareMappings( |
| table, table->mappings+l, |
| table, table->mappings+r, FALSE); |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_sortTable(UCMTable *t) { |
| UErrorCode errorCode; |
| int32_t i; |
| |
| if(t->isSorted) { |
| return; |
| } |
| |
| errorCode=U_ZERO_ERROR; |
| |
| /* 1. sort by Unicode first */ |
| uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), |
| compareMappingsUnicodeFirst, t, |
| FALSE, &errorCode); |
| |
| /* build the reverseMap */ |
| if(t->reverseMap==NULL) { |
| /* |
| * allocate mappingsCapacity instead of mappingsLength so that |
| * if mappings are added, the reverseMap need not be |
| * reallocated each time |
| * (see ucm_moveMappings() and ucm_addMapping()) |
| */ |
| t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); |
| if(t->reverseMap==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| } |
| for(i=0; i<t->mappingsLength; ++i) { |
| t->reverseMap[i]=i; |
| } |
| |
| /* 2. sort reverseMap by mappings bytes first */ |
| uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), |
| compareMappingsBytesFirst, t, |
| FALSE, &errorCode); |
| |
| if(U_FAILURE(errorCode)) { |
| fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", |
| u_errorName(errorCode)); |
| exit(errorCode); |
| } |
| |
| t->isSorted=TRUE; |
| } |
| |
| /* |
| * remove mappings with their move flag set from the base table |
| * and move some of them (with UCM_MOVE_TO_EXT) to the extension table |
| */ |
| U_CAPI void U_EXPORT2 |
| ucm_moveMappings(UCMTable *base, UCMTable *ext) { |
| UCMapping *mb, *mbLimit; |
| int8_t flag; |
| |
| mb=base->mappings; |
| mbLimit=mb+base->mappingsLength; |
| |
| while(mb<mbLimit) { |
| flag=mb->moveFlag; |
| if(flag!=0) { |
| /* reset the move flag */ |
| mb->moveFlag=0; |
| |
| if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { |
| /* add the mapping to the extension table */ |
| ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); |
| } |
| |
| /* remove this mapping: move the last base mapping down and overwrite the current one */ |
| if(mb<(mbLimit-1)) { |
| uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); |
| } |
| --mbLimit; |
| --base->mappingsLength; |
| base->isSorted=FALSE; |
| } else { |
| ++mb; |
| } |
| } |
| } |
| |
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    NEEDS_MOVE=1<<0,    /* at least one mapping had its moveFlag set */
    HAS_ERRORS=1<<1     /* at least one error was reported */
};
| |
| static uint8_t |
| checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
| UBool moveToExt, UBool intersectBase) { |
| UCMapping *mb, *me, *mbLimit, *meLimit; |
| int32_t cmp; |
| uint8_t result; |
| |
| mb=base->mappings; |
| mbLimit=mb+base->mappingsLength; |
| |
| me=ext->mappings; |
| meLimit=me+ext->mappingsLength; |
| |
| result=0; |
| |
| for(;;) { |
| /* skip irrelevant mappings on both sides */ |
| for(;;) { |
| if(mb==mbLimit) { |
| return result; |
| } |
| |
| if((0<=mb->f && mb->f<=2) || mb->f==4) { |
| break; |
| } |
| |
| ++mb; |
| } |
| |
| for(;;) { |
| if(me==meLimit) { |
| return result; |
| } |
| |
| if((0<=me->f && me->f<=2) || me->f==4) { |
| break; |
| } |
| |
| ++me; |
| } |
| |
| /* compare the base and extension mappings */ |
| cmp=compareUnicode(base, mb, ext, me); |
| if(cmp<0) { |
| if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { |
| /* |
| * mapping in base but not in ext, move it |
| * |
| * if ext is DBCS, move DBCS mappings here |
| * and check SBCS ones for Unicode prefix below |
| */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| |
| /* does mb map from an input sequence that is a prefix of me's? */ |
| } else if( mb->uLen<me->uLen && |
| 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
| ) { |
| if(moveToExt) { |
| /* mark this mapping to be moved to the extension table */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| } else { |
| fprintf(stderr, |
| "ucm error: the base table contains a mapping whose input sequence\n" |
| " is a prefix of the input sequence of an extension mapping\n"); |
| ucm_printMapping(base, mb, stderr); |
| ucm_printMapping(ext, me, stderr); |
| result|=HAS_ERRORS; |
| } |
| } |
| |
| ++mb; |
| } else if(cmp==0) { |
| /* |
| * same output: remove the extension mapping, |
| * otherwise treat as an error |
| */ |
| if( mb->f==me->f && mb->bLen==me->bLen && |
| 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
| ) { |
| me->moveFlag|=UCM_REMOVE_MAPPING; |
| result|=NEEDS_MOVE; |
| } else if(intersectBase) { |
| /* mapping in base but not in ext, move it */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| } else { |
| fprintf(stderr, |
| "ucm error: the base table contains a mapping whose input sequence\n" |
| " is the same as the input sequence of an extension mapping\n" |
| " but it maps differently\n"); |
| ucm_printMapping(base, mb, stderr); |
| ucm_printMapping(ext, me, stderr); |
| result|=HAS_ERRORS; |
| } |
| |
| ++mb; |
| } else /* cmp>0 */ { |
| ++me; |
| } |
| } |
| } |
| |
| static uint8_t |
| checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
| UBool moveToExt, UBool intersectBase) { |
| UCMapping *mb, *me; |
| int32_t *baseMap, *extMap; |
| int32_t b, e, bLimit, eLimit, cmp; |
| uint8_t result; |
| UBool isSISO; |
| |
| baseMap=base->reverseMap; |
| extMap=ext->reverseMap; |
| |
| b=e=0; |
| bLimit=base->mappingsLength; |
| eLimit=ext->mappingsLength; |
| |
| result=0; |
| |
| isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); |
| |
| for(;;) { |
| /* skip irrelevant mappings on both sides */ |
| for(;; ++b) { |
| if(b==bLimit) { |
| return result; |
| } |
| mb=base->mappings+baseMap[b]; |
| |
| if(intersectBase==2 && mb->bLen==1) { |
| /* |
| * comparing a base against a DBCS extension: |
| * leave SBCS base mappings alone |
| */ |
| continue; |
| } |
| |
| if(mb->f==0 || mb->f==3) { |
| break; |
| } |
| } |
| |
| for(;;) { |
| if(e==eLimit) { |
| return result; |
| } |
| me=ext->mappings+extMap[e]; |
| |
| if(me->f==0 || me->f==3) { |
| break; |
| } |
| |
| ++e; |
| } |
| |
| /* compare the base and extension mappings */ |
| cmp=compareBytes(base, mb, ext, me, TRUE); |
| if(cmp<0) { |
| if(intersectBase) { |
| /* mapping in base but not in ext, move it */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| |
| /* |
| * does mb map from an input sequence that is a prefix of me's? |
| * for SI/SO tables, a single byte is never a prefix because it |
| * occurs in a separate single-byte state |
| */ |
| } else if( mb->bLen<me->bLen && |
| (!isSISO || mb->bLen>1) && |
| 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
| ) { |
| if(moveToExt) { |
| /* mark this mapping to be moved to the extension table */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| } else { |
| fprintf(stderr, |
| "ucm error: the base table contains a mapping whose input sequence\n" |
| " is a prefix of the input sequence of an extension mapping\n"); |
| ucm_printMapping(base, mb, stderr); |
| ucm_printMapping(ext, me, stderr); |
| result|=HAS_ERRORS; |
| } |
| } |
| |
| ++b; |
| } else if(cmp==0) { |
| /* |
| * same output: remove the extension mapping, |
| * otherwise treat as an error |
| */ |
| if( mb->f==me->f && mb->uLen==me->uLen && |
| 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
| ) { |
| me->moveFlag|=UCM_REMOVE_MAPPING; |
| result|=NEEDS_MOVE; |
| } else if(intersectBase) { |
| /* mapping in base but not in ext, move it */ |
| mb->moveFlag|=UCM_MOVE_TO_EXT; |
| result|=NEEDS_MOVE; |
| } else { |
| fprintf(stderr, |
| "ucm error: the base table contains a mapping whose input sequence\n" |
| " is the same as the input sequence of an extension mapping\n" |
| " but it maps differently\n"); |
| ucm_printMapping(base, mb, stderr); |
| ucm_printMapping(ext, me, stderr); |
| result|=HAS_ERRORS; |
| } |
| |
| ++b; |
| } else /* cmp>0 */ { |
| ++e; |
| } |
| } |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { |
| UCMapping *m, *mLimit; |
| int32_t count; |
| UBool isOK; |
| |
| m=table->mappings; |
| mLimit=m+table->mappingsLength; |
| isOK=TRUE; |
| |
| while(m<mLimit) { |
| count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); |
| if(count<1) { |
| ucm_printMapping(table, m, stderr); |
| isOK=FALSE; |
| } |
| ++m; |
| } |
| |
| return isOK; |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| ucm_checkBaseExt(UCMStates *baseStates, |
| UCMTable *base, UCMTable *ext, UCMTable *moveTarget, |
| UBool intersectBase) { |
| uint8_t result; |
| |
| /* if we have an extension table, we must always use precision flags */ |
| if(base->flagsType&UCM_FLAGS_IMPLICIT) { |
| fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); |
| return FALSE; |
| } |
| if(ext->flagsType&UCM_FLAGS_IMPLICIT) { |
| fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); |
| return FALSE; |
| } |
| |
| /* checking requires both tables to be sorted */ |
| ucm_sortTable(base); |
| ucm_sortTable(ext); |
| |
| /* check */ |
| result= |
| checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| |
| checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); |
| |
| if(result&HAS_ERRORS) { |
| return FALSE; |
| } |
| |
| if(result&NEEDS_MOVE) { |
| ucm_moveMappings(ext, NULL); |
| ucm_moveMappings(base, moveTarget); |
| ucm_sortTable(base); |
| ucm_sortTable(ext); |
| if(moveTarget!=NULL) { |
| ucm_sortTable(moveTarget); |
| } |
| } |
| |
| return TRUE; |
| } |
| |
| /* merge tables for rptp2ucm ------------------------------------------------ */ |
| |
| U_CAPI void U_EXPORT2 |
| ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, |
| const uint8_t *subchar, int32_t subcharLength, |
| uint8_t subchar1) { |
| UCMapping *fromUMapping, *toUMapping; |
| int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; |
| |
| ucm_sortTable(fromUTable); |
| ucm_sortTable(toUTable); |
| |
| fromUMapping=fromUTable->mappings; |
| toUMapping=toUTable->mappings; |
| |
| fromUTop=fromUTable->mappingsLength; |
| toUTop=toUTable->mappingsLength; |
| |
| fromUIndex=toUIndex=0; |
| |
| while(fromUIndex<fromUTop && toUIndex<toUTop) { |
| cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE); |
| if(cmp==0) { |
| /* equal: roundtrip, nothing to do (flags are initially 0) */ |
| ++fromUMapping; |
| ++toUMapping; |
| |
| ++fromUIndex; |
| ++toUIndex; |
| } else if(cmp<0) { |
| /* |
| * the fromU mapping does not have a toU counterpart: |
| * fallback Unicode->codepage |
| */ |
| if( (fromUMapping->bLen==subcharLength && |
| 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
| (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
| ) { |
| fromUMapping->f=2; /* SUB mapping */ |
| } else { |
| fromUMapping->f=1; /* normal fallback */ |
| } |
| |
| ++fromUMapping; |
| ++fromUIndex; |
| } else { |
| /* |
| * the toU mapping does not have a fromU counterpart: |
| * (reverse) fallback codepage->Unicode, copy it to the fromU table |
| */ |
| |
| /* ignore reverse fallbacks to Unicode SUB */ |
| if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
| toUMapping->f=3; /* reverse fallback */ |
| ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
| |
| /* the table may have been reallocated */ |
| fromUMapping=fromUTable->mappings+fromUIndex; |
| } |
| |
| ++toUMapping; |
| ++toUIndex; |
| } |
| } |
| |
| /* either one or both tables are exhausted */ |
| while(fromUIndex<fromUTop) { |
| /* leftover fromU mappings are fallbacks */ |
| if( (fromUMapping->bLen==subcharLength && |
| 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
| (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
| ) { |
| fromUMapping->f=2; /* SUB mapping */ |
| } else { |
| fromUMapping->f=1; /* normal fallback */ |
| } |
| |
| ++fromUMapping; |
| ++fromUIndex; |
| } |
| |
| while(toUIndex<toUTop) { |
| /* leftover toU mappings are reverse fallbacks */ |
| |
| /* ignore reverse fallbacks to Unicode SUB */ |
| if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
| toUMapping->f=3; /* reverse fallback */ |
| ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
| } |
| |
| ++toUMapping; |
| ++toUIndex; |
| } |
| |
| fromUTable->isSorted=FALSE; |
| } |
| |
| /* separate extension mappings out of base table for rptp2ucm --------------- */ |
| |
| U_CAPI UBool U_EXPORT2 |
| ucm_separateMappings(UCMFile *ucm, UBool isSISO) { |
| UCMTable *table; |
| UCMapping *m, *mLimit; |
| int32_t type; |
| UBool needsMove, isOK; |
| |
| table=ucm->base; |
| m=table->mappings; |
| mLimit=m+table->mappingsLength; |
| |
| needsMove=FALSE; |
| isOK=TRUE; |
| |
| for(; m<mLimit; ++m) { |
| if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { |
| fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); |
| ucm_printMapping(table, m, stderr); |
| m->moveFlag|=UCM_REMOVE_MAPPING; |
| needsMove=TRUE; |
| continue; |
| } |
| |
| type=ucm_mappingType( |
| &ucm->states, m, |
| UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); |
| if(type<0) { |
| /* illegal byte sequence */ |
| printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); |
| isOK=FALSE; |
| } else if(type>0) { |
| m->moveFlag|=UCM_MOVE_TO_EXT; |
| needsMove=TRUE; |
| } |
| } |
| |
| if(!isOK) { |
| return FALSE; |
| } |
| if(needsMove) { |
| ucm_moveMappings(ucm->base, ucm->ext); |
| return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); |
| } else { |
| ucm_sortTable(ucm->base); |
| return TRUE; |
| } |
| } |
| |
| /* ucm parser --------------------------------------------------------------- */ |
| |
| U_CAPI int8_t U_EXPORT2 |
| ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { |
| const char *s=*ps; |
| char *end; |
| uint8_t byte; |
| int8_t bLen; |
| |
| bLen=0; |
| for(;;) { |
| /* skip an optional plus sign */ |
| if(bLen>0 && *s=='+') { |
| ++s; |
| } |
| if(*s!='\\') { |
| break; |
| } |
| |
| if( s[1]!='x' || |
| (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 |
| ) { |
| fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); |
| return -1; |
| } |
| |
| if(bLen==UCNV_EXT_MAX_BYTES) { |
| fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); |
| return -1; |
| } |
| bytes[bLen++]=byte; |
| s=end; |
| } |
| |
| *ps=s; |
| return bLen; |
| } |
| |
| /* parse a mapping line; must not be empty */ |
| U_CAPI UBool U_EXPORT2 |
| ucm_parseMappingLine(UCMapping *m, |
| UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| uint8_t bytes[UCNV_EXT_MAX_BYTES], |
| const char *line) { |
| const char *s; |
| char *end; |
| UChar32 cp; |
| int32_t u16Length; |
| int8_t uLen, bLen, f; |
| |
| s=line; |
| uLen=bLen=0; |
| |
| /* parse code points */ |
| for(;;) { |
| /* skip an optional plus sign */ |
| if(uLen>0 && *s=='+') { |
| ++s; |
| } |
| if(*s!='<') { |
| break; |
| } |
| |
| if( s[1]!='U' || |
| (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || |
| *end!='>' |
| ) { |
| fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); |
| return FALSE; |
| } |
| if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { |
| fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); |
| return FALSE; |
| } |
| |
| if(uLen==UCNV_EXT_MAX_UCHARS) { |
| fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); |
| return FALSE; |
| } |
| codePoints[uLen++]=cp; |
| s=end+1; |
| } |
| |
| if(uLen==0) { |
| fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); |
| return FALSE; |
| } else if(uLen==1) { |
| m->u=codePoints[0]; |
| } else { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); |
| if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || |
| u16Length>UCNV_EXT_MAX_UCHARS |
| ) { |
| fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); |
| return FALSE; |
| } |
| } |
| |
| s=u_skipWhitespace(s); |
| |
| /* parse bytes */ |
| bLen=ucm_parseBytes(bytes, line, &s); |
| |
| if(bLen<0) { |
| return FALSE; |
| } else if(bLen==0) { |
| fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); |
| return FALSE; |
| } else if(bLen<=4) { |
| uprv_memcpy(m->b.bytes, bytes, bLen); |
| } |
| |
| /* skip everything until the fallback indicator, even the start of a comment */ |
| for(;;) { |
| if(*s==0) { |
| f=-1; /* no fallback indicator */ |
| break; |
| } else if(*s=='|') { |
| f=(int8_t)(s[1]-'0'); |
| if((uint8_t)f>4) { |
| fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); |
| return FALSE; |
| } |
| break; |
| } |
| ++s; |
| } |
| |
| m->uLen=uLen; |
| m->bLen=bLen; |
| m->f=f; |
| return TRUE; |
| } |
| |
| /* general APIs ------------------------------------------------------------- */ |
| |
| U_CAPI UCMTable * U_EXPORT2 |
| ucm_openTable() { |
| UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); |
| if(table==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| |
| memset(table, 0, sizeof(UCMTable)); |
| return table; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_closeTable(UCMTable *table) { |
| if(table!=NULL) { |
| uprv_free(table->mappings); |
| uprv_free(table->codePoints); |
| uprv_free(table->bytes); |
| uprv_free(table->reverseMap); |
| uprv_free(table); |
| } |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_resetTable(UCMTable *table) { |
| if(table!=NULL) { |
| table->mappingsLength=0; |
| table->flagsType=0; |
| table->unicodeMask=0; |
| table->bytesLength=table->codePointsLength=0; |
| table->isSorted=FALSE; |
| } |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_addMapping(UCMTable *table, |
| UCMapping *m, |
| UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
| UCMapping *tm; |
| UChar32 c; |
| int32_t idx; |
| |
| if(table->mappingsLength>=table->mappingsCapacity) { |
| /* make the mappings array larger */ |
| if(table->mappingsCapacity==0) { |
| table->mappingsCapacity=1000; |
| } else { |
| table->mappingsCapacity*=10; |
| } |
| table->mappings=(UCMapping *)uprv_realloc(table->mappings, |
| table->mappingsCapacity*sizeof(UCMapping)); |
| if(table->mappings==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", |
| (int)table->mappingsCapacity); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| |
| if(table->reverseMap!=NULL) { |
| /* the reverseMap must be reallocated in a new sort */ |
| uprv_free(table->reverseMap); |
| table->reverseMap=NULL; |
| } |
| } |
| |
| if(m->uLen>1 && table->codePointsCapacity==0) { |
| table->codePointsCapacity=10000; |
| table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); |
| if(table->codePoints==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", |
| (int)table->codePointsCapacity); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| } |
| |
| if(m->bLen>4 && table->bytesCapacity==0) { |
| table->bytesCapacity=10000; |
| table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); |
| if(table->bytes==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate %d bytes\n", |
| (int)table->bytesCapacity); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| } |
| |
| if(m->uLen>1) { |
| idx=table->codePointsLength; |
| table->codePointsLength+=m->uLen; |
| if(table->codePointsLength>table->codePointsCapacity) { |
| fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| |
| uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4); |
| m->u=idx; |
| } |
| |
| if(m->bLen>4) { |
| idx=table->bytesLength; |
| table->bytesLength+=m->bLen; |
| if(table->bytesLength>table->bytesCapacity) { |
| fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| |
| uprv_memcpy(table->bytes+idx, bytes, m->bLen); |
| m->b.idx=idx; |
| } |
| |
| /* set unicodeMask */ |
| for(idx=0; idx<m->uLen; ++idx) { |
| c=codePoints[idx]; |
| if(c>=0x10000) { |
| table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ |
| } else if(U_IS_SURROGATE(c)) { |
| table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ |
| } |
| } |
| |
| /* set flagsType */ |
| if(m->f<0) { |
| table->flagsType|=UCM_FLAGS_IMPLICIT; |
| } else { |
| table->flagsType|=UCM_FLAGS_EXPLICIT; |
| } |
| |
| tm=table->mappings+table->mappingsLength++; |
| uprv_memcpy(tm, m, sizeof(UCMapping)); |
| |
| table->isSorted=FALSE; |
| } |
| |
| U_CAPI UCMFile * U_EXPORT2 |
| ucm_open() { |
| UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); |
| if(ucm==NULL) { |
| fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); |
| exit(U_MEMORY_ALLOCATION_ERROR); |
| } |
| |
| memset(ucm, 0, sizeof(UCMFile)); |
| |
| ucm->base=ucm_openTable(); |
| ucm->ext=ucm_openTable(); |
| |
| ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; |
| ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; |
| ucm->states.outputType=-1; |
| ucm->states.minCharLength=ucm->states.maxCharLength=1; |
| |
| return ucm; |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_close(UCMFile *ucm) { |
| if(ucm!=NULL) { |
| ucm_closeTable(ucm->base); |
| ucm_closeTable(ucm->ext); |
| uprv_free(ucm); |
| } |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| ucm_mappingType(UCMStates *baseStates, |
| UCMapping *m, |
| UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
| /* check validity of the bytes and count the characters in them */ |
| int32_t count=ucm_countChars(baseStates, bytes, m->bLen); |
| if(count<1) { |
| /* illegal byte sequence */ |
| return -1; |
| } |
| |
| /* |
| * Suitable for an ICU conversion base table means: |
| * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) |
| * - precision flag 0..3 |
| * - SBCS: any 1:1 mapping |
| * (the table stores additional bits to distinguish mapping types) |
| * - MBCS: not a |2 SUB mapping for <subchar1> |
| * - MBCS: not a |1 fallback to 0x00 |
| * - MBCS: not a multi-byte mapping with leading 0x00 bytes |
| * |
| * Further restrictions for fromUnicode tables |
| * are enforced in makeconv (MBCSOkForBaseFromUnicode()). |
| * |
| * All of the MBCS fromUnicode specific tests could be removed from here, |
| * but the ones above are for unusual mappings, and removing the tests |
| * from here would change canonucm output which seems gratuitous. |
| * (Markus Scherer 2006-nov-28) |
| * |
| * Exception: All implicit mappings (f<0) that need to be moved |
| * because of fromUnicode restrictions _must_ be moved here because |
| * makeconv uses a hack for moving mappings only for the fromUnicode table |
| * that only works with non-negative values of f. |
| */ |
| if( m->uLen==1 && count==1 && m->f<=3 && |
| (baseStates->maxCharLength==1 || |
| !((m->f==2 && m->bLen==1) || |
| (m->f==1 && bytes[0]==0) || |
| (m->f<=1 && m->bLen>1 && bytes[0]==0))) |
| ) { |
| return 0; /* suitable for a base table */ |
| } else { |
| return 1; /* needs to go into an extension table */ |
| } |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, |
| UCMapping *m, |
| UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
| int32_t type; |
| |
| if(m->f==2 && m->uLen>1) { |
| fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); |
| printMapping(m, codePoints, bytes, stderr); |
| return FALSE; |
| } |
| |
| if(baseStates!=NULL) { |
| /* check validity of the bytes and count the characters in them */ |
| type=ucm_mappingType(baseStates, m, codePoints, bytes); |
| if(type<0) { |
| /* illegal byte sequence */ |
| printMapping(m, codePoints, bytes, stderr); |
| return FALSE; |
| } |
| } else { |
| /* not used - adding a mapping for an extension-only table before its base table is read */ |
| type=1; |
| } |
| |
| /* |
| * Add the mapping to the base table if this is requested and suitable. |
| * Otherwise, add it to the extension table. |
| */ |
| if(forBase && type==0) { |
| ucm_addMapping(ucm->base, m, codePoints, bytes); |
| } else { |
| ucm_addMapping(ucm->ext, m, codePoints, bytes); |
| } |
| |
| return TRUE; |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { |
| UCMapping m={ 0 }; |
| UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; |
| uint8_t bytes[UCNV_EXT_MAX_BYTES]; |
| |
| const char *s; |
| |
| /* ignore empty and comment lines */ |
| if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { |
| return TRUE; |
| } |
| |
| return |
| ucm_parseMappingLine(&m, codePoints, bytes, line) && |
| ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); |
| } |
| |
| U_CAPI void U_EXPORT2 |
| ucm_readTable(UCMFile *ucm, FileStream* convFile, |
| UBool forBase, UCMStates *baseStates, |
| UErrorCode *pErrorCode) { |
| char line[500]; |
| char *end; |
| UBool isOK; |
| |
| if(U_FAILURE(*pErrorCode)) { |
| return; |
| } |
| |
| isOK=TRUE; |
| |
| for(;;) { |
| /* read the next line */ |
| if(!T_FileStream_readLine(convFile, line, sizeof(line))) { |
| fprintf(stderr, "incomplete charmap section\n"); |
| isOK=FALSE; |
| break; |
| } |
| |
| /* remove CR LF */ |
| end=uprv_strchr(line, 0); |
| while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { |
| --end; |
| } |
| *end=0; |
| |
| /* ignore empty and comment lines */ |
| if(line[0]==0 || line[0]=='#') { |
| continue; |
| } |
| |
| /* stop at the end of the mapping table */ |
| if(0==uprv_strcmp(line, "END CHARMAP")) { |
| break; |
| } |
| |
| isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); |
| } |
| |
| if(!isOK) { |
| *pErrorCode=U_INVALID_TABLE_FORMAT; |
| } |
| } |
| #endif |