| /* |
| ****************************************************************************** |
| * |
| * Copyright (C) 2002-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ****************************************************************************** |
| * file name: bocu1tst.c |
| * encoding: US-ASCII |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2002may27 |
| * created by: Markus W. Scherer |
| * |
| * This is the reference implementation of BOCU-1, |
| * the MIME-friendly form of the Binary Ordered Compression for Unicode, |
| * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/ |
| * The files bocu1.h and bocu1.c from the design folder are taken |
| * verbatim (minus copyright and #include) and copied together into this file. |
| * The reference code and some of the reference bocu1tst.c |
| * are modified to run as part of the ICU cintltst |
| * test framework (minus main(), log_ln() etc. instead of printf()). |
| * |
| * This reference implementation is used here to verify |
| * the ICU BOCU-1 implementation, which is |
| * adapted for ICU conversion APIs and optimized. |
| * ### links in design doc to here and to ucnvbocu.c |
| */ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ustring.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/utf16.h" |
| #include "cmemory.h" |
| #include "cintltst.h" |
| |
| /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */ |
| |
| /* BOCU-1 constants and macros ---------------------------------------------- */ |
| |
| /* |
| * BOCU-1 encodes the code points of a Unicode string as |
| * a sequence of byte-encoded differences (slope detection), |
| * preserving lexical order. |
| * |
| * Optimize the difference-taking for runs of Unicode text within |
| * small scripts: |
| * |
| * Most small scripts are allocated within aligned 128-blocks of Unicode |
| * code points. Lexical order is preserved if the "previous code point" state |
| * is always moved into the middle of such a block. |
| * |
| * Additionally, "prev" is moved from anywhere in the Unihan and Hangul |
| * areas into the middle of those areas. |
| * |
| * C0 control codes and space are encoded with their US-ASCII bytes. |
| * "prev" is reset for C0 controls but not for space. |
| */ |
| |
/* "prev" starts out in the middle of the ASCII range */
#define BOCU1_ASCII_PREV            0x40

/* byte values that bound the difference encoding */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe

/* the L suffix keeps computations with BOCU1_MAX_TRAIL working on 16-bit compilers */
#define BOCU1_MAX_TRAIL             0xffL
#define BOCU1_RESET                 0xff

/* how many lead byte values exist */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* some C0 control byte values double as trail bytes; account for them */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* how many trail byte values exist */
#define BOCU1_TRAIL_COUNT           ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * How many single-byte codes exist on each side of BOCU1_MIDDLE
 * (the zero difference, ==BOCU1_MIDDLE, is counted as a positive one).
 */
#define BOCU1_SINGLE                64

/* how many lead bytes are reserved, per sign, for 2/3/4-byte sequences */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* largest/smallest difference that fits into one byte */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* largest/smallest difference that fits into two bytes */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* largest/smallest difference that fits into three bytes */
#define BOCU1_REACH_POS_3           (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3 \
    (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* first lead byte of each positive multi-byte range; BOCU1_START_POS_4==BOCU1_MAX_LEAD */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)

/* lower lead byte boundary of each negative range; BOCU1_START_NEG_4==BOCU1_MIN+1 */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)

/*
 * Number of bytes (1..4) in the sequence that starts with the given lead byte.
 * Not meaningful for BOCU1_RESET. Note that (lead) is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Number of bytes in a packed sequence: values below 0x04000000 carry the
 * length in their top byte; anything else has a lead byte there and is 4 long.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
| |
| /* |
| * 12 commonly used C0 control codes (and space) are only used to encode |
| * themselves directly, |
| * which makes BOCU-1 MIME-usable and reasonably safe for |
| * ASCII-oriented software. |
| * |
| * These controls are |
| * 0 NUL |
| * |
| * 7 BEL |
| * 8 BS |
| * |
| * 9 TAB |
| * a LF |
| * b VT |
| * c FF |
| * d CR |
| * |
| * e SO |
| * f SI |
| * |
| * 1a SUB |
| * 1b ESC |
| * |
| * The other 20 C0 controls are also encoded directly (to preserve order) |
| * but are also used as trail bytes in difference encoding |
| * (for better compression). |
| */ |
| #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) |
| |
| /* |
| * Byte value map for control codes, |
| * from external byte values 0x00..0x20 |
| * to trail byte values 0..19 (0..0x13) as used in the difference calculation. |
| * External byte values that are illegal as trail bytes are mapped to -1. |
| */ |
| static const int8_t |
| bocu1ByteToTrail[BOCU1_MIN]={ |
| /* 0 1 2 3 4 5 6 7 */ |
| -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, |
| |
| /* 8 9 a b c d e f */ |
| -1, -1, -1, -1, -1, -1, -1, -1, |
| |
| /* 10 11 12 13 14 15 16 17 */ |
| 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, |
| |
| /* 18 19 1a 1b 1c 1d 1e 1f */ |
| 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, |
| |
| /* 20 */ |
| -1 |
| }; |
| |
| /* |
| * Byte value map for control codes, |
| * from trail byte values 0..19 (0..0x13) as used in the difference calculation |
| * to external byte values 0x00..0x20. |
| */ |
| static const int8_t |
| bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ |
| /* 0 1 2 3 4 5 6 7 */ |
| 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, |
| |
| /* 8 9 a b c d e f */ |
| 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, |
| |
| /* 10 11 12 13 */ |
| 0x1c, 0x1d, 0x1e, 0x1f |
| }; |
| |
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by a
 * semicolon, is exactly one statement and is safe in an unbraced if/else.
 * The arguments are evaluated more than once; pass plain variables.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
| |
/*
 * Decoder state that decodeBocu1() carries from one input byte to the next.
 * A structure with all fields 0 is accepted as the initial state.
 */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state that differences are added to */
    int32_t count;  /* trail bytes still expected; 0 when the next byte is a lead byte */
    int32_t diff;   /* partial difference collected from the bytes seen so far */
} Bocu1Rx;
| |
| /* Function prototypes ------------------------------------------------------ */ |
| |
| /* see bocu1.c */ |
| U_CFUNC int32_t |
| packDiff(int32_t diff); |
| |
| U_CFUNC int32_t |
| encodeBocu1(int32_t *pPrev, int32_t c); |
| |
| U_CFUNC int32_t |
| decodeBocu1(Bocu1Rx *pRx, uint8_t b); |
| |
| /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */ |
| |
| /* BOCU-1 implementation functions ------------------------------------------ */ |
| |
| /** |
| * Compute the next "previous" value for differencing |
| * from the current code point. |
| * |
| * @param c current code point, 0..0x10ffff |
| * @return "previous code point" state value |
| */ |
| static int32_t |
| bocu1Prev(int32_t c) { |
| /* compute new prev */ |
| if(0x3040<=c && c<=0x309f) { |
| /* Hiragana is not 128-aligned */ |
| return 0x3070; |
| } else if(0x4e00<=c && c<=0x9fa5) { |
| /* CJK Unihan */ |
| return 0x4e00-BOCU1_REACH_NEG_2; |
| } else if(0xac00<=c && c<=0xd7a3) { |
| /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */ |
| return ((int32_t)0xd7a3+(int32_t)0xac00)/2; |
| } else { |
| /* mostly small scripts */ |
| return (c&~0x7f)+BOCU1_ASCII_PREV; |
| } |
| } |
| |
| /** |
| * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes |
| * and return a packed integer with them. |
| * |
| * The encoding favors small absolut differences with short encodings |
| * to compress runs of same-script characters. |
| * |
| * @param diff difference value -0x10ffff..0x10ffff |
| * @return |
| * 0x010000zz for 1-byte sequence zz |
| * 0x0200yyzz for 2-byte sequence yy zz |
| * 0x03xxyyzz for 3-byte sequence xx yy zz |
| * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) |
| */ |
| U_CFUNC int32_t |
| packDiff(int32_t diff) { |
| int32_t result, m, lead, count, shift; |
| |
| if(diff>=BOCU1_REACH_NEG_1) { |
| /* mostly positive differences, and single-byte negative ones */ |
| if(diff<=BOCU1_REACH_POS_1) { |
| /* single byte */ |
| return 0x01000000|(BOCU1_MIDDLE+diff); |
| } else if(diff<=BOCU1_REACH_POS_2) { |
| /* two bytes */ |
| diff-=BOCU1_REACH_POS_1+1; |
| lead=BOCU1_START_POS_2; |
| count=1; |
| } else if(diff<=BOCU1_REACH_POS_3) { |
| /* three bytes */ |
| diff-=BOCU1_REACH_POS_2+1; |
| lead=BOCU1_START_POS_3; |
| count=2; |
| } else { |
| /* four bytes */ |
| diff-=BOCU1_REACH_POS_3+1; |
| lead=BOCU1_START_POS_4; |
| count=3; |
| } |
| } else { |
| /* two- and four-byte negative differences */ |
| if(diff>=BOCU1_REACH_NEG_2) { |
| /* two bytes */ |
| diff-=BOCU1_REACH_NEG_1; |
| lead=BOCU1_START_NEG_2; |
| count=1; |
| } else if(diff>=BOCU1_REACH_NEG_3) { |
| /* three bytes */ |
| diff-=BOCU1_REACH_NEG_2; |
| lead=BOCU1_START_NEG_3; |
| count=2; |
| } else { |
| /* four bytes */ |
| diff-=BOCU1_REACH_NEG_3; |
| lead=BOCU1_START_NEG_4; |
| count=3; |
| } |
| } |
| |
| /* encode the length of the packed result */ |
| if(count<3) { |
| result=(count+1)<<24; |
| } else /* count==3, MSB used for the lead byte */ { |
| result=0; |
| } |
| |
| /* calculate trail bytes like digits in itoa() */ |
| shift=0; |
| do { |
| NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
| result|=BOCU1_TRAIL_TO_BYTE(m)<<shift; |
| shift+=8; |
| } while(--count>0); |
| |
| /* add lead byte */ |
| result|=(lead+diff)<<shift; |
| |
| return result; |
| } |
| |
| /** |
| * BOCU-1 encoder function. |
| * |
| * @param pPrev pointer to the integer that holds |
| * the "previous code point" state; |
| * the initial value should be 0 which |
| * encodeBocu1 will set to the actual BOCU-1 initial state value |
| * @param c the code point to encode |
| * @return the packed 1/2/3/4-byte encoding, see packDiff(), |
| * or 0 if an error occurs |
| * |
| * @see packDiff |
| */ |
| U_CFUNC int32_t |
| encodeBocu1(int32_t *pPrev, int32_t c) { |
| int32_t prev; |
| |
| if(pPrev==NULL || c<0 || c>0x10ffff) { |
| /* illegal argument */ |
| return 0; |
| } |
| |
| prev=*pPrev; |
| if(prev==0) { |
| /* lenient handling of initial value 0 */ |
| prev=*pPrev=BOCU1_ASCII_PREV; |
| } |
| |
| if(c<=0x20) { |
| /* |
| * ISO C0 control & space: |
| * Encode directly for MIME compatibility, |
| * and reset state except for space, to not disrupt compression. |
| */ |
| if(c!=0x20) { |
| *pPrev=BOCU1_ASCII_PREV; |
| } |
| return 0x01000000|c; |
| } |
| |
| /* |
| * all other Unicode code points c==U+0021..U+10ffff |
| * are encoded with the difference c-prev |
| * |
| * a new prev is computed from c, |
| * placed in the middle of a 0x80-block (for most small scripts) or |
| * in the middle of the Unihan and Hangul blocks |
| * to statistically minimize the following difference |
| */ |
| *pPrev=bocu1Prev(c); |
| return packDiff(c-prev); |
| } |
| |
| /** |
| * Function for BOCU-1 decoder; handles multi-byte lead bytes. |
| * |
| * @param pRx pointer to the decoder state structure |
| * @param b lead byte; |
| * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD |
| * @return -1 (state change only) |
| * |
| * @see decodeBocu1 |
| */ |
| static int32_t |
| decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) { |
| int32_t c, count; |
| |
| if(b>=BOCU1_START_NEG_2) { |
| /* positive difference */ |
| if(b<BOCU1_START_POS_3) { |
| /* two bytes */ |
| c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; |
| count=1; |
| } else if(b<BOCU1_START_POS_4) { |
| /* three bytes */ |
| c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; |
| count=2; |
| } else { |
| /* four bytes */ |
| c=BOCU1_REACH_POS_3+1; |
| count=3; |
| } |
| } else { |
| /* negative difference */ |
| if(b>=BOCU1_START_NEG_3) { |
| /* two bytes */ |
| c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; |
| count=1; |
| } else if(b>BOCU1_MIN) { |
| /* three bytes */ |
| c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; |
| count=2; |
| } else { |
| /* four bytes */ |
| c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; |
| count=3; |
| } |
| } |
| |
| /* set the state for decoding the trail byte(s) */ |
| pRx->diff=c; |
| pRx->count=count; |
| return -1; |
| } |
| |
| /** |
| * Function for BOCU-1 decoder; handles multi-byte trail bytes. |
| * |
| * @param pRx pointer to the decoder state structure |
| * @param b trail byte |
| * @return result value, same as decodeBocu1 |
| * |
| * @see decodeBocu1 |
| */ |
| static int32_t |
| decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) { |
| int32_t t, c, count; |
| |
| if(b<=0x20) { |
| /* skip some C0 controls and make the trail byte range contiguous */ |
| t=bocu1ByteToTrail[b]; |
| if(t<0) { |
| /* illegal trail byte value */ |
| pRx->prev=BOCU1_ASCII_PREV; |
| pRx->count=0; |
| return -99; |
| } |
| #if BOCU1_MAX_TRAIL<0xff |
| } else if(b>BOCU1_MAX_TRAIL) { |
| return -99; |
| #endif |
| } else { |
| t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET; |
| } |
| |
| /* add trail byte into difference and decrement count */ |
| c=pRx->diff; |
| count=pRx->count; |
| |
| if(count==1) { |
| /* final trail byte, deliver a code point */ |
| c=pRx->prev+c+t; |
| if(0<=c && c<=0x10ffff) { |
| /* valid code point result */ |
| pRx->prev=bocu1Prev(c); |
| pRx->count=0; |
| return c; |
| } else { |
| /* illegal code point result */ |
| pRx->prev=BOCU1_ASCII_PREV; |
| pRx->count=0; |
| return -99; |
| } |
| } |
| |
| /* intermediate trail byte */ |
| if(count==2) { |
| pRx->diff=c+t*BOCU1_TRAIL_COUNT; |
| } else /* count==3 */ { |
| pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT; |
| } |
| pRx->count=count-1; |
| return -1; |
| } |
| |
| /** |
| * BOCU-1 decoder function. |
| * |
| * @param pRx pointer to the decoder state structure; |
| * the initial values should be 0 which |
| * decodeBocu1 will set to actual initial state values |
| * @param b an input byte |
| * @return |
| * 0..0x10ffff for a result code point |
| * -1 if only the state changed without code point output |
| * <-1 if an error occurs |
| */ |
| U_CFUNC int32_t |
| decodeBocu1(Bocu1Rx *pRx, uint8_t b) { |
| int32_t prev, c, count; |
| |
| if(pRx==NULL) { |
| /* illegal argument */ |
| return -99; |
| } |
| |
| prev=pRx->prev; |
| if(prev==0) { |
| /* lenient handling of initial 0 values */ |
| prev=pRx->prev=BOCU1_ASCII_PREV; |
| count=pRx->count=0; |
| } else { |
| count=pRx->count; |
| } |
| |
| if(count==0) { |
| /* byte in lead position */ |
| if(b<=0x20) { |
| /* |
| * Direct-encoded C0 control code or space. |
| * Reset prev for C0 control codes but not for space. |
| */ |
| if(b!=0x20) { |
| pRx->prev=BOCU1_ASCII_PREV; |
| } |
| return b; |
| } |
| |
| /* |
| * b is a difference lead byte. |
| * |
| * Return a code point directly from a single-byte difference. |
| * |
| * For multi-byte difference lead bytes, set the decoder state |
| * with the partial difference value from the lead byte and |
| * with the number of trail bytes. |
| * |
| * For four-byte differences, the signedness also affects the |
| * first trail byte, which has special handling farther below. |
| */ |
| if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) { |
| /* single-byte difference */ |
| c=prev+((int32_t)b-BOCU1_MIDDLE); |
| pRx->prev=bocu1Prev(c); |
| return c; |
| } else if(b==BOCU1_RESET) { |
| /* only reset the state, no code point */ |
| pRx->prev=BOCU1_ASCII_PREV; |
| return -1; |
| } else { |
| return decodeBocu1LeadByte(pRx, b); |
| } |
| } else { |
| /* trail byte in any position */ |
| return decodeBocu1TrailByte(pRx, b); |
| } |
| } |
| |
| /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */ |
| |
| /* test code ---------------------------------------------------------------- */ |
| |
| /* test code options */ |
| |
/*
 * Test option from the reference bocu1tst.c: ignore commas when
 * processing name lists in testText().
 */
#define TEST_IGNORE_COMMA 1
| |
/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * most significant byte first. There is no overflow check.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array; needs room for up to 4 bytes
 * @return number of bytes written
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t length=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* emit the low "length" bytes of packed, highest one first */
    for(shift=8*(length-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return length;
}
| |
| /** |
| * Unpack a packed BOCU-1 non-C0/space byte sequence and get |
| * the difference to initialPrev. |
| * Used only for round-trip testing of the difference encoding and decoding. |
| * Test function. |
| * |
| * @param initialPrev bogus "previous code point" value to make sure that |
| * the resulting code point is in the range 0..0x10ffff |
| * @param packed packed BOCU-1 byte sequence |
| * @return the difference to initialPrev |
| * |
| * @see packDiff |
| * @see writeDiff |
| */ |
| static int32_t |
| unpackDiff(int32_t initialPrev, int32_t packed) { |
| Bocu1Rx rx={ 0, 0, 0 }; |
| int32_t count; |
| |
| rx.prev=initialPrev; |
| count=BOCU1_LENGTH_FROM_PACKED(packed); |
| switch(count) { |
| case 4: |
| decodeBocu1(&rx, (uint8_t)(packed>>24)); |
| case 3: |
| decodeBocu1(&rx, (uint8_t)(packed>>16)); |
| case 2: |
| decodeBocu1(&rx, (uint8_t)(packed>>8)); |
| case 1: |
| /* subtract initial prev */ |
| return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; |
| default: |
| return -0x7fffffff; |
| } |
| } |
| |
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks that the difference encoding round-trips and reports
 * a failure via log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array; needs room for up to 4 bytes
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    int32_t packed, initialPrev, roundtrip;

    /* generate the difference as a packed value */
    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    initialPrev= diff<=0 ? 0x10ffff : -1;

    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* the l-modified conversions need long arguments, which int32_t need not be */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }

    /* serialize the packed value */
    return p+writePacked(packed, p);
}
| |
| /** |
| * Encode a UTF-16 string in BOCU-1. |
| * Does not check for overflows, but otherwise useful function. |
| * |
| * @param s input UTF-16 string |
| * @param length number of UChar code units in s |
| * @param p pointer to output byte array |
| * @return number of bytes output |
| */ |
| static int32_t |
| writeString(const UChar *s, int32_t length, uint8_t *p) { |
| uint8_t *p0; |
| int32_t c, prev, i; |
| |
| prev=0; |
| p0=p; |
| i=0; |
| while(i<length) { |
| U16_NEXT(s, i, length, c); |
| p+=writePacked(encodeBocu1(&prev, c), p); |
| } |
| return (int32_t)(p-p0); |
| } |
| |
| /** |
| * Decode a BOCU-1 byte sequence to a UTF-16 string. |
| * Does not check for overflows, but otherwise useful function. |
| * |
| * @param p pointer to input BOCU-1 bytes |
| * @param length number of input bytes |
| * @param s point to output UTF-16 string array |
| * @return number of UChar code units output |
| */ |
| static int32_t |
| readString(const uint8_t *p, int32_t length, UChar *s) { |
| Bocu1Rx rx={ 0, 0, 0 }; |
| int32_t c, i, sLength; |
| |
| i=sLength=0; |
| while(i<length) { |
| c=decodeBocu1(&rx, p[i++]); |
| if(c<-1) { |
| log_err("error: readString detects encoding error at string index %ld\n", i); |
| return -1; |
| } |
| if(c>=0) { |
| U16_APPEND_UNSAFE(s, sLength, c); |
| } |
| } |
| return sLength; |
| } |
| |
/* Map a value 0..15 to its lowercase hexadecimal digit character. */
static char
hexDigit(uint8_t digit) {
    if(digit<10) {
        return (char)('0'+digit);
    }
    return (char)('a'+(digit-10));
}
| |
/**
 * Pretty-print 0-terminated byte values as " xx" groups, padded with
 * spaces to the width of 5 bytes so that columns of output line up.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print; only read, never modified
 * @param out   output buffer for the 0-terminated text; there is no overflow
 *              check, so it needs room for 3 characters per input byte
 *              (at least 15) plus the terminator
 */
static void
printBytes(const uint8_t *bytes, char *out) {
    int count=0;
    uint8_t b;

    /* one space and two hex digits per byte */
    while((b=*bytes++)!=0) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++count;
    }

    /* fill the columns of the bytes that are missing to 5 */
    for(count=3*(5-count); count>0; --count) {
        *out++=' ';
    }
    *out=0;
}
| |
| /** |
| * Basic BOCU-1 test function, called when there are no command line arguments. |
| * Prints some of the #define values and performs round-trip tests of the |
| * difference encoding and decoding. |
| */ |
| static void |
| TestBOCU1RefDiff(void) { |
| char buf1[80], buf2[80]; |
| uint8_t prev[5], level[5]; |
| int32_t i, cmp, countErrors; |
| |
| log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1); |
| log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2); |
| log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3); |
| |
| log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1); |
| log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2); |
| log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3); |
| |
| log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE); |
| log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2); |
| log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3); |
| |
| /* test packDiff() & unpackDiff() with some specific values */ |
| writeDiff(0, level); |
| writeDiff(1, level); |
| writeDiff(65, level); |
| writeDiff(130, level); |
| writeDiff(30000, level); |
| writeDiff(1000000, level); |
| writeDiff(-65, level); |
| writeDiff(-130, level); |
| writeDiff(-30000, level); |
| writeDiff(-1000000, level); |
| |
| /* test that each value is smaller than any following one */ |
| countErrors=0; |
| i=-0x10ffff; |
| *writeDiff(i, prev)=0; |
| |
| /* show first number and bytes */ |
| printBytes(prev, buf1); |
| log_verbose(" wD(%8ld) %s\n", i, buf1); |
| |
| for(++i; i<=0x10ffff; ++i) { |
| *writeDiff(i, level)=0; |
| cmp=strcmp((const char *)prev, (const char *)level); |
| if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) { |
| log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n", |
| level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i); |
| } |
| if(cmp<0) { |
| if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) { |
| /* |
| * if the result is good, then print only if the length changed |
| * to get little but interesting output |
| */ |
| printBytes(prev, buf1); |
| printBytes(level, buf2); |
| log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); |
| } |
| } else { |
| ++countErrors; |
| printBytes(prev, buf1); |
| printBytes(level, buf2); |
| log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); |
| } |
| /* remember the previous bytes */ |
| memcpy(prev, level, 4); |
| } |
| |
| /* show last number and bytes */ |
| printBytes((uint8_t *)"", buf1); |
| printBytes(prev, buf2); |
| log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2); |
| |
| if(countErrors==0) { |
| log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n"); |
| } else { |
| log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors); |
| } |
| |
| /* output signature byte sequence */ |
| i=0; |
| writePacked(encodeBocu1(&i, 0xfeff), level); |
| log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n", |
| level[0], level[1], level[2]); |
| } |
| |
| /* cintltst code ------------------------------------------------------------ */ |
| |
| static const int32_t DEFAULT_BUFFER_SIZE = 30000; |
| |
| |
| /* test one string with the ICU and the reference BOCU-1 implementations */ |
| static void |
| roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) { |
| UChar *roundtripRef, *roundtripICU; |
| char *bocu1Ref, *bocu1ICU; |
| |
| int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength; |
| UErrorCode errorCode; |
| |
| roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
| roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
| bocu1Ref = malloc(DEFAULT_BUFFER_SIZE); |
| bocu1ICU = malloc(DEFAULT_BUFFER_SIZE); |
| |
| /* Unicode -> BOCU-1 */ |
| bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref); |
| |
| errorCode=U_ZERO_ERROR; |
| bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode); |
| if(U_FAILURE(errorCode)) { |
| log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); |
| goto cleanup; |
| } |
| |
| if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) { |
| log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength); |
| goto cleanup; |
| } |
| |
| /* BOCU-1 -> Unicode */ |
| roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef); |
| if(roundtripRefLength<0) { |
| goto cleanup; /* readString() found an error and reported it */ |
| } |
| |
| roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode); |
| if(U_FAILURE(errorCode)) { |
| log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); |
| goto cleanup; |
| } |
| |
| if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) { |
| log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength); |
| goto cleanup; |
| } |
| if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) { |
| log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength); |
| goto cleanup; |
| } |
| cleanup: |
| free(roundtripRef); |
| free(roundtripICU); |
| free(bocu1Ref); |
| free(bocu1ICU); |
| } |
| |
| static const UChar feff[]={ 0xfeff }; |
| static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 }; |
| static const UChar crlf[]={ 0xd, 0xa, 0x20 }; |
| static const UChar nul[]={ 0 }; |
| static const UChar latin[]={ 0xdf, 0xe6 }; |
| static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 }; |
| static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 }; |
| static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 }; |
| static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 }; |
| static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */ |
| static const UChar plane1[]={ 0xd800, 0xdc00 }; |
| static const UChar plane2[]={ 0xd845, 0xdddd }; |
| static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 }; |
| static const UChar plane16[]={ 0xdbff, 0xdfff }; |
| static const UChar c0[]={ 1, 0xe40, 0x20, 9 }; |
| |
| static const struct { |
| const UChar *s; |
| int32_t length; |
| } strings[]={ |
| { feff, UPRV_LENGTHOF(feff) }, |
| { ascii, UPRV_LENGTHOF(ascii) }, |
| { crlf, UPRV_LENGTHOF(crlf) }, |
| { nul, UPRV_LENGTHOF(nul) }, |
| { latin, UPRV_LENGTHOF(latin) }, |
| { devanagari, UPRV_LENGTHOF(devanagari) }, |
| { hiragana, UPRV_LENGTHOF(hiragana) }, |
| { unihan, UPRV_LENGTHOF(unihan) }, |
| { hangul, UPRV_LENGTHOF(hangul) }, |
| { surrogates, UPRV_LENGTHOF(surrogates) }, |
| { plane1, UPRV_LENGTHOF(plane1) }, |
| { plane2, UPRV_LENGTHOF(plane2) }, |
| { plane15, UPRV_LENGTHOF(plane15) }, |
| { plane16, UPRV_LENGTHOF(plane16) }, |
| { c0, UPRV_LENGTHOF(c0) } |
| }; |
| |
| /* |
| * Verify that the ICU BOCU-1 implementation produces the same results as |
| * the reference implementation from the design folder. |
| * Generate some texts and convert them with both converters, verifying |
| * identical results and roundtripping. |
| */ |
| static void |
| TestBOCU1(void) { |
| UChar *text; |
| int32_t i, length; |
| |
| UConverter *bocu1; |
| UErrorCode errorCode; |
| |
| errorCode=U_ZERO_ERROR; |
| bocu1=ucnv_open("BOCU-1", &errorCode); |
| if(U_FAILURE(errorCode)) { |
| log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode)); |
| return; |
| } |
| |
| text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
| |
| /* text 1: each of strings[] once */ |
| length=0; |
| for(i=0; i<UPRV_LENGTHOF(strings); ++i) { |
| u_memcpy(text+length, strings[i].s, strings[i].length); |
| length+=strings[i].length; |
| } |
| roundtripBOCU1(bocu1, 1, text, length); |
| |
| /* text 2: each of strings[] twice */ |
| length=0; |
| for(i=0; i<UPRV_LENGTHOF(strings); ++i) { |
| u_memcpy(text+length, strings[i].s, strings[i].length); |
| length+=strings[i].length; |
| u_memcpy(text+length, strings[i].s, strings[i].length); |
| length+=strings[i].length; |
| } |
| roundtripBOCU1(bocu1, 2, text, length); |
| |
| /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */ |
| length=0; |
| for(i=1; length<5000; i+=7) { |
| if(i>=UPRV_LENGTHOF(strings)) { |
| i-=UPRV_LENGTHOF(strings); |
| } |
| u_memcpy(text+length, strings[i].s, strings[i].length); |
| length+=strings[i].length; |
| } |
| roundtripBOCU1(bocu1, 3, text, length); |
| |
| ucnv_close(bocu1); |
| free(text); |
| } |
| |
| U_CFUNC void addBOCU1Tests(TestNode** root); |
| |
| U_CFUNC void |
| addBOCU1Tests(TestNode** root) { |
| addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff"); |
| addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1"); |
| } |