|  | /************************************************************************* | 
|  | * | 
|  | *   © 2016 and later: Unicode, Inc. and others. | 
|  | *   License & terms of use: http://www.unicode.org/copyright.html | 
|  | * | 
|  | ************************************************************************** | 
|  | ************************************************************************** | 
|  | * | 
|  | *   Copyright (C) 2000-2016, International Business Machines | 
|  | *   Corporation and others.  All Rights Reserved. | 
|  | * | 
|  | *************************************************************************** | 
|  | *   file name:  convsamp.c | 
|  | *   encoding:   ASCII (7-bit) | 
|  | * | 
|  | *   created on: 2000may30 | 
|  | *   created by: Steven R. Loomis | 
|  | * | 
|  | *   Sample code for the ICU conversion routines. | 
|  | * | 
|  | * Note: Nothing special is needed to build this sample. Link with | 
|  | *       the icu UC and icu I18N libraries. | 
|  | * | 
|  | *       I use 'assert' for error checking, you probably will want | 
|  | *       something more flexible.  '***BEGIN SAMPLE***' and | 
|  | *       '***END SAMPLE***' mark pieces suitable for stand alone | 
|  | *       code snippets. | 
|  | * | 
|  | * | 
|  | *  Each test can define it's own BUFFERSIZE | 
|  | * | 
|  | */ | 
|  |  | 
|  | #define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */ | 
|  |  | 
|  | #include <stdio.h> | 
|  | #include <ctype.h>            /* for isspace, etc.    */ | 
|  | #include <assert.h> | 
|  | #include <string.h> | 
|  | #include <stdlib.h>  /* malloc */ | 
|  |  | 
|  | #include "unicode/utypes.h"   /* Basic ICU data types */ | 
|  | #include "unicode/ucnv.h"     /* C   Converter API    */ | 
|  | #include "unicode/ustring.h"  /* some more string fcns*/ | 
|  | #include "unicode/uchar.h"    /* char names           */ | 
|  | #include "unicode/uloc.h" | 
|  | #include "unicode/unistr.h" | 
|  |  | 
|  | #include "flagcb.h" | 
|  |  | 
|  | /* Some utility functions */ | 
|  | #ifndef UPRV_LENGTHOF | 
|  | #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | 
|  | #endif | 
|  |  | 
|  | static const UChar kNone[] = { 0x0000 }; | 
|  |  | 
|  | #define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} | 
|  |  | 
|  | /* Print a UChar if possible, in seven characters. */ | 
|  | void prettyPrintUChar(UChar c) | 
|  | { | 
|  | if(  (c <= 0x007F) && | 
|  | (isgraph(c))  ) { | 
|  | printf(" '%c'   ", (char)(0x00FF&c)); | 
|  | } else if ( c > 0x007F ) { | 
|  | char buf[1000]; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | int32_t o; | 
|  |  | 
|  | o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); | 
|  | if(U_SUCCESS(status) && (o>0) ) { | 
|  | buf[6] = 0; | 
|  | printf("%7s", buf); | 
|  | } else { | 
|  | printf(" ??????"); | 
|  | } | 
|  | } else { | 
|  | switch((char)(c & 0x007F)) { | 
|  | case ' ': | 
|  | printf(" ' '   "); | 
|  | break; | 
|  | case '\t': | 
|  | printf(" \\t    "); | 
|  | break; | 
|  | case '\n': | 
|  | printf(" \\n    "); | 
|  | break; | 
|  | default: | 
|  | printf("  _    "); | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  | void printUChars(const char  *name = "?", | 
|  | const UChar *uch  = kNone, | 
|  | int32_t     len   = -1 ) | 
|  | { | 
|  | int32_t i; | 
|  |  | 
|  | if( (len == -1) && (uch) ) { | 
|  | len = u_strlen(uch); | 
|  | } | 
|  |  | 
|  | printf("%5s: ", name); | 
|  | for( i = 0; i <len; i++) { | 
|  | printf("%-6d ", i); | 
|  | } | 
|  | printf("\n"); | 
|  |  | 
|  | printf("%5s: ", "uni"); | 
|  | for( i = 0; i <len; i++) { | 
|  | printf("\\u%04X ", (int)uch[i]); | 
|  | } | 
|  | printf("\n"); | 
|  |  | 
|  | printf("%5s:", "ch"); | 
|  | for( i = 0; i <len; i++) { | 
|  | prettyPrintUChar(uch[i]); | 
|  | } | 
|  | printf("\n"); | 
|  | } | 
|  |  | 
|  | void printBytes(const char  *name = "?", | 
|  | const char *uch  = "", | 
|  | int32_t     len   = -1 ) | 
|  | { | 
|  | int32_t i; | 
|  |  | 
|  | if( (len == -1) && (uch) ) { | 
|  | len = static_cast<int32_t>(strlen(uch)); | 
|  | } | 
|  |  | 
|  | printf("%5s: ", name); | 
|  | for( i = 0; i <len; i++) { | 
|  | printf("%-4d ", i); | 
|  | } | 
|  | printf("\n"); | 
|  |  | 
|  | printf("%5s: ", "uni"); | 
|  | for( i = 0; i <len; i++) { | 
|  | printf("\\x%02X ", 0x00FF & (int)uch[i]); | 
|  | } | 
|  | printf("\n"); | 
|  |  | 
|  | printf("%5s:", "ch"); | 
|  | for( i = 0; i <len; i++) { | 
|  | if(isgraph(0x00FF & (int)uch[i])) { | 
|  | printf(" '%c' ", (char)uch[i]); | 
|  | } else { | 
|  | printf("     "); | 
|  | } | 
|  | } | 
|  | printf("\n"); | 
|  | } | 
|  |  | 
|  | void printUChar(UChar32 ch32) | 
|  | { | 
|  | if(ch32 > 0xFFFF) { | 
|  | printf("ch: U+%06X\n", ch32); | 
|  | } | 
|  | else { | 
|  | UChar ch = (UChar)ch32; | 
|  | printUChars("C", &ch, 1); | 
|  | } | 
|  | } | 
|  |  | 
|  | /******************************************************************* | 
|  | Very simple C sample to convert the word 'Moscow' in Russian in Unicode, | 
|  | followed by an exclamation mark (!) into the KOI8-R Russian code page. | 
|  |  | 
|  | This example first creates a UChar String out of the Unicode chars. | 
|  |  | 
|  | targetSize must be set to the amount of space available in the target | 
|  | buffer. After fromUChars is called, | 
|  | len will contain the number of bytes in target[] which were | 
|  | used in the resulting codepage.  In this case, there is a 1:1 mapping | 
|  | between the input and output characters. The exclamation mark has the | 
|  | same value in both KOI8-R and Unicode. | 
|  |  | 
|  | src: 0      1      2      3      4      5      6 | 
|  | uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 | 
|  | ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!' | 
|  |  | 
|  | targ:  0    1    2    3    4    5    6 | 
|  | uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 | 
|  | ch:                                '!' | 
|  |  | 
|  |  | 
|  | Converting FROM unicode | 
|  | to koi8-r. | 
|  | You must call ucnv_close to clean up the memory used by the | 
|  | converter. | 
|  |  | 
|  | 'len' returns the number of OUTPUT bytes resulting from the | 
|  | conversion. | 
|  | */ | 
|  |  | 
|  | UErrorCode convsample_02() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 02: C: simple Unicode -> koi8-r conversion\n"); | 
|  |  | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | // "cat<cat>OK" | 
|  | UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, | 
|  | 0x0430, 0x0021, 0x0000 }; | 
|  | char target[100]; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | UConverter *conv; | 
|  | int32_t     len; | 
|  |  | 
|  | // set up the converter | 
|  | //! [ucnv_open] | 
|  | conv = ucnv_open("koi8-r", &status); | 
|  | //! [ucnv_open] | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | // convert to koi8-r | 
|  | len = ucnv_fromUChars(conv, target, 100, source, -1, &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | // close the converter | 
|  | ucnv_close(conv); | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  |  | 
|  | // Print it out | 
|  | printUChars("src", source); | 
|  | printf("\n"); | 
|  | printBytes("targ", target, len); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  |  | 
|  | UErrorCode convsample_03() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 03: C: print out all converters\n"); | 
|  |  | 
|  | int32_t count; | 
|  | int32_t i; | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | count = ucnv_countAvailable(); | 
|  | printf("Available converters: %d\n", count); | 
|  |  | 
|  | for(i=0;i<count;i++) | 
|  | { | 
|  | printf("%s ", ucnv_getAvailableName(i)); | 
|  | } | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  |  | 
|  | printf("\n"); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | #define BUFFERSIZE 17 /* make it interesting :) */ | 
|  |  | 
|  | /* | 
|  | Converting from a codepage to Unicode in bulk.. | 
|  | What is the best way to determine the buffer size? | 
|  |  | 
|  | The 'buffersize' is in bytes of input. | 
|  | For a given converter, divinding this by the minimum char size | 
|  | give you the maximum number of Unicode characters that could be | 
|  | expected for a given number of input bytes. | 
|  | see: ucnv_getMinCharSize() | 
|  |  | 
|  | For example, a single byte codepage like 'Latin-3' has a | 
|  | minimum char size of 1. (It takes at least 1 byte to represent | 
|  | each Unicode char.) So the unicode buffer has the same number of | 
|  | UChars as the input buffer has bytes. | 
|  |  | 
|  | In a strictly double byte codepage such as cp1362 (Windows | 
|  | Korean), the minimum char size is 2. So, only half as many Unicode | 
|  | chars as bytes are needed. | 
|  |  | 
|  | This work to calculate the buffer size is an optimization. Any | 
|  | size of input and output buffer can be used, as long as the | 
|  | program handles the following cases: If the input buffer is empty, | 
|  | the source pointer will be equal to sourceLimit.  If the output | 
|  | buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. | 
|  | */ | 
|  |  | 
|  | UErrorCode convsample_05() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 05: C: count the number of letters in a UTF-8 document\n"); | 
|  |  | 
|  | FILE *f; | 
|  | int32_t count; | 
|  | char inBuf[BUFFERSIZE]; | 
|  | const char *source; | 
|  | const char *sourceLimit; | 
|  | UChar *uBuf; | 
|  | UChar *target; | 
|  | UChar *targetLimit; | 
|  | UChar *p; | 
|  | int32_t uBufSize = 0; | 
|  | UConverter *conv; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t letters=0, total=0; | 
|  |  | 
|  | f = fopen("data01.txt", "r"); | 
|  | if(!f) | 
|  | { | 
|  | fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | conv = ucnv_open("utf-8", &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | 
|  | printf("input bytes %d / min chars %d = %d UChars\n", | 
|  | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | 
|  | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | 
|  | assert(uBuf!=NULL); | 
|  |  | 
|  | // grab another buffer's worth | 
|  | while((!feof(f)) && | 
|  | ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) | 
|  | { | 
|  | // Convert bytes to unicode | 
|  | source = inBuf; | 
|  | sourceLimit = inBuf + count; | 
|  |  | 
|  | do | 
|  | { | 
|  | target = uBuf; | 
|  | targetLimit = uBuf + uBufSize; | 
|  |  | 
|  | ucnv_toUnicode(conv, &target, targetLimit, | 
|  | &source, sourceLimit, NULL, | 
|  | feof(f)?true:false,         /* pass 'flush' when eof */ | 
|  | /* is true (when no more data will come) */ | 
|  | &status); | 
|  |  | 
|  | if(status == U_BUFFER_OVERFLOW_ERROR) | 
|  | { | 
|  | // simply ran out of space - we'll reset the target ptr the next | 
|  | // time through the loop. | 
|  | status = U_ZERO_ERROR; | 
|  | } | 
|  | else | 
|  | { | 
|  | //  Check other errors here. | 
|  | assert(U_SUCCESS(status)); | 
|  | // Break out of the loop (by force) | 
|  | } | 
|  |  | 
|  | // Process the Unicode | 
|  | // Todo: handle UTF-16/surrogates | 
|  |  | 
|  | for(p = uBuf; p<target; p++) | 
|  | { | 
|  | if(u_isalpha(*p)) | 
|  | letters++; | 
|  | total++; | 
|  | } | 
|  | } while (source < sourceLimit); // while simply out of space | 
|  | } | 
|  |  | 
|  | printf("%d letters out of %d total UChars.\n", letters, total); | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  | ucnv_close(conv); | 
|  |  | 
|  | printf("\n"); | 
|  |  | 
|  | fclose(f); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  | #undef BUFFERSIZE | 
|  |  | 
|  | #define BUFFERSIZE 1024 | 
|  | typedef struct | 
|  | { | 
|  | UChar32  codepoint; | 
|  | uint32_t frequency; | 
|  | } CharFreqInfo; | 
|  |  | 
|  | UErrorCode convsample_06() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); | 
|  |  | 
|  | FILE *f; | 
|  | int32_t count; | 
|  | char inBuf[BUFFERSIZE]; | 
|  | const char *source; | 
|  | const char *sourceLimit; | 
|  | int32_t uBufSize = 0; | 
|  | UConverter *conv; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t letters=0, total=0; | 
|  |  | 
|  | CharFreqInfo   *info; | 
|  | UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ | 
|  | UChar32   p; | 
|  |  | 
|  | uint32_t ie = 0; | 
|  | uint32_t gh = 0; | 
|  | UChar32 l = 0; | 
|  |  | 
|  | f = fopen("data06.txt", "r"); | 
|  | if(!f) | 
|  | { | 
|  | fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); | 
|  | if(!info) | 
|  | { | 
|  | fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", static_cast<int>(sizeof(CharFreqInfo)*charCount)); | 
|  | } | 
|  |  | 
|  | /* reset frequencies */ | 
|  | for(p=0;p<charCount;p++) | 
|  | { | 
|  | info[p].codepoint = p; | 
|  | info[p].frequency = 0; | 
|  | } | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | conv = ucnv_open("utf-8", &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | 
|  | printf("input bytes %d / min chars %d = %d UChars\n", | 
|  | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | 
|  |  | 
|  | // grab another buffer's worth | 
|  | while((!feof(f)) && | 
|  | ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) | 
|  | { | 
|  | // Convert bytes to unicode | 
|  | source = inBuf; | 
|  | sourceLimit = inBuf + count; | 
|  |  | 
|  | while(source < sourceLimit) | 
|  | { | 
|  | p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); | 
|  | if(U_FAILURE(status)) | 
|  | { | 
|  | fprintf(stderr, "%s @ %d\n", u_errorName(status), total); | 
|  | status = U_ZERO_ERROR; | 
|  | continue; | 
|  | } | 
|  | U_ASSERT(status); | 
|  | total++; | 
|  |  | 
|  | if(u_isalpha(p)) | 
|  | letters++; | 
|  |  | 
|  | if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) | 
|  | ie++; | 
|  |  | 
|  | if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) | 
|  | gh++; | 
|  |  | 
|  | if(p>charCount) | 
|  | { | 
|  | fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); | 
|  | free(info); | 
|  | fclose(f); | 
|  | ucnv_close(conv); | 
|  | return U_UNSUPPORTED_ERROR; | 
|  | } | 
|  | info[p].frequency++; | 
|  | l = p; | 
|  | } | 
|  | } | 
|  |  | 
|  | fclose(f); | 
|  | ucnv_close(conv); | 
|  |  | 
|  | printf("%d letters out of %d total UChars.\n", letters, total); | 
|  | printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); | 
|  |  | 
|  | // now, we could sort it.. | 
|  |  | 
|  | //  qsort(info, charCount, sizeof(info[0]), charfreq_compare); | 
|  |  | 
|  | for(p=0;p<charCount;p++) | 
|  | { | 
|  | if(info[p].frequency) | 
|  | { | 
|  | printf("% 5d U+%06X ", info[p].frequency, p); | 
|  | if(p <= 0xFFFF) | 
|  | { | 
|  | prettyPrintUChar((UChar)p); | 
|  | } | 
|  | printf("\n"); | 
|  | } | 
|  | } | 
|  | free(info); | 
|  | // ***************************** END SAMPLE ******************** | 
|  |  | 
|  | printf("\n"); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  | #undef BUFFERSIZE | 
|  |  | 
|  |  | 
|  | /****************************************************** | 
|  | You must call ucnv_close to clean up the memory used by the | 
|  | converter. | 
|  |  | 
|  | 'len' returns the number of OUTPUT bytes resulting from the | 
|  | conversion. | 
|  | */ | 
|  |  | 
|  | UErrorCode convsample_12() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 12: C: simple sjis -> unicode conversion\n"); | 
|  |  | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  |  | 
|  | char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; | 
|  | UChar target[100]; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | UConverter *conv; | 
|  | int32_t     len; | 
|  |  | 
|  | // set up the converter | 
|  | conv = ucnv_open("shift_jis", &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | // convert to Unicode | 
|  | // Note: we can use strlen, we know it's an 8 bit null terminated codepage | 
|  | target[6] = 0xFDCA; | 
|  | len = ucnv_toUChars(conv, target, 100, source, static_cast<int32_t>(strlen(source)), &status); | 
|  | U_ASSERT(status); | 
|  | // close the converter | 
|  | ucnv_close(conv); | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  |  | 
|  | // Print it out | 
|  | printBytes("src", source, static_cast<int32_t>(strlen(source)) ); | 
|  | printf("\n"); | 
|  | printUChars("targ", target, len); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  | /****************************************************************** | 
|  | C: Convert from codepage to Unicode one at a time. | 
|  | */ | 
|  |  | 
|  | UErrorCode convsample_13() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); | 
|  |  | 
|  |  | 
|  | const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; | 
|  | //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; | 
|  | const char *source, *sourceLimit; | 
|  | UChar32 target; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | UConverter *conv = NULL; | 
|  | int32_t srcCount=0; | 
|  | int32_t dstCount=0; | 
|  |  | 
|  | srcCount = sizeof(sourceChars); | 
|  |  | 
|  | conv = ucnv_open("Big5", &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | source = sourceChars; | 
|  | sourceLimit = sourceChars + sizeof(sourceChars); | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  |  | 
|  |  | 
|  | printBytes("src", source, static_cast<int32_t>(sourceLimit - source)); | 
|  |  | 
|  | while(source < sourceLimit) | 
|  | { | 
|  | puts(""); | 
|  | target = ucnv_getNextUChar (conv, | 
|  | &source, | 
|  | sourceLimit, | 
|  | &status); | 
|  |  | 
|  | //    printBytes("src",source,sourceLimit-source); | 
|  | U_ASSERT(status); | 
|  | printUChar(target); | 
|  | dstCount++; | 
|  | } | 
|  |  | 
|  |  | 
|  | // ************************** END SAMPLE ************************* | 
|  |  | 
|  | printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); | 
|  | ucnv_close(conv); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | UBool convsample_20_didSubstitute(const char *source) | 
|  | { | 
|  | UChar uchars[100]; | 
|  | char bytes[100]; | 
|  | UConverter *conv = NULL; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t len, len2; | 
|  | UBool  flagVal; | 
|  |  | 
|  | FromUFLAGContext * context = NULL; | 
|  |  | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 20: C: Test for substitution using callbacks\n"); | 
|  |  | 
|  | /* print out the original source */ | 
|  | printBytes("src", source); | 
|  | printf("\n"); | 
|  |  | 
|  | /* First, convert from UTF8 to unicode */ | 
|  | conv = ucnv_open("utf-8", &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | printUChars("uch", uchars, len); | 
|  | printf("\n"); | 
|  |  | 
|  | /* Now, close the converter */ | 
|  | ucnv_close(conv); | 
|  |  | 
|  | /* Now, convert to windows-1252 */ | 
|  | conv = ucnv_open("windows-1252", &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | /* Converter starts out with the SUBSTITUTE callback set. */ | 
|  |  | 
|  | /* initialize our callback */ | 
|  | context = flagCB_fromU_openContext(); | 
|  |  | 
|  | /* Set our special callback */ | 
|  | ucnv_setFromUCallBack(conv, | 
|  | flagCB_fromU, | 
|  | context, | 
|  | &(context->subCallback), | 
|  | &(context->subContext), | 
|  | &status); | 
|  |  | 
|  | U_ASSERT(status); | 
|  |  | 
|  | len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | flagVal = context->flag;  /* it's about to go away when we close the cnv */ | 
|  |  | 
|  | ucnv_close(conv); | 
|  |  | 
|  | /* print out the original source */ | 
|  | printBytes("bytes", bytes, len2); | 
|  |  | 
|  | return flagVal; /* true if callback was called */ | 
|  | } | 
|  |  | 
|  | UErrorCode convsample_20() | 
|  | { | 
|  | const char *sample1 = "abc\xdf\xbf"; | 
|  | const char *sample2 = "abc_def"; | 
|  |  | 
|  |  | 
|  | if(convsample_20_didSubstitute(sample1)) | 
|  | { | 
|  | printf("DID substitute.\n******\n"); | 
|  | } | 
|  | else | 
|  | { | 
|  | printf("Did NOT substitute.\n*****\n"); | 
|  | } | 
|  |  | 
|  | if(convsample_20_didSubstitute(sample2)) | 
|  | { | 
|  | printf("DID substitute.\n******\n"); | 
|  | } | 
|  | else | 
|  | { | 
|  | printf("Did NOT substitute.\n*****\n"); | 
|  | } | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  | // 21  - C, callback, with clone and debug | 
|  |  | 
|  |  | 
|  |  | 
|  | UBool convsample_21_didSubstitute(const char *source) | 
|  | { | 
|  | UChar uchars[100]; | 
|  | char bytes[100]; | 
|  | UConverter *conv = NULL, *cloneCnv = NULL; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t len, len2; | 
|  | UBool  flagVal = false; | 
|  | UConverterFromUCallback junkCB; | 
|  |  | 
|  | FromUFLAGContext *flagCtx = NULL, | 
|  | *cloneFlagCtx = NULL; | 
|  |  | 
|  | debugCBContext   *debugCtx1 = NULL, | 
|  | *debugCtx2 = NULL, | 
|  | *cloneDebugCtx = NULL; | 
|  |  | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 21: C: Test for substitution w/ callbacks & clones \n"); | 
|  |  | 
|  | /* print out the original source */ | 
|  | printBytes("src", source); | 
|  | printf("\n"); | 
|  |  | 
|  | /* First, convert from UTF8 to unicode */ | 
|  | conv = ucnv_open("utf-8", &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | len = ucnv_toUChars(conv, uchars, 100, source, static_cast<int32_t>(strlen(source)), &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | printUChars("uch", uchars, len); | 
|  | printf("\n"); | 
|  |  | 
|  | /* Now, close the converter */ | 
|  | ucnv_close(conv); | 
|  |  | 
|  | /* Now, convert to windows-1252 */ | 
|  | conv = ucnv_open("windows-1252", &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | /* Converter starts out with the SUBSTITUTE callback set. */ | 
|  |  | 
|  | /* initialize our callback */ | 
|  | /* from the 'bottom' innermost, out | 
|  | *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */ | 
|  |  | 
|  | #if DEBUG_TMI | 
|  | printf("flagCB_fromU = %p\n", &flagCB_fromU); | 
|  | printf("debugCB_fromU = %p\n", &debugCB_fromU); | 
|  | #endif | 
|  |  | 
|  | debugCtx1 = debugCB_openContext(); | 
|  | flagCtx  = flagCB_fromU_openContext(); | 
|  | debugCtx2 = debugCB_openContext(); | 
|  |  | 
|  | debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */ | 
|  | debugCtx1->subContext  =  flagCtx; | 
|  |  | 
|  | flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */ | 
|  | flagCtx->subContext    =  debugCtx2; | 
|  |  | 
|  | debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE; | 
|  | debugCtx2->subContext  = NULL; | 
|  |  | 
|  | /* Set our special callback */ | 
|  |  | 
|  | ucnv_setFromUCallBack(conv, | 
|  | debugCB_fromU, | 
|  | debugCtx1, | 
|  | &(debugCtx2->subCallback), | 
|  | &(debugCtx2->subContext), | 
|  | &status); | 
|  |  | 
|  | U_ASSERT(status); | 
|  |  | 
|  | #if DEBUG_TMI | 
|  | printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", | 
|  | conv, debugCtx1, debugCtx1->subCallback, | 
|  | debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); | 
|  | #endif | 
|  |  | 
|  | cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status); | 
|  |  | 
|  | U_ASSERT(status); | 
|  |  | 
|  | #if DEBUG_TMI | 
|  | printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv); | 
|  | #endif | 
|  |  | 
|  | ucnv_close(conv); | 
|  |  | 
|  | #if DEBUG_TMI | 
|  | printf("%p closed.\n", conv); | 
|  | #endif | 
|  |  | 
|  | U_ASSERT(status); | 
|  | /* Now, we have to extract the context */ | 
|  | cloneDebugCtx = NULL; | 
|  | cloneFlagCtx  = NULL; | 
|  |  | 
|  | ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); | 
|  | if(cloneDebugCtx != NULL) { | 
|  | cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; | 
|  | } | 
|  |  | 
|  | printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", | 
|  | cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); | 
|  |  | 
|  | len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); | 
|  | U_ASSERT(status); | 
|  |  | 
|  | if(cloneFlagCtx != NULL) { | 
|  | flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */ | 
|  | } else { | 
|  | printf("** Warning, couldn't get the subcallback \n"); | 
|  | } | 
|  |  | 
|  | ucnv_close(cloneCnv); | 
|  |  | 
|  | /* print out the original source */ | 
|  | printBytes("bytes", bytes, len2); | 
|  |  | 
|  | return flagVal; /* true if callback was called */ | 
|  | } | 
|  |  | 
|  | UErrorCode convsample_21() | 
|  | { | 
|  | const char *sample1 = "abc\xdf\xbf"; | 
|  | const char *sample2 = "abc_def"; | 
|  |  | 
|  | if(convsample_21_didSubstitute(sample1)) | 
|  | { | 
|  | printf("DID substitute.\n******\n"); | 
|  | } | 
|  | else | 
|  | { | 
|  | printf("Did NOT substitute.\n*****\n"); | 
|  | } | 
|  |  | 
|  | if(convsample_21_didSubstitute(sample2)) | 
|  | { | 
|  | printf("DID substitute.\n******\n"); | 
|  | } | 
|  | else | 
|  | { | 
|  | printf("Did NOT substitute.\n*****\n"); | 
|  | } | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  |  | 
|  |  | 
|  | //  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16] | 
|  |  | 
|  | #define BUFFERSIZE 17 /* make it interesting :) */ | 
|  |  | 
|  | UErrorCode convsample_40() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); | 
|  |  | 
|  | FILE *f; | 
|  | FILE *out; | 
|  | int32_t count; | 
|  | char inBuf[BUFFERSIZE]; | 
|  | const char *source; | 
|  | const char *sourceLimit; | 
|  | UChar *uBuf; | 
|  | UChar *target; | 
|  | UChar *targetLimit; | 
|  | int32_t uBufSize = 0; | 
|  | UConverter *conv = NULL; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t inbytes=0, total=0; | 
|  |  | 
|  | f = fopen("data02.bin", "rb"); | 
|  | if(!f) | 
|  | { | 
|  | fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | out = fopen("data40.utf16", "wb"); | 
|  | if(!out) | 
|  | { | 
|  | fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); | 
|  | fclose(f); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | conv = ucnv_openCCSID(37, UCNV_IBM, &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); | 
|  | printf("input bytes %d / min chars %d = %d UChars\n", | 
|  | BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); | 
|  | uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); | 
|  | assert(uBuf!=NULL); | 
|  |  | 
|  | // grab another buffer's worth | 
|  | while((!feof(f)) && | 
|  | ((count=static_cast<int32_t>(fread(inBuf, 1, BUFFERSIZE , f))) > 0) ) | 
|  | { | 
|  | inbytes += count; | 
|  |  | 
|  | // Convert bytes to unicode | 
|  | source = inBuf; | 
|  | sourceLimit = inBuf + count; | 
|  |  | 
|  | do | 
|  | { | 
|  | target = uBuf; | 
|  | targetLimit = uBuf + uBufSize; | 
|  |  | 
|  | ucnv_toUnicode( conv, &target, targetLimit, | 
|  | &source, sourceLimit, NULL, | 
|  | feof(f)?true:false,         /* pass 'flush' when eof */ | 
|  | /* is true (when no more data will come) */ | 
|  | &status); | 
|  |  | 
|  | if(status == U_BUFFER_OVERFLOW_ERROR) | 
|  | { | 
|  | // simply ran out of space - we'll reset the target ptr the next | 
|  | // time through the loop. | 
|  | status = U_ZERO_ERROR; | 
|  | } | 
|  | else | 
|  | { | 
|  | //  Check other errors here. | 
|  | assert(U_SUCCESS(status)); | 
|  | // Break out of the loop (by force) | 
|  | } | 
|  |  | 
|  | // Process the Unicode | 
|  | // Todo: handle UTF-16/surrogates | 
|  | assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == (size_t)(target-uBuf)); | 
|  | total += static_cast<uint32_t>((target-uBuf)); | 
|  | } while (source < sourceLimit); // while simply out of space | 
|  | } | 
|  |  | 
|  | printf("%d bytes in,  %d UChars out.\n", inbytes, total); | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  | ucnv_close(conv); | 
|  |  | 
|  | fclose(f); | 
|  | fclose(out); | 
|  | printf("\n"); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  | #undef BUFFERSIZE | 
|  |  | 
|  |  | 
|  |  | 
|  | //  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out] | 
|  |  | 
|  | #define BUFFERSIZE 24 /* make it interesting :) */ | 
|  |  | 
|  | UErrorCode convsample_46() | 
|  | { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); | 
|  |  | 
|  | FILE *f; | 
|  | FILE *out; | 
|  | int32_t count; | 
|  | UChar inBuf[BUFFERSIZE]; | 
|  | const UChar *source; | 
|  | const UChar *sourceLimit; | 
|  | char *buf; | 
|  | char *target; | 
|  | char *targetLimit; | 
|  |  | 
|  | int32_t bufSize = 0; | 
|  | UConverter *conv = NULL; | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | uint32_t inchars=0, total=0; | 
|  |  | 
|  | f = fopen("data40.utf16", "rb"); | 
|  | if(!f) | 
|  | { | 
|  | fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | out = fopen("data46.out", "wb"); | 
|  | if(!out) | 
|  | { | 
|  | fprintf(stderr, "Couldn't create file 'data46.out'.\n"); | 
|  | fclose(f); | 
|  | return U_FILE_ACCESS_ERROR; | 
|  | } | 
|  |  | 
|  | // **************************** START SAMPLE ******************* | 
|  | conv = ucnv_open( "iso-8859-2", &status); | 
|  | assert(U_SUCCESS(status)); | 
|  |  | 
|  | bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); | 
|  | printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", | 
|  | BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); | 
|  | buf = (char*)malloc(bufSize * sizeof(char)); | 
|  | assert(buf!=NULL); | 
|  |  | 
|  | // grab another buffer's worth | 
|  | while((!feof(f)) && | 
|  | ((count=static_cast<int32_t>(fread(inBuf, sizeof(UChar), BUFFERSIZE , f))) > 0) ) | 
|  | { | 
|  | inchars += count; | 
|  |  | 
|  | // Convert bytes to unicode | 
|  | source = inBuf; | 
|  | sourceLimit = inBuf + count; | 
|  |  | 
|  | do | 
|  | { | 
|  | target = buf; | 
|  | targetLimit = buf + bufSize; | 
|  |  | 
|  | ucnv_fromUnicode( conv, &target, targetLimit, | 
|  | &source, sourceLimit, NULL, | 
|  | feof(f)?true:false,         /* pass 'flush' when eof */ | 
|  | /* is true (when no more data will come) */ | 
|  | &status); | 
|  |  | 
|  | if(status == U_BUFFER_OVERFLOW_ERROR) | 
|  | { | 
|  | // simply ran out of space - we'll reset the target ptr the next | 
|  | // time through the loop. | 
|  | status = U_ZERO_ERROR; | 
|  | } | 
|  | else | 
|  | { | 
|  | //  Check other errors here. | 
|  | assert(U_SUCCESS(status)); | 
|  | // Break out of the loop (by force) | 
|  | } | 
|  |  | 
|  | // Process the Unicode | 
|  | assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == (size_t)(target-buf)); | 
|  | total += static_cast<uint32_t>((target-buf)); | 
|  | } while (source < sourceLimit); // while simply out of space | 
|  | } | 
|  |  | 
|  | printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, static_cast<int>(inchars * sizeof(UChar)), total); | 
|  |  | 
|  | // ***************************** END SAMPLE ******************** | 
|  | ucnv_close(conv); | 
|  |  | 
|  | fclose(f); | 
|  | fclose(out); | 
|  | printf("\n"); | 
|  |  | 
|  | return U_ZERO_ERROR; | 
|  | } | 
|  | #undef BUFFERSIZE | 
|  |  | 
|  | #define BUFFERSIZE 219 | 
|  |  | 
|  | void convsample_50() { | 
|  | printf("\n\n==============================================\n" | 
|  | "Sample 50: C: ucnv_detectUnicodeSignature\n"); | 
|  |  | 
|  | //! [ucnv_detectUnicodeSignature] | 
|  | UErrorCode err = U_ZERO_ERROR; | 
|  | UBool discardSignature = true; /* set to true to throw away the initial U+FEFF */ | 
|  | char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; | 
|  | int32_t signatureLength = 0; | 
|  | const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); | 
|  | UConverter *conv = NULL; | 
|  | UChar output[100]; | 
|  | UChar *target = output, *out; | 
|  | const char *source = input; | 
|  | if(encoding!=NULL && U_SUCCESS(err)){ | 
|  | // should signature be discarded ? | 
|  | conv = ucnv_open(encoding, &err); | 
|  | // do the conversion | 
|  | ucnv_toUnicode(conv, | 
|  | &target, output + UPRV_LENGTHOF(output), | 
|  | &source, input + sizeof(input), | 
|  | NULL, true, &err); | 
|  | out = output; | 
|  | if (discardSignature){ | 
|  | ++out; // ignore initial U+FEFF | 
|  | } | 
|  | while(out != target) { | 
|  | printf("%04x ", *out++); | 
|  | } | 
|  | puts(""); | 
|  | } | 
|  | //! [ucnv_detectUnicodeSignature] | 
|  | puts(""); | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | /* main */ | 
|  |  | 
|  | int main() | 
|  | { | 
|  |  | 
|  | printf("Default Converter=%s\n", ucnv_getDefaultName() ); | 
|  |  | 
|  | convsample_02();  // C  , u->koi8r, conv | 
|  | convsample_03();  // C,   iterate | 
|  |  | 
|  | convsample_05();  // C,  utf8->u, getNextUChar | 
|  | convsample_06(); // C freq counter thingy | 
|  |  | 
|  | convsample_12();  // C,  sjis->u, conv | 
|  | convsample_13();  // C,  big5->u, getNextU | 
|  |  | 
|  | convsample_20();  // C, callback | 
|  | convsample_21();  // C, callback debug | 
|  |  | 
|  | convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16] | 
|  |  | 
|  | convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out] | 
|  |  | 
|  | convsample_50();  // C, detect unicode signature | 
|  |  | 
|  | printf("End of converter samples.\n"); | 
|  |  | 
|  | fflush(stdout); | 
|  | fflush(stderr); | 
|  |  | 
|  | return 0; | 
|  | } |