| /******************************************************************** |
| * COPYRIGHT: |
| * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. |
| * |
| ********************************************************************/ |
| /******************************************************************************** |
| * |
| * File ubrkperf.cpp |
| * |
| * Modification History: |
| * Name Description |
| * Vladimir Weinstein First Version, based on collperf |
| * |
| ********************************************************************************* |
| */ |
| |
| // |
| // This program tests break iterator performance |
| // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs |
| // (if any) |
| // A text file is required as input. It must be in utf-8 or utf-16 format, |
| // and include a byte order mark. Either LE or BE format is OK. |
| // |
| |
// Usage text printed by main() when option parsing fails, when -help / -? is
// given, or when no -file was supplied.  Every option listed here has an entry
// in the opts[] table; options that are parsed but not acted upon are marked.
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help            Display this message.\n"
    "-file file_name  utf-16/utf-8 format file.\n"
    "-locale name     ICU locale to use. Default is en_US\n"
    "-langid 0x1234   Windows Language ID number. Default to value for -locale option\n"
    "                 see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win             Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix            Run test using Unix word breaking services. (currently not working) \n"
    "-mac             Run test using MacOSX word breaking services.\n"
    "-uselen          Use API with string lengths. Default is null-terminated strings\n"
    "-char            Use character break iterator\n"
    "-word            Use word break iterator\n"
    "-line            Use line break iterator\n"
    "-sentence        Use sentence break iterator\n"
    "-loop nnnn       Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n        Number of times the selected test is run. Default = 1.\n"
    "-time n          Accepted, but currently has no effect.\n"
    "-terse           Terse numbers-only output. Intended for use by scripts.\n"
    "-dump            Display stuff.\n"
    "-capi            Use C APIs instead of C++ APIs (currently not working)\n"
    "-next            Do the next test\n"
    "-isBound         Do the isBound test\n"
    ;
| |
| |
| #include <stdio.h> |
| #include <string.h> |
| #include <stdlib.h> |
| #include <math.h> |
| #include <locale.h> |
| #include <errno.h> |
| #include <sys/stat.h> |
| |
| #include <unicode/utypes.h> |
| #include <unicode/ucol.h> |
| #include <unicode/ucoleitr.h> |
| #include <unicode/uloc.h> |
| #include <unicode/ustring.h> |
| #include <unicode/ures.h> |
| #include <unicode/uchar.h> |
| #include <unicode/ucnv.h> |
| #include <unicode/utf8.h> |
| |
| #include <unicode/brkiter.h> |
| |
| |
| #if U_PLATFORM_HAS_WIN32_API |
| #include <windows.h> |
| #else |
| // |
| // Stubs for Windows API functions when building on UNIXes. |
| // |
| #include <sys/time.h> |
// Stand-in for the Win32 timeGetTime() on non-Windows builds.
// Returns the current time of day in milliseconds.  The value is allowed to
// wrap around; callers only ever use the difference of two readings.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);

    // Convert to unsigned before scaling: unsigned arithmetic wraps with
    // well-defined behavior, whereas overflowing a signed time_t would not.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
| #define MAKELCID(a,b) 0 |
| #endif |
| |
| |
| // |
| // Command line option variables |
| // These global variables are set according to the options specified |
| // on the command line by the user. |
| char * opt_fName = 0; |
| char * opt_locale = "en_US"; |
| int opt_langid = 0; // Defaults to value corresponding to opt_locale. |
| char * opt_rules = 0; |
| UBool opt_help = FALSE; |
| int opt_time = 0; |
| int opt_loopCount = 0; |
| int opt_passesCount= 1; |
| UBool opt_terse = FALSE; |
| UBool opt_icu = TRUE; |
| UBool opt_win = FALSE; // Run with Windows native functions. |
| UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. |
| UBool opt_mac = FALSE; // Run with MacOSX word break services. |
| UBool opt_uselen = FALSE; |
| UBool opt_dump = FALSE; |
| UBool opt_char = FALSE; |
| UBool opt_word = FALSE; |
| UBool opt_line = FALSE; |
| UBool opt_sentence = FALSE; |
| UBool opt_capi = FALSE; |
| |
| UBool opt_next = FALSE; |
| UBool opt_isBound = FALSE; |
| |
| |
| |
| // |
| // Definitions for the command line options |
| // |
// One row of the command line option table.
struct OptSpec {
    // How the option is parsed:
    //   FLAG   - no argument; the target is a UBool that gets set to TRUE.
    //   NUM    - followed by an integer; the target is an int.
    //   STRING - followed by a string; the target is a character pointer.
    enum Kind { FLAG, NUM, STRING };

    const char *name;   // Option text as typed, e.g. "-file"; 0 ends the table.
    Kind        type;   // Parsing rule for this option.
    void       *pVar;   // Address of the global variable that receives the value.
};
| |
| OptSpec opts[] = { |
| {"-file", OptSpec::STRING, &opt_fName}, |
| {"-locale", OptSpec::STRING, &opt_locale}, |
| {"-langid", OptSpec::NUM, &opt_langid}, |
| {"-win", OptSpec::FLAG, &opt_win}, |
| {"-unix", OptSpec::FLAG, &opt_unix}, |
| {"-mac", OptSpec::FLAG, &opt_mac}, |
| {"-uselen", OptSpec::FLAG, &opt_uselen}, |
| {"-loop", OptSpec::NUM, &opt_loopCount}, |
| {"-time", OptSpec::NUM, &opt_time}, |
| {"-passes", OptSpec::NUM, &opt_passesCount}, |
| {"-char", OptSpec::FLAG, &opt_char}, |
| {"-word", OptSpec::FLAG, &opt_word}, |
| {"-line", OptSpec::FLAG, &opt_line}, |
| {"-sentence", OptSpec::FLAG, &opt_sentence}, |
| {"-terse", OptSpec::FLAG, &opt_terse}, |
| {"-dump", OptSpec::FLAG, &opt_dump}, |
| {"-capi", OptSpec::FLAG, &opt_capi}, |
| {"-next", OptSpec::FLAG, &opt_next}, |
| {"-isBound", OptSpec::FLAG, &opt_isBound}, |
| {"-help", OptSpec::FLAG, &opt_help}, |
| {"-?", OptSpec::FLAG, &opt_help}, |
| {0, OptSpec::FLAG, 0} |
| }; |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // Global variables pointing to and describing the test file |
| // |
| //--------------------------------------------------------------------------- |
| |
| //DWORD gWinLCID; |
| BreakIterator *brkit = NULL; |
| UChar *text = NULL; |
| int32_t textSize = 0; |
| |
| |
| |
| #if U_PLATFORM_IS_DARWIN_BASED |
| #include <ApplicationServices/ApplicationServices.h> |
| enum{ |
| kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) |
| }; |
| UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; |
| TextBreakLocatorRef breakRef; |
| UCTextBreakType macBreakType; |
| |
| void createMACBrkIt() { |
| OSStatus status = noErr; |
| LocaleRef lref; |
| status = LocaleRefFromLocaleString(opt_locale, &lref); |
| status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); |
| if(opt_char == TRUE) { |
| macBreakType = kUCTextBreakClusterMask; |
| } else if(opt_word == TRUE) { |
| macBreakType = kUCTextBreakWordMask; |
| } else if(opt_line == TRUE) { |
| macBreakType = kUCTextBreakLineMask; |
| } else if(opt_sentence == TRUE) { |
| // error |
| // brkit = BreakIterator::createSentenceInstance(opt_locale, status); |
| } else { |
| // default is character iterator |
| macBreakType = kUCTextBreakClusterMask; |
| } |
| } |
| #endif |
| |
| void createICUBrkIt() { |
| // |
| // Set up an ICU break iterator |
| // |
| UErrorCode status = U_ZERO_ERROR; |
| if(opt_char == TRUE) { |
| brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
| } else if(opt_word == TRUE) { |
| brkit = BreakIterator::createWordInstance(opt_locale, status); |
| } else if(opt_line == TRUE) { |
| brkit = BreakIterator::createLineInstance(opt_locale, status); |
| } else if(opt_sentence == TRUE) { |
| brkit = BreakIterator::createSentenceInstance(opt_locale, status); |
| } else { |
| // default is character iterator |
| brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
| } |
| if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { |
| fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); |
| } |
| if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { |
| fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); |
| } |
| |
| } |
| |
| //--------------------------------------------------------------------------- |
| // |
| // ProcessOptions() Function to read the command line options. |
| // |
| //--------------------------------------------------------------------------- |
| UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) |
| { |
| int i; |
| int argNum; |
| const char *pArgName; |
| OptSpec *pOpt; |
| |
| for (argNum=1; argNum<argc; argNum++) { |
| pArgName = argv[argNum]; |
| for (pOpt = opts; pOpt->name != 0; pOpt++) { |
| if (strcmp(pOpt->name, pArgName) == 0) { |
| switch (pOpt->type) { |
| case OptSpec::FLAG: |
| *(UBool *)(pOpt->pVar) = TRUE; |
| break; |
| case OptSpec::STRING: |
| argNum ++; |
| if (argNum >= argc) { |
| fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
| return FALSE; |
| } |
| *(const char **)(pOpt->pVar) = argv[argNum]; |
| break; |
| case OptSpec::NUM: |
| argNum ++; |
| if (argNum >= argc) { |
| fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
| return FALSE; |
| } |
| char *endp; |
| i = strtol(argv[argNum], &endp, 0); |
| if (endp == argv[argNum]) { |
| fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); |
| return FALSE; |
| } |
| *(int *)(pOpt->pVar) = i; |
| } |
| break; |
| } |
| } |
| if (pOpt->name == 0) |
| { |
| fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); |
| return FALSE; |
| } |
| } |
| return TRUE; |
| } |
| |
| |
| void doForwardTest() { |
| if (opt_terse == FALSE) { |
| printf("Doing the forward test\n"); |
| } |
| int32_t noBreaks = 0; |
| int32_t i = 0; |
| unsigned long startTime = timeGetTime(); |
| unsigned long elapsedTime = 0; |
| if(opt_icu) { |
| createICUBrkIt(); |
| brkit->setText(UnicodeString(text, textSize)); |
| brkit->first(); |
| if (opt_terse == FALSE) { |
| printf("Warmup\n"); |
| } |
| int j; |
| while((j = brkit->next()) != BreakIterator::DONE) { |
| noBreaks++; |
| //fprintf(stderr, "%d ", j); |
| } |
| |
| if (opt_terse == FALSE) { |
| printf("Measure\n"); |
| } |
| startTime = timeGetTime(); |
| for(i = 0; i < opt_loopCount; i++) { |
| brkit->first(); |
| while(brkit->next() != BreakIterator::DONE) { |
| } |
| } |
| |
| elapsedTime = timeGetTime()-startTime; |
| } else if(opt_mac) { |
| #if U_PLATFORM_IS_DARWIN_BASED |
| createMACBrkIt(); |
| UniChar* filePtr = text; |
| OSStatus status = noErr; |
| UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; |
| startOffset = 0; |
| //printf("\t---Search forward--\n"); |
| |
| while (startOffset < numUniChars) |
| { |
| status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, |
| startOffset, &breakOffset); |
| //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); |
| //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); |
| |
| // Output break |
| //printf("\t%d\n", (int)breakOffset); |
| |
| // Increment counters |
| noBreaks++; |
| startOffset = breakOffset; |
| } |
| startTime = timeGetTime(); |
| for(i = 0; i < opt_loopCount; i++) { |
| startOffset = 0; |
| |
| while (startOffset < numUniChars) |
| { |
| status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, |
| startOffset, &breakOffset); |
| // Increment counters |
| startOffset = breakOffset; |
| } |
| } |
| elapsedTime = timeGetTime()-startTime; |
| UCDisposeTextBreakLocator(&breakRef); |
| #endif |
| |
| |
| } |
| |
| |
| if (opt_terse == FALSE) { |
| int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); |
| int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); |
| int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); |
| printf("forward break iteration average loop time %d\n", loopTime); |
| printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); |
| printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); |
| } else { |
| printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| } |
| |
| |
| } |
| |
| void doIsBoundTest() { |
| int32_t noBreaks = 0, hit = 0; |
| int32_t i = 0, j = 0; |
| unsigned long startTime = timeGetTime(); |
| unsigned long elapsedTime = 0; |
| createICUBrkIt(); |
| brkit->setText(UnicodeString(text, textSize)); |
| brkit->first(); |
| for(j = 0; j < textSize; j++) { |
| if(brkit->isBoundary(j)) { |
| noBreaks++; |
| //fprintf(stderr, "%d ", j); |
| } |
| } |
| /* |
| while(brkit->next() != BreakIterator::DONE) { |
| noBreaks++; |
| } |
| */ |
| |
| startTime = timeGetTime(); |
| for(i = 0; i < opt_loopCount; i++) { |
| for(j = 0; j < textSize; j++) { |
| if(brkit->isBoundary(j)) { |
| hit++; |
| } |
| } |
| } |
| |
| elapsedTime = timeGetTime()-startTime; |
| int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); |
| if (opt_terse == FALSE) { |
| int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); |
| int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); |
| printf("forward break iteration average loop time %d\n", loopTime); |
| printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); |
| printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); |
| } else { |
| printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| } |
| } |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // UnixConvert -- Convert the lines of the file to the encoding for UNIX |
| // Since it appears that Unicode support is going in the general |
| // direction of the use of UTF-8 locales, that is the approach |
| // that is used here. |
| // |
| //---------------------------------------------------------------------------------------- |
// Currently a no-op: the whole body is disabled with "#if 0".  The disabled
// code refers to gNumFileLines and gFileLines, which are not defined in this
// file; it is kept only as a starting point for a future Unix implementation.
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr;    // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ICU Converter open failed.: %d\n", &status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1,              // length of target buffer.
                                     gFileLines[line].name,
                                     -1,                        // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    };
    ucnv_close(cvrtr);
#endif
}
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // class UCharFile Class to hide all the gorp to read a file in |
| // and produce a stream of UChars. |
| // |
| //---------------------------------------------------------------------------------------- |
| class UCharFile { |
| public: |
| UCharFile(const char *fileName); |
| ~UCharFile(); |
| UChar get(); |
| UBool eof() {return fEof;}; |
| UBool error() {return fError;}; |
| int32_t size() { return fFileSize; }; |
| |
| private: |
| UCharFile (const UCharFile &other) {}; // No copy constructor. |
| UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op |
| |
| FILE *fFile; |
| const char *fName; |
| UBool fEof; |
| UBool fError; |
| UChar fPending2ndSurrogate; |
| int32_t fFileSize; |
| |
| enum {UTF16LE, UTF16BE, UTF8} fEncoding; |
| }; |
| |
| UCharFile::UCharFile(const char * fileName) { |
| fEof = FALSE; |
| fError = FALSE; |
| fName = fileName; |
| struct stat buf; |
| int32_t result = stat(fileName, &buf); |
| if(result != 0) { |
| fprintf(stderr, "Error getting info\n"); |
| fFileSize = -1; |
| } else { |
| fFileSize = buf.st_size; |
| } |
| fFile = fopen(fName, "rb"); |
| fPending2ndSurrogate = 0; |
| if (fFile == NULL) { |
| fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); |
| fError = TRUE; |
| return; |
| } |
| // |
| // Look for the byte order mark at the start of the file. |
| // |
| int BOMC1, BOMC2, BOMC3; |
| BOMC1 = fgetc(fFile); |
| BOMC2 = fgetc(fFile); |
| |
| if (BOMC1 == 0xff && BOMC2 == 0xfe) { |
| fEncoding = UTF16LE; } |
| else if (BOMC1 == 0xfe && BOMC2 == 0xff) { |
| fEncoding = UTF16BE; } |
| else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { |
| fEncoding = UTF8; } |
| else |
| { |
| fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " |
| "must include a BOM.\n", fileName); |
| fError = true; |
| return; |
| } |
| } |
| |
| |
| UCharFile::~UCharFile() { |
| fclose(fFile); |
| } |
| |
| |
| |
| UChar UCharFile::get() { |
| UChar c; |
| switch (fEncoding) { |
| case UTF16LE: |
| { |
| int cL, cH; |
| cL = fgetc(fFile); |
| cH = fgetc(fFile); |
| c = cL | (cH << 8); |
| if (cH == EOF) { |
| c = 0; |
| fEof = TRUE; |
| } |
| break; |
| } |
| case UTF16BE: |
| { |
| int cL, cH; |
| cH = fgetc(fFile); |
| cL = fgetc(fFile); |
| c = cL | (cH << 8); |
| if (cL == EOF) { |
| c = 0; |
| fEof = TRUE; |
| } |
| break; |
| } |
| case UTF8: |
| { |
| if (fPending2ndSurrogate != 0) { |
| c = fPending2ndSurrogate; |
| fPending2ndSurrogate = 0; |
| break; |
| } |
| |
| int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. |
| if (ch == EOF) { |
| c = 0; |
| fEof = TRUE; |
| break; |
| } |
| |
| if (ch <= 0x7f) { |
| // It's ascii. No further utf-8 conversion. |
| c = ch; |
| break; |
| } |
| |
| // Figure out the lenght of the char and read the rest of the bytes |
| // into a temp array. |
| int nBytes; |
| if (ch >= 0xF0) {nBytes=4;} |
| else if (ch >= 0xE0) {nBytes=3;} |
| else if (ch >= 0xC0) {nBytes=2;} |
| else { |
| fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); |
| fError = TRUE; |
| return 0; |
| } |
| |
| unsigned char bytes[10]; |
| bytes[0] = (unsigned char)ch; |
| int i; |
| for (i=1; i<nBytes; i++) { |
| bytes[i] = fgetc(fFile); |
| if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { |
| fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); |
| fError = TRUE; |
| return 0; |
| } |
| } |
| |
| // Convert the bytes from the temp array to a Unicode char. |
| i = 0; |
| uint32_t cp; |
| U8_NEXT_UNSAFE(bytes, i, cp); |
| c = (UChar)cp; |
| |
| if (cp >= 0x10000) { |
| // The code point needs to be broken up into a utf-16 surrogate pair. |
| // Process first half this time through the main loop, and |
| // remember the other half for the next time through. |
| UChar utf16Buf[3]; |
| i = 0; |
| UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); |
| fPending2ndSurrogate = utf16Buf[1]; |
| c = utf16Buf[0]; |
| } |
| break; |
| }; |
| } |
| return c; |
| } |
| |
| |
| //---------------------------------------------------------------------------------------- |
| // |
| // Main -- process command line, read in and pre-process the test file, |
| // call other functions to do the actual tests. |
| // |
| //---------------------------------------------------------------------------------------- |
| int main(int argc, const char** argv) { |
| if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { |
| printf(gUsageString); |
| exit (1); |
| } |
| // Make sure that we've only got one API selected. |
| if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; |
| if (opt_mac || opt_unix) opt_win = FALSE; |
| if (opt_mac) opt_unix = FALSE; |
| |
| UErrorCode status = U_ZERO_ERROR; |
| |
| |
| |
| // |
| // Set up a Windows LCID |
| // |
| /* |
| if (opt_langid != 0) { |
| gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); |
| } |
| else { |
| gWinLCID = uloc_getLCID(opt_locale); |
| } |
| */ |
| |
| // |
| // Set the UNIX locale |
| // |
| if (opt_unix) { |
| if (setlocale(LC_ALL, opt_locale) == 0) { |
| fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); |
| exit(-1); |
| } |
| } |
| |
| // Read in the input file. |
| // File assumed to be utf-16. |
| // Lines go onto heap buffers. Global index array to line starts is created. |
| // Lines themselves are null terminated. |
| // |
| |
| UCharFile f(opt_fName); |
| if (f.error()) { |
| exit(-1); |
| } |
| int32_t fileSize = f.size(); |
| const int STARTSIZE = 70000; |
| int32_t bufSize = 0; |
| int32_t charCount = 0; |
| if(fileSize != -1) { |
| text = (UChar *)malloc(fileSize*sizeof(UChar)); |
| bufSize = fileSize; |
| } else { |
| text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); |
| bufSize = STARTSIZE; |
| } |
| if(text == NULL) { |
| fprintf(stderr, "Allocating buffer failed\n"); |
| exit(-1); |
| } |
| |
| |
| // Read the file, split into lines, and save in memory. |
| // Loop runs once per utf-16 value from the input file, |
| // (The number of bytes read from file per loop iteration depends on external encoding.) |
| for (;;) { |
| |
| UChar c = f.get(); |
| if(f.eof()) { |
| break; |
| } |
| if (f.error()){ |
| exit(-1); |
| } |
| // We now have a good UTF-16 value in c. |
| text[charCount++] = c; |
| if(charCount == bufSize) { |
| text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); |
| if(text == NULL) { |
| fprintf(stderr, "Reallocating buffer failed\n"); |
| exit(-1); |
| } |
| bufSize *= 2; |
| } |
| } |
| |
| |
| if (opt_terse == FALSE) { |
| printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); |
| } |
| |
| textSize = charCount; |
| |
| |
| |
| |
| // |
| // Dump file contents if requested. |
| // |
| if (opt_dump) { |
| // dump file, etc... possibly |
| } |
| |
| |
| // |
| // We've got the file read into memory. Go do something with it. |
| // |
| int32_t i = 0; |
| for(i = 0; i < opt_passesCount; i++) { |
| if(opt_loopCount != 0) { |
| if(opt_next) { |
| doForwardTest(); |
| } else if(opt_isBound) { |
| doIsBoundTest(); |
| } else { |
| doForwardTest(); |
| } |
| } else if(opt_time != 0) { |
| |
| } |
| } |
| |
| if(text != NULL) { |
| free(text); |
| } |
| if(brkit != NULL) { |
| delete brkit; |
| } |
| |
| return 0; |
| } |