| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1999-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: unistr_cnv.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:2 |
| * |
| * created on: 2004aug19 |
| * created by: Markus W. Scherer |
| * |
| * Character conversion functions moved here from unistr.cpp |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_CONVERSION |
| |
| #include "unicode/putil.h" |
| #include "cstring.h" |
| #include "cmemory.h" |
| #include "unicode/ustring.h" |
| #include "unicode/unistr.h" |
| #include "unicode/ucnv.h" |
| #include "ucnv_imp.h" |
| #include "putilimp.h" |
| #include "ustr_cnv.h" |
| #include "ustr_imp.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| //======================================== |
| // Constructors |
| //======================================== |
| |
| #if !U_CHARSET_IS_UTF8 |
| |
| UnicodeString::UnicodeString(const char *codepageData) { |
| fUnion.fFields.fLengthAndFlags = kShortString; |
| if(codepageData != 0) { |
| doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); |
| } |
| } |
| |
| UnicodeString::UnicodeString(const char *codepageData, |
| int32_t dataLength) { |
| fUnion.fFields.fLengthAndFlags = kShortString; |
| if(codepageData != 0) { |
| doCodepageCreate(codepageData, dataLength, 0); |
| } |
| } |
| |
| // else see unistr.cpp |
| #endif |
| |
| UnicodeString::UnicodeString(const char *codepageData, |
| const char *codepage) { |
| fUnion.fFields.fLengthAndFlags = kShortString; |
| if(codepageData != 0) { |
| doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); |
| } |
| } |
| |
| UnicodeString::UnicodeString(const char *codepageData, |
| int32_t dataLength, |
| const char *codepage) { |
| fUnion.fFields.fLengthAndFlags = kShortString; |
| if(codepageData != 0) { |
| doCodepageCreate(codepageData, dataLength, codepage); |
| } |
| } |
| |
| UnicodeString::UnicodeString(const char *src, int32_t srcLength, |
| UConverter *cnv, |
| UErrorCode &errorCode) { |
| fUnion.fFields.fLengthAndFlags = kShortString; |
| if(U_SUCCESS(errorCode)) { |
| // check arguments |
| if(src==NULL) { |
| // treat as an empty string, do nothing more |
| } else if(srcLength<-1) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| } else { |
| // get input length |
| if(srcLength==-1) { |
| srcLength=(int32_t)uprv_strlen(src); |
| } |
| if(srcLength>0) { |
| if(cnv!=0) { |
| // use the provided converter |
| ucnv_resetToUnicode(cnv); |
| doCodepageCreate(src, srcLength, cnv, errorCode); |
| } else { |
| // use the default converter |
| cnv=u_getDefaultConverter(&errorCode); |
| doCodepageCreate(src, srcLength, cnv, errorCode); |
| u_releaseDefaultConverter(cnv); |
| } |
| } |
| } |
| |
| if(U_FAILURE(errorCode)) { |
| setToBogus(); |
| } |
| } |
| } |
| |
| //======================================== |
| // Codeset conversion |
| //======================================== |
| |
| #if !U_CHARSET_IS_UTF8 |
| |
| int32_t |
| UnicodeString::extract(int32_t start, |
| int32_t length, |
| char *target, |
| uint32_t dstSize) const { |
| return extract(start, length, target, dstSize, 0); |
| } |
| |
| // else see unistr.cpp |
| #endif |
| |
| int32_t |
| UnicodeString::extract(int32_t start, |
| int32_t length, |
| char *target, |
| uint32_t dstSize, |
| const char *codepage) const |
| { |
| // if the arguments are illegal, then do nothing |
| if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { |
| return 0; |
| } |
| |
| // pin the indices to legal values |
| pinIndices(start, length); |
| |
| // We need to cast dstSize to int32_t for all subsequent code. |
| // I don't know why the API was defined with uint32_t but we are stuck with it. |
| // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize |
| // as a limit in some functions, it may wrap around and yield a pointer |
| // that compares less-than target. |
| int32_t capacity; |
| if(dstSize < 0x7fffffff) { |
| // Assume that the capacity is real and a limit pointer won't wrap around. |
| capacity = (int32_t)dstSize; |
| } else { |
| // Pin the capacity so that a limit pointer does not wrap around. |
| char *targetLimit = (char *)U_MAX_PTR(target); |
| // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff |
| // greater than target and does not wrap around the top of the address space. |
| capacity = (int32_t)(targetLimit - target); |
| } |
| |
| // create the converter |
| UConverter *converter; |
| UErrorCode status = U_ZERO_ERROR; |
| |
| // just write the NUL if the string length is 0 |
| if(length == 0) { |
| return u_terminateChars(target, capacity, 0, &status); |
| } |
| |
| // if the codepage is the default, use our cache |
| // if it is an empty string, then use the "invariant character" conversion |
| if (codepage == 0) { |
| const char *defaultName = ucnv_getDefaultName(); |
| if(UCNV_FAST_IS_UTF8(defaultName)) { |
| return toUTF8(start, length, target, capacity); |
| } |
| converter = u_getDefaultConverter(&status); |
| } else if (*codepage == 0) { |
| // use the "invariant characters" conversion |
| int32_t destLength; |
| if(length <= capacity) { |
| destLength = length; |
| } else { |
| destLength = capacity; |
| } |
| u_UCharsToChars(getArrayStart() + start, target, destLength); |
| return u_terminateChars(target, capacity, length, &status); |
| } else { |
| converter = ucnv_open(codepage, &status); |
| } |
| |
| length = doExtract(start, length, target, capacity, converter, status); |
| |
| // close the converter |
| if (codepage == 0) { |
| u_releaseDefaultConverter(converter); |
| } else { |
| ucnv_close(converter); |
| } |
| |
| return length; |
| } |
| |
| int32_t |
| UnicodeString::extract(char *dest, int32_t destCapacity, |
| UConverter *cnv, |
| UErrorCode &errorCode) const |
| { |
| if(U_FAILURE(errorCode)) { |
| return 0; |
| } |
| |
| if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| // nothing to do? |
| if(isEmpty()) { |
| return u_terminateChars(dest, destCapacity, 0, &errorCode); |
| } |
| |
| // get the converter |
| UBool isDefaultConverter; |
| if(cnv==0) { |
| isDefaultConverter=TRUE; |
| cnv=u_getDefaultConverter(&errorCode); |
| if(U_FAILURE(errorCode)) { |
| return 0; |
| } |
| } else { |
| isDefaultConverter=FALSE; |
| ucnv_resetFromUnicode(cnv); |
| } |
| |
| // convert |
| int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); |
| |
| // release the converter |
| if(isDefaultConverter) { |
| u_releaseDefaultConverter(cnv); |
| } |
| |
| return len; |
| } |
| |
| int32_t |
| UnicodeString::doExtract(int32_t start, int32_t length, |
| char *dest, int32_t destCapacity, |
| UConverter *cnv, |
| UErrorCode &errorCode) const |
| { |
| if(U_FAILURE(errorCode)) { |
| if(destCapacity!=0) { |
| *dest=0; |
| } |
| return 0; |
| } |
| |
| const UChar *src=getArrayStart()+start, *srcLimit=src+length; |
| char *originalDest=dest; |
| const char *destLimit; |
| |
| if(destCapacity==0) { |
| destLimit=dest=0; |
| } else if(destCapacity==-1) { |
| // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. |
| destLimit=(char*)U_MAX_PTR(dest); |
| // for NUL-termination, translate into highest int32_t |
| destCapacity=0x7fffffff; |
| } else { |
| destLimit=dest+destCapacity; |
| } |
| |
| // perform the conversion |
| ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
| length=(int32_t)(dest-originalDest); |
| |
| // if an overflow occurs, then get the preflighting length |
| if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| char buffer[1024]; |
| |
| destLimit=buffer+sizeof(buffer); |
| do { |
| dest=buffer; |
| errorCode=U_ZERO_ERROR; |
| ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); |
| length+=(int32_t)(dest-buffer); |
| } while(errorCode==U_BUFFER_OVERFLOW_ERROR); |
| } |
| |
| return u_terminateChars(originalDest, destCapacity, length, &errorCode); |
| } |
| |
| void |
| UnicodeString::doCodepageCreate(const char *codepageData, |
| int32_t dataLength, |
| const char *codepage) |
| { |
| // if there's nothing to convert, do nothing |
| if(codepageData == 0 || dataLength == 0 || dataLength < -1) { |
| return; |
| } |
| if(dataLength == -1) { |
| dataLength = (int32_t)uprv_strlen(codepageData); |
| } |
| |
| UErrorCode status = U_ZERO_ERROR; |
| |
| // create the converter |
| // if the codepage is the default, use our cache |
| // if it is an empty string, then use the "invariant character" conversion |
| UConverter *converter; |
| if (codepage == 0) { |
| const char *defaultName = ucnv_getDefaultName(); |
| if(UCNV_FAST_IS_UTF8(defaultName)) { |
| setToUTF8(StringPiece(codepageData, dataLength)); |
| return; |
| } |
| converter = u_getDefaultConverter(&status); |
| } else if(*codepage == 0) { |
| // use the "invariant characters" conversion |
| if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { |
| u_charsToUChars(codepageData, getArrayStart(), dataLength); |
| setLength(dataLength); |
| } else { |
| setToBogus(); |
| } |
| return; |
| } else { |
| converter = ucnv_open(codepage, &status); |
| } |
| |
| // if we failed, set the appropriate flags and return |
| if(U_FAILURE(status)) { |
| setToBogus(); |
| return; |
| } |
| |
| // perform the conversion |
| doCodepageCreate(codepageData, dataLength, converter, status); |
| if(U_FAILURE(status)) { |
| setToBogus(); |
| } |
| |
| // close the converter |
| if(codepage == 0) { |
| u_releaseDefaultConverter(converter); |
| } else { |
| ucnv_close(converter); |
| } |
| } |
| |
| void |
| UnicodeString::doCodepageCreate(const char *codepageData, |
| int32_t dataLength, |
| UConverter *converter, |
| UErrorCode &status) |
| { |
| if(U_FAILURE(status)) { |
| return; |
| } |
| |
| // set up the conversion parameters |
| const char *mySource = codepageData; |
| const char *mySourceEnd = mySource + dataLength; |
| UChar *array, *myTarget; |
| |
| // estimate the size needed: |
| int32_t arraySize; |
| if(dataLength <= US_STACKBUF_SIZE) { |
| // try to use the stack buffer |
| arraySize = US_STACKBUF_SIZE; |
| } else { |
| // 1.25 UChar's per source byte should cover most cases |
| arraySize = dataLength + (dataLength >> 2); |
| } |
| |
| // we do not care about the current contents |
| UBool doCopyArray = FALSE; |
| for(;;) { |
| if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { |
| setToBogus(); |
| break; |
| } |
| |
| // perform the conversion |
| array = getArrayStart(); |
| myTarget = array + length(); |
| ucnv_toUnicode(converter, &myTarget, array + getCapacity(), |
| &mySource, mySourceEnd, 0, TRUE, &status); |
| |
| // update the conversion parameters |
| setLength((int32_t)(myTarget - array)); |
| |
| // allocate more space and copy data, if needed |
| if(status == U_BUFFER_OVERFLOW_ERROR) { |
| // reset the error code |
| status = U_ZERO_ERROR; |
| |
| // keep the previous conversion results |
| doCopyArray = TRUE; |
| |
| // estimate the new size needed, larger than before |
| // try 2 UChar's per remaining source byte |
| arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); |
| } else { |
| break; |
| } |
| } |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif |