| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (C) 2005-2016, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_CONVERSION |
| #if defined(STARBOARD) |
| #include "starboard/client_porting/poem/string_poem.h" |
| #endif // defined(STARBOARD) |
| |
| #include "unicode/ucsdet.h" |
| |
| #include "csdetect.h" |
| #include "csmatch.h" |
| #include "uenumimp.h" |
| |
| #include "cmemory.h" |
| #include "cstring.h" |
| #include "umutex.h" |
| #include "ucln_in.h" |
| #include "uarrsort.h" |
| #include "inputext.h" |
| #include "csrsbcs.h" |
| #include "csrmbcs.h" |
| #include "csrutf8.h" |
| #include "csrucode.h" |
| #include "csr2022.h" |
| |
// Obtain uninitialized storage for `count` objects of `type` through uprv_malloc.
// No constructors are run; the result is whatever uprv_malloc returns, cast to `type *`.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc(sizeof(type) * (count)))
// Hand storage back through uprv_free.
#define DELETE_ARRAY(array) (uprv_free((void *) (array)))
| |
| U_NAMESPACE_BEGIN |
| |
| struct CSRecognizerInfo : public UMemory { |
| CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) |
| : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {} |
| |
| ~CSRecognizerInfo() {delete recognizer;} |
| |
| CharsetRecognizer *recognizer; |
| UBool isDefaultEnabled; |
| }; |
| |
| U_NAMESPACE_END |
| |
| static icu::CSRecognizerInfo **fCSRecognizers = NULL; |
| static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER; |
| static int32_t fCSRecognizers_size = 0; |
| |
| U_CDECL_BEGIN |
| static UBool U_CALLCONV csdet_cleanup(void) |
| { |
| U_NAMESPACE_USE |
| if (fCSRecognizers != NULL) { |
| for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { |
| delete fCSRecognizers[r]; |
| fCSRecognizers[r] = NULL; |
| } |
| |
| DELETE_ARRAY(fCSRecognizers); |
| fCSRecognizers = NULL; |
| fCSRecognizers_size = 0; |
| } |
| gCSRecognizersInitOnce.reset(); |
| |
| return TRUE; |
| } |
| |
| static int32_t U_CALLCONV |
| charsetMatchComparator(const void * /*context*/, const void *left, const void *right) |
| { |
| U_NAMESPACE_USE |
| |
| const CharsetMatch **csm_l = (const CharsetMatch **) left; |
| const CharsetMatch **csm_r = (const CharsetMatch **) right; |
| |
| // NOTE: compare is backwards to sort from highest to lowest. |
| return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); |
| } |
| |
| static void U_CALLCONV initRecognizers(UErrorCode &status) { |
| U_NAMESPACE_USE |
| ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); |
| CSRecognizerInfo *tempArray[] = { |
| new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), |
| |
| new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), |
| |
| new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), |
| |
| new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), |
| new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), |
| |
| new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), |
| new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), |
| new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), |
| new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) |
| #endif |
| }; |
| int32_t rCount = UPRV_LENGTHOF(tempArray); |
| |
| fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); |
| |
| if (fCSRecognizers == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| else { |
| fCSRecognizers_size = rCount; |
| for (int32_t r = 0; r < rCount; r += 1) { |
| fCSRecognizers[r] = tempArray[r]; |
| if (fCSRecognizers[r] == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| } |
| } |
| |
| U_CDECL_END |
| |
| U_NAMESPACE_BEGIN |
| |
| void CharsetDetector::setRecognizers(UErrorCode &status) |
| { |
| umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); |
| } |
| |
| CharsetDetector::CharsetDetector(UErrorCode &status) |
| : textIn(new InputText(status)), resultArray(NULL), |
| resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), |
| fEnabledRecognizers(NULL) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| setRecognizers(status); |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); |
| |
| if (resultArray == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
| resultArray[i] = new CharsetMatch(); |
| |
| if (resultArray[i] == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| break; |
| } |
| } |
| } |
| |
| CharsetDetector::~CharsetDetector() |
| { |
| delete textIn; |
| |
| for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
| delete resultArray[i]; |
| } |
| |
| uprv_free(resultArray); |
| |
| if (fEnabledRecognizers) { |
| uprv_free(fEnabledRecognizers); |
| } |
| } |
| |
| void CharsetDetector::setText(const char *in, int32_t len) |
| { |
| textIn->setText(in, len); |
| fFreshTextSet = TRUE; |
| } |
| |
| UBool CharsetDetector::setStripTagsFlag(UBool flag) |
| { |
| UBool temp = fStripTags; |
| fStripTags = flag; |
| fFreshTextSet = TRUE; |
| return temp; |
| } |
| |
| UBool CharsetDetector::getStripTagsFlag() const |
| { |
| return fStripTags; |
| } |
| |
| void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const |
| { |
| textIn->setDeclaredEncoding(encoding,len); |
| } |
| |
| int32_t CharsetDetector::getDetectableCount() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| |
| setRecognizers(status); |
| |
| return fCSRecognizers_size; |
| } |
| |
| const CharsetMatch *CharsetDetector::detect(UErrorCode &status) |
| { |
| int32_t maxMatchesFound = 0; |
| |
| detectAll(maxMatchesFound, status); |
| |
| if(maxMatchesFound > 0) { |
| return resultArray[0]; |
| } else { |
| return NULL; |
| } |
| } |
| |
| const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) |
| { |
| if(!textIn->isSet()) { |
| status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set |
| |
| return NULL; |
| } else if (fFreshTextSet) { |
| CharsetRecognizer *csr; |
| int32_t i; |
| |
| textIn->MungeInput(fStripTags); |
| |
| // Iterate over all possible charsets, remember all that |
| // give a match quality > 0. |
| resultCount = 0; |
| for (i = 0; i < fCSRecognizers_size; i += 1) { |
| csr = fCSRecognizers[i]->recognizer; |
| if (csr->match(textIn, resultArray[resultCount])) { |
| resultCount++; |
| } |
| } |
| |
| if (resultCount > 1) { |
| uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); |
| } |
| fFreshTextSet = FALSE; |
| } |
| |
| maxMatchesFound = resultCount; |
| |
| return resultArray; |
| } |
| |
| void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) |
| { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| int32_t modIdx = -1; |
| UBool isDefaultVal = FALSE; |
| for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
| CSRecognizerInfo *csrinfo = fCSRecognizers[i]; |
| if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { |
| modIdx = i; |
| isDefaultVal = (csrinfo->isDefaultEnabled == enabled); |
| break; |
| } |
| } |
| if (modIdx < 0) { |
| // No matching encoding found |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| |
| if (fEnabledRecognizers == NULL && !isDefaultVal) { |
| // Create an array storing the non default setting |
| fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); |
| if (fEnabledRecognizers == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| // Initialize the array with default info |
| for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
| fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; |
| } |
| } |
| |
| if (fEnabledRecognizers != NULL) { |
| fEnabledRecognizers[modIdx] = enabled; |
| } |
| } |
| |
| /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const |
| { |
| if( index > fCSRecognizers_size-1 || index < 0) { |
| status = U_INDEX_OUTOFBOUNDS_ERROR; |
| |
| return 0; |
| } else { |
| return fCSRecognizers[index]->getName(); |
| } |
| }*/ |
| |
| U_NAMESPACE_END |
| |
| U_CDECL_BEGIN |
| typedef struct { |
| int32_t currIndex; |
| UBool all; |
| UBool *enabledRecognizers; |
| } Context; |
| |
| |
| |
| static void U_CALLCONV |
| enumClose(UEnumeration *en) { |
| if(en->context != NULL) { |
| DELETE_ARRAY(en->context); |
| } |
| |
| DELETE_ARRAY(en); |
| } |
| |
| static int32_t U_CALLCONV |
| enumCount(UEnumeration *en, UErrorCode *) { |
| if (((Context *)en->context)->all) { |
| // ucsdet_getAllDetectableCharsets, all charset detector names |
| return fCSRecognizers_size; |
| } |
| |
| // Otherwise, ucsdet_getDetectableCharsets - only enabled ones |
| int32_t count = 0; |
| UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; |
| if (enabledArray != NULL) { |
| // custom set |
| for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
| if (enabledArray[i]) { |
| count++; |
| } |
| } |
| } else { |
| // default set |
| for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
| if (fCSRecognizers[i]->isDefaultEnabled) { |
| count++; |
| } |
| } |
| } |
| return count; |
| } |
| |
| static const char* U_CALLCONV |
| enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { |
| const char *currName = NULL; |
| |
| if (((Context *)en->context)->currIndex < fCSRecognizers_size) { |
| if (((Context *)en->context)->all) { |
| // ucsdet_getAllDetectableCharsets, all charset detector names |
| currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
| ((Context *)en->context)->currIndex++; |
| } else { |
| // ucsdet_getDetectableCharsets |
| UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; |
| if (enabledArray != NULL) { |
| // custome set |
| while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { |
| if (enabledArray[((Context *)en->context)->currIndex]) { |
| currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
| } |
| ((Context *)en->context)->currIndex++; |
| } |
| } else { |
| // default set |
| while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { |
| if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { |
| currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
| } |
| ((Context *)en->context)->currIndex++; |
| } |
| } |
| } |
| } |
| |
| if(resultLength != NULL) { |
| *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); |
| } |
| |
| return currName; |
| } |
| |
| |
| static void U_CALLCONV |
| enumReset(UEnumeration *en, UErrorCode *) { |
| ((Context *)en->context)->currIndex = 0; |
| } |
| |
| static const UEnumeration gCSDetEnumeration = { |
| NULL, |
| NULL, |
| enumClose, |
| enumCount, |
| uenum_unextDefault, |
| enumNext, |
| enumReset |
| }; |
| |
| U_CDECL_END |
| |
| U_NAMESPACE_BEGIN |
| |
| UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) |
| { |
| |
| /* Initialize recognized charsets. */ |
| setRecognizers(status); |
| |
| if(U_FAILURE(status)) { |
| return 0; |
| } |
| |
| UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
| if (en == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
| en->context = (void*)NEW_ARRAY(Context, 1); |
| if (en->context == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| DELETE_ARRAY(en); |
| return 0; |
| } |
| uprv_memset(en->context, 0, sizeof(Context)); |
| ((Context*)en->context)->all = TRUE; |
| return en; |
| } |
| |
| UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const |
| { |
| if(U_FAILURE(status)) { |
| return 0; |
| } |
| |
| UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
| if (en == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return 0; |
| } |
| memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
| en->context = (void*)NEW_ARRAY(Context, 1); |
| if (en->context == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| DELETE_ARRAY(en); |
| return 0; |
| } |
| uprv_memset(en->context, 0, sizeof(Context)); |
| ((Context*)en->context)->all = FALSE; |
| ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; |
| return en; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif |