| /* |
| ********************************************************************** |
| * Copyright (C) 2005-2015, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| |
| #include "unicode/utypes.h" |
| #include "unicode/ucsdet.h" |
| #include "unicode/ucnv.h" |
| #include "unicode/unistr.h" |
| #include "unicode/putil.h" |
| #include "unicode/uniset.h" |
| |
| #include "intltest.h" |
| #include "csdetest.h" |
| |
| #include "xmlparser.h" |
| |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #ifdef DEBUG_DETECT |
| #include <stdio.h> |
| #endif |
| |
| #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) |
| |
| #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) |
| #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array)) |
| |
| #define CH_SPACE 0x0020 |
| #define CH_SLASH 0x002F |
| |
| #define TEST_ASSERT(x) {if (!(x)) { \ |
| errln("Failure in file %s, line %d", __FILE__, __LINE__);}} |
| |
| #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ |
| errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\ |
| return;}} |
| |
| |
| //--------------------------------------------------------------------------- |
| // |
| // Test class boilerplate |
| // |
| //--------------------------------------------------------------------------- |
| CharsetDetectionTest::CharsetDetectionTest() |
| { |
| } |
| |
| |
| CharsetDetectionTest::~CharsetDetectionTest() |
| { |
| } |
| |
| |
| |
| void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) |
| { |
| if (exec) logln("TestSuite CharsetDetectionTest: "); |
| switch (index) { |
| case 0: name = "ConstructionTest"; |
| if (exec) ConstructionTest(); |
| break; |
| |
| case 1: name = "UTF8Test"; |
| if (exec) UTF8Test(); |
| break; |
| |
| case 2: name = "UTF16Test"; |
| if (exec) UTF16Test(); |
| break; |
| |
| case 3: name = "C1BytesTest"; |
| if (exec) C1BytesTest(); |
| break; |
| |
| case 4: name = "InputFilterTest"; |
| if (exec) InputFilterTest(); |
| break; |
| |
| case 5: name = "DetectionTest"; |
| if (exec) DetectionTest(); |
| break; |
| #if !UCONFIG_NO_LEGACY_CONVERSION |
| case 6: name = "IBM424Test"; |
| if (exec) IBM424Test(); |
| break; |
| |
| case 7: name = "IBM420Test"; |
| if (exec) IBM420Test(); |
| break; |
| #else |
| case 6: |
| case 7: name = "skip"; break; |
| #endif |
| case 8: name = "Ticket6394Test"; |
| if (exec) Ticket6394Test(); |
| break; |
| |
| case 9: name = "Ticket6954Test"; |
| if (exec) Ticket6954Test(); |
| break; |
| |
| default: name = ""; |
| break; //needed to end loop |
| } |
| } |
| |
| static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) |
| { |
| int32_t offset = -1; |
| |
| splits = 1; |
| while((offset = src.indexOf(ch, offset + 1)) >= 0) { |
| splits += 1; |
| } |
| |
| UnicodeString *result = new UnicodeString[splits]; |
| |
| int32_t start = 0; |
| int32_t split = 0; |
| int32_t end; |
| |
| while((end = src.indexOf(ch, start)) >= 0) { |
| src.extractBetween(start, end, result[split++]); |
| start = end + 1; |
| } |
| |
| src.extractBetween(start, src.length(), result[split]); |
| |
| return result; |
| } |
| |
| static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) |
| { |
| int32_t sLength = source.length(); |
| char *bytes = NULL; |
| |
| length = source.extract(0, sLength, NULL, codepage); |
| |
| if (length > 0) { |
| bytes = NEW_ARRAY(char, length + 1); |
| source.extract(0, sLength, bytes, codepage); |
| } |
| |
| return bytes; |
| } |
| |
| static void freeBytes(char *bytes) |
| { |
| DELETE_ARRAY(bytes); |
| } |
| |
| void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) |
| { |
| int32_t splits = 0; |
| int32_t testLength = testString.length(); |
| UnicodeString *eSplit = split(encoding, CH_SLASH, splits); |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t cpLength = eSplit[0].length(); |
| char codepage[64]; |
| |
| u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); |
| codepage[cpLength] = '\0'; |
| |
| LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); |
| |
| int32_t byteLength = 0; |
| char *bytes = extractBytes(testString, codepage, byteLength); |
| |
| if (bytes == NULL) { |
| #if !UCONFIG_NO_LEGACY_CONVERSION |
| dataerrln("Can't open a " + encoding + " converter for " + id); |
| #endif |
| return; |
| } |
| |
| ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); |
| |
| int32_t matchCount = 0; |
| const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status); |
| |
| |
| UnicodeString name(ucsdet_getName(matches[0], &status)); |
| UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); |
| UChar *decoded = NULL; |
| int32_t dLength = 0; |
| |
| if (matchCount == 0) { |
| errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); |
| goto bail; |
| } |
| |
| if (name.compare(eSplit[0]) != 0) { |
| errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); |
| |
| #ifdef DEBUG_DETECT |
| for (int32_t m = 0; m < matchCount; m += 1) { |
| const char *name = ucsdet_getName(matches[m], &status); |
| const char *lang = ucsdet_getLanguage(matches[m], &status); |
| int32_t confidence = ucsdet_getConfidence(matches[m], &status); |
| |
| printf("%s (%s) %d\n", name, lang, confidence); |
| } |
| #endif |
| goto bail; |
| } |
| |
| if (splits > 1 && lang.compare(eSplit[1]) != 0) { |
| errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); |
| goto bail; |
| } |
| |
| decoded = NEW_ARRAY(UChar, testLength); |
| dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); |
| |
| if (testString.compare(decoded, dLength) != 0) { |
| errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); |
| |
| #ifdef DEBUG_DETECT |
| for(int32_t i = 0; i < testLength; i += 1) { |
| if(testString[i] != decoded[i]) { |
| printf("Strings differ at byte %d\n", i); |
| break; |
| } |
| } |
| #endif |
| |
| } |
| |
| DELETE_ARRAY(decoded); |
| |
| bail: |
| freeBytes(bytes); |
| delete[] eSplit; |
| } |
| |
| const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { |
| UErrorCode status = U_ZERO_ERROR; |
| const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| |
| if (U_FAILURE(status)) { |
| errln("ERROR: getPath() failed - %s", u_errorName(status)); |
| return NULL; |
| } |
| |
| strcpy(buffer, testDataDirectory); |
| strcat(buffer, filename); |
| return buffer; |
| } |
| |
| void CharsetDetectionTest::ConstructionTest() |
| { |
| IcuTestErrorCode status(*this, "ConstructionTest"); |
| LocalUCharsetDetectorPointer csd(ucsdet_open(status)); |
| LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status)); |
| int32_t count = uenum_count(e.getAlias(), status); |
| |
| #ifdef DEBUG_DETECT |
| printf("There are %d recognizers.\n", count); |
| #endif |
| |
| for(int32_t i = 0; i < count; i += 1) { |
| int32_t length; |
| const char *name = uenum_next(e.getAlias(), &length, status); |
| |
| if(name == NULL || length <= 0) { |
| errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); |
| } |
| |
| #ifdef DEBUG_DETECT |
| printf("%s\n", name); |
| #endif |
| } |
| |
| const char* defDisabled[] = { |
| "IBM420_rtl", "IBM420_ltr", |
| "IBM424_rtl", "IBM424_ltr", |
| 0 |
| }; |
| |
| LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status)); |
| const char *activeName = NULL; |
| |
| while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) { |
| // the charset must be included in all list |
| UBool found = FALSE; |
| |
| const char *name = NULL; |
| uenum_reset(e.getAlias(), status); |
| while ((name = uenum_next(e.getAlias(), NULL, status))) { |
| if (strcmp(activeName, name) == 0) { |
| found = TRUE; |
| break; |
| } |
| } |
| |
| if (!found) { |
| errln(UnicodeString(activeName) + " is not included in the all charset list."); |
| } |
| |
| // some charsets are disabled by default |
| found = FALSE; |
| for (int32_t i = 0; defDisabled[i] != 0; i++) { |
| if (strcmp(activeName, defDisabled[i]) == 0) { |
| found = TRUE; |
| break; |
| } |
| } |
| if (found) { |
| errln(UnicodeString(activeName) + " should not be included in the default charset list."); |
| } |
| } |
| } |
| |
| void CharsetDetectionTest::UTF8Test() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString ss = "This is a string with some non-ascii characters that will " |
| "be converted to UTF-8, then shoved through the detection process. " |
| "\\u0391\\u0392\\u0393\\u0394\\u0395" |
| "Sure would be nice if our source could contain Unicode directly!"; |
| UnicodeString s = ss.unescape(); |
| int32_t byteLength = 0, sLength = s.length(); |
| char *bytes = extractBytes(s, "UTF-8", byteLength); |
| UCharsetDetector *csd = ucsdet_open(&status); |
| const UCharsetMatch *match; |
| UChar *detected = NEW_ARRAY(UChar, sLength); |
| |
| ucsdet_setText(csd, bytes, byteLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Detection failure for UTF-8: got no matches."); |
| goto bail; |
| } |
| |
| ucsdet_getUChars(match, detected, sLength, &status); |
| |
| if (s.compare(detected, sLength) != 0) { |
| errln("Round-trip test failed!"); |
| } |
| |
| ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ |
| |
| bail: |
| DELETE_ARRAY(detected); |
| freeBytes(bytes); |
| ucsdet_close(csd); |
| } |
| |
| void CharsetDetectionTest::UTF16Test() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| /* Notice the BOM on the start of this string */ |
| UChar chars[] = { |
| 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, |
| 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, |
| 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, |
| 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, |
| 0x064a, 0x062a, 0x0000}; |
| UnicodeString s(chars); |
| int32_t beLength = 0, leLength = 0; |
| char *beBytes = extractBytes(s, "UTF-16BE", beLength); |
| char *leBytes = extractBytes(s, "UTF-16LE", leLength); |
| UCharsetDetector *csd = ucsdet_open(&status); |
| const UCharsetMatch *match; |
| const char *name; |
| int32_t conf; |
| |
| ucsdet_setText(csd, beBytes, beLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Encoding detection failure for UTF-16BE: got no matches."); |
| goto try_le; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| conf = ucsdet_getConfidence(match, &status); |
| |
| if (strcmp(name, "UTF-16BE") != 0) { |
| errln("Encoding detection failure for UTF-16BE: got %s", name); |
| goto try_le; // no point in looking at confidence if we got the wrong character set. |
| } |
| |
| if (conf != 100) { |
| errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); |
| } |
| |
| try_le: |
| ucsdet_setText(csd, leBytes, leLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Encoding detection failure for UTF-16LE: got no matches."); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| conf = ucsdet_getConfidence(match, &status); |
| |
| |
| if (strcmp(name, "UTF-16LE") != 0) { |
| errln("Enconding detection failure for UTF-16LE: got %s", name); |
| goto bail; // no point in looking at confidence if we got the wrong character set. |
| } |
| |
| if (conf != 100) { |
| errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); |
| } |
| |
| bail: |
| freeBytes(leBytes); |
| freeBytes(beBytes); |
| ucsdet_close(csd); |
| } |
| |
| void CharsetDetectionTest::InputFilterTest() |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; |
| UnicodeString s = ss.unescape(); |
| int32_t byteLength = 0; |
| char *bytes = extractBytes(s, "ISO-8859-1", byteLength); |
| UCharsetDetector *csd = ucsdet_open(&status); |
| const UCharsetMatch *match; |
| const char *lang, *name; |
| |
| ucsdet_enableInputFilter(csd, TRUE); |
| |
| if (!ucsdet_isInputFilterEnabled(csd)) { |
| errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); |
| } |
| |
| |
| ucsdet_setText(csd, bytes, byteLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Turning on the input filter resulted in no matches."); |
| goto turn_off; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| |
| if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); |
| } else { |
| lang = ucsdet_getLanguage(match, &status); |
| |
| if (lang == NULL || strcmp(lang, "fr") != 0) { |
| errln("Input filter did not strip markup!"); |
| } |
| } |
| |
| turn_off: |
| ucsdet_enableInputFilter(csd, FALSE); |
| ucsdet_setText(csd, bytes, byteLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Turning off the input filter resulted in no matches."); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| |
| if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { |
| errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); |
| } else { |
| lang = ucsdet_getLanguage(match, &status); |
| |
| if (lang == NULL || strcmp(lang, "en") != 0) { |
| errln("Unfiltered input did not detect as English!"); |
| } |
| } |
| |
| bail: |
| freeBytes(bytes); |
| ucsdet_close(csd); |
| } |
| |
| void CharsetDetectionTest::C1BytesTest() |
| { |
| #if !UCONFIG_NO_LEGACY_CONVERSION |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; |
| UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); |
| UnicodeString sWindows = ssWindows.unescape(); |
| int32_t lISO = 0, lWindows = 0; |
| char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); |
| char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); |
| UCharsetDetector *csd = ucsdet_open(&status); |
| const UCharsetMatch *match; |
| const char *name; |
| |
| ucsdet_setText(csd, bWindows, lWindows, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| |
| if (strcmp(name, "windows-1252") != 0) { |
| errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); |
| } |
| |
| ucsdet_setText(csd, bISO, lISO, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("English text without C1 bytes got no matches."); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| |
| if (strcmp(name, "ISO-8859-1") != 0) { |
| errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); |
| } |
| |
| bail: |
| freeBytes(bWindows); |
| freeBytes(bISO); |
| |
| ucsdet_close(csd); |
| #endif |
| } |
| |
| void CharsetDetectionTest::DetectionTest() |
| { |
| #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| UErrorCode status = U_ZERO_ERROR; |
| char path[2048]; |
| const char *testFilePath = getPath(path, "csdetest.xml"); |
| |
| if (testFilePath == NULL) { |
| return; /* Couldn't get path: error message already output. */ |
| } |
| |
| UXMLParser *parser = UXMLParser::createParser(status); |
| if (U_FAILURE(status)) { |
| dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); |
| return; |
| } |
| |
| UXMLElement *root = parser->parseFile(testFilePath, status); |
| if (!assertSuccess( "parseFile",status)) return; |
| |
| UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); |
| UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); |
| UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); |
| |
| const UXMLElement *testCase; |
| int32_t tc = 0; |
| |
| while((testCase = root->nextChildElement(tc)) != NULL) { |
| if (testCase->getTagName().compare(test_case) == 0) { |
| const UnicodeString *id = testCase->getAttribute(id_attr); |
| const UnicodeString *encodings = testCase->getAttribute(enc_attr); |
| const UnicodeString text = testCase->getText(TRUE); |
| int32_t encodingCount; |
| UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); |
| |
| for(int32_t e = 0; e < encodingCount; e += 1) { |
| checkEncoding(text, encodingList[e], *id); |
| } |
| |
| delete[] encodingList; |
| } |
| } |
| |
| delete root; |
| delete parser; |
| #endif |
| } |
| |
| void CharsetDetectionTest::IBM424Test() |
| { |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| UErrorCode status = U_ZERO_ERROR; |
| |
| static const UChar chars[] = { |
| 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, |
| 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, |
| 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, |
| 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, |
| 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, |
| 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, |
| 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, |
| 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, |
| 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, |
| 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, |
| 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, |
| 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, |
| 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, |
| 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, |
| 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, |
| 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, |
| 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 |
| }; |
| |
| static const UChar chars_reverse[] = { |
| 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, |
| 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, |
| 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, |
| 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, |
| 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, |
| 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, |
| 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, |
| 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, |
| 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, |
| 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, |
| 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, |
| 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, |
| 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, |
| 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, |
| 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, |
| 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, |
| 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, |
| 0x0000 |
| }; |
| |
| int32_t bLength = 0, brLength = 0; |
| |
| UnicodeString s1(chars); |
| UnicodeString s2(chars_reverse); |
| |
| char *bytes = extractBytes(s1, "IBM424", bLength); |
| char *bytes_r = extractBytes(s2, "IBM424", brLength); |
| |
| UCharsetDetector *csd = ucsdet_open(&status); |
| ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); |
| if (U_FAILURE(status)) { |
| errln("Error opening charset detector. - %s", u_errorName(status)); |
| } |
| const UCharsetMatch *match; |
| const char *name; |
| |
| ucsdet_setText(csd, bytes, bLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| if (strcmp(name, "IBM424_rtl") != 0) { |
| errln("Encoding detection failure for IBM424_rtl: got %s", name); |
| } |
| |
| ucsdet_setText(csd, bytes_r, brLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Encoding detection failure for IBM424_ltr: got no matches."); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| if (strcmp(name, "IBM424_ltr") != 0) { |
| errln("Encoding detection failure for IBM424_ltr: got %s", name); |
| } |
| |
| bail: |
| freeBytes(bytes); |
| freeBytes(bytes_r); |
| ucsdet_close(csd); |
| #endif |
| } |
| |
| void CharsetDetectionTest::IBM420Test() |
| { |
| #if !UCONFIG_ONLY_HTML_CONVERSION |
| UErrorCode status = U_ZERO_ERROR; |
| |
| static const UChar chars[] = { |
| 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, |
| 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, |
| 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, |
| 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, |
| 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, |
| 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, |
| 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, |
| 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, |
| 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, |
| 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, |
| 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, |
| 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, |
| 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, |
| 0x0000 |
| }; |
| static const UChar chars_reverse[] = { |
| 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, |
| 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, |
| 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, |
| 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, |
| 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, |
| 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, |
| 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, |
| 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, |
| 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, |
| 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, |
| 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, |
| 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, |
| 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, |
| 0x0000, |
| }; |
| |
| int32_t bLength = 0, brLength = 0; |
| |
| UnicodeString s1(chars); |
| UnicodeString s2(chars_reverse); |
| |
| char *bytes = extractBytes(s1, "IBM420", bLength); |
| char *bytes_r = extractBytes(s2, "IBM420", brLength); |
| |
| UCharsetDetector *csd = ucsdet_open(&status); |
| if (U_FAILURE(status)) { |
| errln("Error opening charset detector. - %s", u_errorName(status)); |
| } |
| ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status); |
| ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status); |
| const UCharsetMatch *match; |
| const char *name; |
| |
| ucsdet_setText(csd, bytes, bLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| if (strcmp(name, "IBM420_rtl") != 0) { |
| errln("Encoding detection failure for IBM420_rtl: got %s\n", name); |
| } |
| |
| ucsdet_setText(csd, bytes_r, brLength, &status); |
| match = ucsdet_detect(csd, &status); |
| |
| if (match == NULL) { |
| errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); |
| goto bail; |
| } |
| |
| name = ucsdet_getName(match, &status); |
| if (strcmp(name, "IBM420_ltr") != 0) { |
| errln("Encoding detection failure for IBM420_ltr: got %s\n", name); |
| } |
| |
| bail: |
| freeBytes(bytes); |
| freeBytes(bytes_r); |
| ucsdet_close(csd); |
| #endif |
| } |
| |
| |
| void CharsetDetectionTest::Ticket6394Test() { |
| #if !UCONFIG_NO_CONVERSION |
| const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." |
| "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " |
| "encodings more than once. The hop through UnicodeString is for platforms " |
| "where this char * string is be EBCDIC and needs conversion to Latin1."; |
| char latin1Text[sizeof(charText)]; |
| UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| UCharsetDetector *csd = ucsdet_open(&status); |
| ucsdet_setText(csd, latin1Text, -1, &status); |
| if (U_FAILURE(status)) { |
| errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); |
| return; |
| } |
| |
| int32_t matchCount = 0; |
| const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); |
| if (U_FAILURE(status)) { |
| errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); |
| return; |
| } |
| |
| UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. |
| int32_t i; |
| for (i=0; i<matchCount; i++) { |
| UnicodeString charSetName(ucsdet_getName(matches[i], &status)); |
| if (U_FAILURE(status)) { |
| errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); |
| status = U_ZERO_ERROR; |
| } |
| if (setOfCharsetNames.contains(charSetName)) { |
| errln("Fail at file %s, line %d ", __FILE__, __LINE__); |
| errln(UnicodeString(" Duplicate charset name = ") + charSetName); |
| } |
| setOfCharsetNames.add(charSetName); |
| } |
| ucsdet_close(csd); |
| #endif |
| } |
| |
| |
| // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between |
| // similar Windows and non-Windows SBCS encodings. State was kept in the shared |
| // Charset Recognizer objects, and could be overwritten. |
| void CharsetDetectionTest::Ticket6954Test() { |
| #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING |
| UErrorCode status = U_ZERO_ERROR; |
| UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; |
| UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly." |
| "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); |
| UnicodeString sWindows = ssWindows.unescape(); |
| int32_t lISO = 0, lWindows = 0; |
| char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); |
| char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); |
| |
| // First do a plain vanilla detect of 1252 text |
| |
| UCharsetDetector *csd1 = ucsdet_open(&status); |
| ucsdet_setText(csd1, bWindows, lWindows, &status); |
| const UCharsetMatch *match1 = ucsdet_detect(csd1, &status); |
| const char *name1 = ucsdet_getName(match1, &status); |
| TEST_ASSERT_SUCCESS(status); |
| TEST_ASSERT(strcmp(name1, "windows-1252")==0); |
| |
| // Next, using a completely separate detector, detect some 8859-1 text |
| |
| UCharsetDetector *csd2 = ucsdet_open(&status); |
| ucsdet_setText(csd2, bISO, lISO, &status); |
| const UCharsetMatch *match2 = ucsdet_detect(csd2, &status); |
| const char *name2 = ucsdet_getName(match2, &status); |
| TEST_ASSERT_SUCCESS(status); |
| TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0); |
| |
| // Recheck the 1252 results from the first detector, which should not have been |
| // altered by the use of a different detector. |
| |
| name1 = ucsdet_getName(match1, &status); |
| TEST_ASSERT_SUCCESS(status); |
| TEST_ASSERT(strcmp(name1, "windows-1252")==0); |
| |
| ucsdet_close(csd1); |
| ucsdet_close(csd2); |
| freeBytes(bISO); |
| freeBytes(bWindows); |
| #endif |
| } |