| --- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800 |
| +++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800 |
| @@ -226,6 +226,30 @@ |
| case USCRIPT_THAI: |
| engine = new ThaiBreakEngine(dict, status); |
| break; |
| + |
| + case USCRIPT_HANGUL: |
| + engine = new CjkBreakEngine(dict, kKorean, status); |
| + break; |
| + |
| + // use same BreakEngine and dictionary for both Chinese and Japanese |
| + case USCRIPT_HIRAGANA: |
| + case USCRIPT_KATAKANA: |
| + case USCRIPT_HAN: |
| + engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
| + break; |
| +#if 0 |
| + // TODO: Have to get some characters with script=common handled |
| + // by CjkBreakEngine (e.g. U+309B). Simply subjecting |
| + // them to CjkBreakEngine does not work. The engine has to |
| + // special-case them. |
| + case USCRIPT_COMMON: |
| + { |
| + UBlockCode block = ublock_getCode(code); |
| + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
| + engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
| + break; |
| + } |
| +#endif |
| default: |
| break; |
| } |
| @@ -281,6 +305,13 @@ |
| dict = NULL; |
| } |
| return dict; |
| + } else if (dictfname != NULL){ |
| + // create a dummy dictionary if the dictionary filename is not valid |
| + UChar c = 0x0020; |
| + status = U_ZERO_ERROR; |
| + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE); |
| + mtd->addWord(&c, 1, status, 1); |
| + return new CompactTrieDictionary(*mtd, status); |
| } |
| return NULL; |
| } |
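Context note (not part of the patch): callers never construct a CjkBreakEngine directly; the factory above wires it into ordinary word break iterators. A minimal sketch of how the new behaviour is exercised through ICU's public API, assuming a normal ICU build with the CJK dictionary data present:

```cpp
// Illustrative only: iterates word boundaries in Japanese text. For Han/Kana
// runs the boundaries come from the dictionary-based CjkBreakEngine that the
// factory above registers; other scripts fall back to the rule-based breaks.
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#include <cstdio>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    icu::BreakIterator *bi =
        icu::BreakIterator::createWordInstance(icu::Locale::getJapanese(), status);
    if (U_FAILURE(status)) return 1;

    icu::UnicodeString text = icu::UnicodeString::fromUTF8("今日は良い天気です。");
    bi->setText(text);

    // Print each word boundary position found by the break iterator.
    for (int32_t p = bi->first(); p != icu::BreakIterator::DONE; p = bi->next()) {
        printf("boundary at %d\n", (int)p);
    }
    delete bi;
    return 0;
}
```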
| --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 |
| +++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800 |
| @@ -16,6 +16,9 @@ |
| #include "unicode/ubrk.h" |
| #include "uvector.h" |
| #include "triedict.h" |
| +#include "uassert.h" |
| +#include "unicode/normlzr.h" |
| +#include "cmemory.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| @@ -422,6 +425,294 @@ |
| return wordsFound; |
| } |
| |
| +/* |
| + ****************************************************************** |
| + * CjkBreakEngine |
| + */ |
| +static const uint32_t kuint32max = 0xFFFFFFFF; |
| +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status) |
| +: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ |
| + if (!adoptDictionary->getValued()) { |
| + status = U_ILLEGAL_ARGUMENT_ERROR; |
| + return; |
| + } |
| + |
| + // Korean dictionary only includes Hangul syllables |
| + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); |
| + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); |
| + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); |
| + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); |
| + |
| + if (U_SUCCESS(status)) { |
| + // handle Korean and Japanese/Chinese using different dictionaries |
| + if (type == kKorean) { |
| + setCharacters(fHangulWordSet); |
| + } else { //Chinese and Japanese |
| + UnicodeSet cjSet; |
| + cjSet.addAll(fHanWordSet); |
| + cjSet.addAll(fKatakanaWordSet); |
| + cjSet.addAll(fHiraganaWordSet); |
| + cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); |
| + setCharacters(cjSet); |
| + } |
| + } |
| +} |
| + |
| +CjkBreakEngine::~CjkBreakEngine(){ |
| + delete fDictionary; |
| +} |
| + |
| +// The katakanaCost values below are based on the length frequencies of all |
| +// katakana phrases in the dictionary |
| +static const int kMaxKatakanaLength = 8; |
| +static const int kMaxKatakanaGroupLength = 20; |
| +static const uint32_t maxSnlp = 255; |
| + |
| +static inline uint32_t getKatakanaCost(int wordLength){ |
| + //TODO: fill array with actual values from dictionary! |
| + static const uint32_t katakanaCost[kMaxKatakanaLength + 1] |
| + = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; |
| + return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; |
| +} |
| + |
| +static inline bool isKatakana(uint16_t value) { |
| + return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || |
| + (value >= 0xFF66u && value <= 0xFF9fu); |
| +} |
| + |
| +// A very simple helper class to streamline the buffer handling in |
| +// divideUpDictionaryRange. |
| +template<class T, size_t N> |
| +class AutoBuffer { |
| + public: |
| + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { |
| + if (size > N) { |
| + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| + capacity = size; |
| + } |
| + } |
| + ~AutoBuffer() { |
| + if (buffer != stackBuffer) |
| + uprv_free(buffer); |
| + } |
| +#if 0 |
| + T* operator& () { |
| + return buffer; |
| + } |
| +#endif |
| + T* elems() { |
| + return buffer; |
| + } |
| + const T& operator[] (size_t i) const { |
| + return buffer[i]; |
| + } |
| + T& operator[] (size_t i) { |
| + return buffer[i]; |
| + } |
| + |
| + // resize without copy |
| + void resize(size_t size) { |
| + if (size <= capacity) |
| + return; |
| + if (buffer != stackBuffer) |
| + uprv_free(buffer); |
| + buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| + capacity = size; |
| + } |
| + private: |
| + T stackBuffer[N]; |
| + T* buffer; |
| + AutoBuffer(); |
| + size_t capacity; |
| +}; |
| + |
| + |
| +/* |
| + * @param text A UText representing the text |
| + * @param rangeStart The start of the range of dictionary characters |
| + * @param rangeEnd The end of the range of dictionary characters |
| + * @param foundBreaks Output UStack of int32_t break positions found |
| + * @return The number of breaks found |
| + */ |
| +int32_t |
| +CjkBreakEngine::divideUpDictionaryRange( UText *text, |
| + int32_t rangeStart, |
| + int32_t rangeEnd, |
| + UStack &foundBreaks ) const { |
| + if (rangeStart >= rangeEnd) { |
| + return 0; |
| + } |
| + |
| + const size_t defaultInputLength = 80; |
| + size_t inputLength = rangeEnd - rangeStart; |
| + AutoBuffer<UChar, defaultInputLength> charString(inputLength); |
| + |
| + // Normalize the input string and put it in normalizedText. |
| + // The map from the indices of the normalized input to the raw |
| + // input is kept in charPositions. |
| + UErrorCode status = U_ZERO_ERROR; |
| + utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); |
| + if (U_FAILURE(status)) |
| + return 0; |
| + |
| + UnicodeString inputString(charString.elems(), inputLength); |
| + UNormalizationMode norm_mode = UNORM_NFKC; |
| + UBool isNormalized = |
| + Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || |
| + Normalizer::isNormalized(inputString, norm_mode, status); |
| + |
| + AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); |
| + int numChars = 0; |
| + UText normalizedText = UTEXT_INITIALIZER; |
| + // Needs to be declared here because normalizedText holds onto its buffer. |
| + UnicodeString normalizedString; |
| + if (isNormalized) { |
| + int32_t index = 0; |
| + charPositions[0] = 0; |
| + while(index < inputString.length()) { |
| + index = inputString.moveIndex32(index, 1); |
| + charPositions[++numChars] = index; |
| + } |
| + utext_openUnicodeString(&normalizedText, &inputString, &status); |
| + } |
| + else { |
| + Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); |
| + if (U_FAILURE(status)) |
| + return 0; |
| + charPositions.resize(normalizedString.length() + 1); |
| + Normalizer normalizer(charString.elems(), inputLength, norm_mode); |
| + int32_t index = 0; |
| + charPositions[0] = 0; |
| + while(index < normalizer.endIndex()){ |
| + UChar32 uc = normalizer.next(); |
| + charPositions[++numChars] = index = normalizer.getIndex(); |
| + } |
| + utext_openUnicodeString(&normalizedText, &normalizedString, &status); |
| + } |
| + |
| + if (U_FAILURE(status)) |
| + return 0; |
| + |
| + // From this point on, all the indices refer to the indices of |
| + // the normalized input string. |
| + |
| + // bestSnlp[i] is the snlp of the best segmentation of the first i |
| + // characters in the range to be matched. |
| + AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); |
| + bestSnlp[0] = 0; |
| + for(int i=1; i<=numChars; i++){ |
| + bestSnlp[i] = kuint32max; |
| + } |
| + |
| + // prev[i] is the index of the last CJK character in the previous word in |
| + // the best segmentation of the first i characters. |
| + AutoBuffer<int, defaultInputLength> prev(numChars + 1); |
| + for(int i=0; i<=numChars; i++){ |
| + prev[i] = -1; |
| + } |
| + |
| + const size_t maxWordSize = 20; |
| + AutoBuffer<uint16_t, maxWordSize> values(numChars); |
| + AutoBuffer<int32_t, maxWordSize> lengths(numChars); |
| + |
| + // Dynamic programming to find the best segmentation. |
| + bool is_prev_katakana = false; |
| + for (int i = 0; i < numChars; ++i) { |
| + //utext_setNativeIndex(text, rangeStart + i); |
| + utext_setNativeIndex(&normalizedText, i); |
| + if (bestSnlp[i] == kuint32max) |
| + continue; |
| + |
| + int count; |
| + // limit maximum word length matched to size of current substring |
| + int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i; |
| + |
| + fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); |
| + |
| + // if there are no single character matches found in the dictionary |
| + // starting with this character, treat character as a 1-character word |
| + // with the highest value possible, i.e. the least likely to occur. |
| + // Exclude Korean characters from this treatment, as they should be left |
| + // together by default. |
| + if((count == 0 || lengths[0] != 1) && |
| + !fHangulWordSet.contains(utext_current32(&normalizedText))){ |
| + values[count] = maxSnlp; |
| + lengths[count++] = 1; |
| + } |
| + |
| + for (int j = 0; j < count; j++){ |
| + //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp); |
| + uint32_t newSnlp = bestSnlp[i] + values[j]; |
| + if (newSnlp < bestSnlp[lengths[j] + i]) { |
| + bestSnlp[lengths[j] + i] = newSnlp; |
| + prev[lengths[j] + i] = i; |
| + } |
| + } |
| + |
| + // In Japanese, single-character Katakana words are pretty rare, so we |
| + // apply the following heuristic to Katakana: any continuous run of |
| + // Katakana characters is considered a candidate word with a default |
| + // cost specified in the katakanaCost table according to its length. |
| + //utext_setNativeIndex(text, rangeStart + i); |
| + utext_setNativeIndex(&normalizedText, i); |
| + bool is_katakana = isKatakana(utext_current32(&normalizedText)); |
| + if (!is_prev_katakana && is_katakana) { |
| + int j = i + 1; |
| + utext_next32(&normalizedText); |
| + // Find the end of the continuous run of Katakana characters |
| + while (j < numChars && (j - i) < kMaxKatakanaGroupLength && |
| + isKatakana(utext_current32(&normalizedText))) { |
| + utext_next32(&normalizedText); |
| + ++j; |
| + } |
| + if ((j - i) < kMaxKatakanaGroupLength) { |
| + uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); |
| + if (newSnlp < bestSnlp[j]) { |
| + bestSnlp[j] = newSnlp; |
| + prev[j] = i; |
| + } |
| + } |
| + } |
| + is_prev_katakana = is_katakana; |
| + } |
| + |
| + // Start pushing the optimal offset index into t_boundary (t for tentative). |
| + // prev[numChars] is guaranteed to be meaningful. |
| + // We'll first push in the reverse order, i.e., |
| + // t_boundary[0] = numChars, and afterwards do a swap. |
| + AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); |
| + |
| + int numBreaks = 0; |
| + // No segmentation found, set boundary to end of range |
| + if (bestSnlp[numChars] == kuint32max) { |
| + t_boundary[numBreaks++] = numChars; |
| + } else { |
| + for (int i = numChars; i > 0; i = prev[i]){ |
| + t_boundary[numBreaks++] = i; |
| + |
| + } |
| + U_ASSERT(prev[t_boundary[numBreaks-1]] == 0); |
| + } |
| + |
| + // Reverse offset index in t_boundary. |
| + // Don't add a break for the start of the dictionary range if there is one |
| + // there already. |
| + if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { |
| + t_boundary[numBreaks++] = 0; |
| + } |
| + |
| + // Now that we're done, convert the positions in t_boundary[] (indices in |
| + // the normalized input string) back to indices in the raw input string |
| + // while reversing t_boundary and pushing values to foundBreaks. |
| + for (int i = numBreaks-1; i >= 0; i--) { |
| + foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); |
| + } |
| + |
| + utext_close(&normalizedText); |
| + return numBreaks; |
| +} |
| + |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
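Reading aid (not part of the patch): the loop in divideUpDictionaryRange above is a Viterbi-style dynamic program over bestSnlp[] and prev[]. The sketch below restates that recurrence in isolation, with a toy map of (word, cost) pairs standing in for the TrieWordDictionary and a flat cost of 255 for unmatched characters (mirroring maxSnlp); the function name, dictionary type, and costs are stand-ins, not part of ICU.

```cpp
// Sketch of the bestSnlp/prev dynamic program used above, with a toy
// "dictionary" of (word, cost) pairs instead of a TrieWordDictionary.
#include <string>
#include <vector>
#include <map>
#include <limits>

std::vector<int> segment(const std::string &text,
                         const std::map<std::string, int> &dict) {
    const int n = (int)text.size();
    const int INF = std::numeric_limits<int>::max();
    std::vector<int> best(n + 1, INF), prev(n + 1, -1);
    best[0] = 0;
    for (int i = 0; i < n; ++i) {
        if (best[i] == INF) continue;
        // Try every dictionary word starting at i (the real code asks the
        // trie for all matches at once).
        bool matched = false;
        for (const auto &entry : dict) {
            const std::string &w = entry.first;
            if (text.compare(i, w.size(), w) == 0) {
                matched = true;
                int j = i + (int)w.size();
                if (best[i] + entry.second < best[j]) {
                    best[j] = best[i] + entry.second;
                    prev[j] = i;
                }
            }
        }
        // Unmatched characters become 1-character "words" with a high cost,
        // analogous to the maxSnlp fallback in the patch.
        if (!matched && best[i] + 255 < best[i + 1]) {
            best[i + 1] = best[i] + 255;
            prev[i + 1] = i;
        }
    }
    std::vector<int> breaks;  // boundaries, recovered back to front via prev[]
    for (int i = n; i > 0; i = prev[i]) breaks.insert(breaks.begin(), i);
    return breaks;
}
```

The real engine adds one more wrinkle on top of this recurrence: the Katakana run heuristic, which treats any continuous Katakana run as an extra candidate word with a length-based default cost.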
| --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 |
| +++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800 |
| @@ -1,8 +1,8 @@ |
| /** |
| - ******************************************************************************* |
| - * Copyright (C) 2006, International Business Machines Corporation and others. * |
| - * All Rights Reserved. * |
| - ******************************************************************************* |
| + ********************************************************************************** |
| + * Copyright (C) 2006-2010, International Business Machines Corporation and others. |
| + * All Rights Reserved. |
| + ********************************************************************************** |
| */ |
| |
| #ifndef DICTBE_H |
| @@ -65,31 +65,31 @@ |
| */ |
| virtual ~DictionaryBreakEngine(); |
| |
| - /** |
| - * <p>Indicate whether this engine handles a particular character for |
| - * a particular kind of break.</p> |
| - * |
| - * @param c A character which begins a run that the engine might handle |
| - * @param breakType The type of text break which the caller wants to determine |
| - * @return TRUE if this engine handles the particular character and break |
| - * type. |
| - */ |
| + /** |
| + * <p>Indicate whether this engine handles a particular character for |
| + * a particular kind of break.</p> |
| + * |
| + * @param c A character which begins a run that the engine might handle |
| + * @param breakType The type of text break which the caller wants to determine |
| + * @return TRUE if this engine handles the particular character and break |
| + * type. |
| + */ |
| virtual UBool handles( UChar32 c, int32_t breakType ) const; |
| |
| - /** |
| - * <p>Find any breaks within a run in the supplied text.</p> |
| - * |
| - * @param text A UText representing the text. The |
| - * iterator is left at the end of the run of characters which the engine |
| - * is capable of handling. |
| - * @param startPos The start of the run within the supplied text. |
| - * @param endPos The end of the run within the supplied text. |
| - * @param reverse Whether the caller is looking for breaks in a reverse |
| - * direction. |
| - * @param breakType The type of break desired, or -1. |
| - * @param foundBreaks An allocated C array of the breaks found, if any |
| - * @return The number of breaks found. |
| - */ |
| + /** |
| + * <p>Find any breaks within a run in the supplied text.</p> |
| + * |
| + * @param text A UText representing the text. The iterator is left at |
| + * the end of the run of characters which the engine is capable of handling |
| + * that starts from the first (or last) character in the range. |
| + * @param startPos The start of the run within the supplied text. |
| + * @param endPos The end of the run within the supplied text. |
| + * @param reverse Whether the caller is looking for breaks in a reverse |
| + * direction. |
| + * @param breakType The type of break desired, or -1. |
| + * @param foundBreaks An allocated C array of the breaks found, if any |
| + * @return The number of breaks found. |
| + */ |
| virtual int32_t findBreaks( UText *text, |
| int32_t startPos, |
| int32_t endPos, |
| @@ -114,7 +114,7 @@ |
| // virtual void setBreakTypes( uint32_t breakTypes ); |
| |
| /** |
| - * <p>Divide up a range of known dictionary characters.</p> |
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| @@ -171,7 +171,7 @@ |
| |
| protected: |
| /** |
| - * <p>Divide up a range of known dictionary characters.</p> |
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| * |
| * @param text A UText representing the text |
| * @param rangeStart The start of the range of dictionary characters |
| @@ -186,6 +186,66 @@ |
| |
| }; |
| |
| +/******************************************************************* |
| + * CjkBreakEngine |
| + */ |
| + |
| +// Indicates the language/script that the CjkBreakEngine will handle. |
| +enum LanguageType { |
| + kKorean, |
| + kChineseJapanese |
| +}; |
| + |
| +/** |
| + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a |
| + * TrieWordDictionary with costs associated with each word and |
| + * Viterbi decoding to determine CJK-specific breaks.</p> |
| + */ |
| +class CjkBreakEngine : public DictionaryBreakEngine { |
| + protected: |
| + /** |
| + * The set of characters handled by this engine |
| + * @internal |
| + */ |
| + UnicodeSet fHangulWordSet; |
| + UnicodeSet fHanWordSet; |
| + UnicodeSet fKatakanaWordSet; |
| + UnicodeSet fHiraganaWordSet; |
| + |
| + const TrieWordDictionary *fDictionary; |
| + |
| + public: |
| + |
| + /** |
| + * <p>Constructor.</p> |
| + * |
| + * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the |
| + * engine is deleted. The TrieWordDictionary must contain costs for each word |
| + * in order for the dictionary to work properly. |
| + * @param type The language/script type to handle: kKorean or kChineseJapanese. |
| + * @param status Receives any error code. |
| + */ |
| + CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status); |
| + |
| + /** |
| + * <p>Virtual destructor.</p> |
| + */ |
| + virtual ~CjkBreakEngine(); |
| + |
| + protected: |
| + /** |
| + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
| + * |
| + * @param text A UText representing the text |
| + * @param rangeStart The start of the range of dictionary characters |
| + * @param rangeEnd The end of the range of dictionary characters |
| + * @param foundBreaks Output UStack of int32_t break positions found |
| + * @return The number of breaks found |
| + */ |
| + virtual int32_t divideUpDictionaryRange( UText *text, |
| + int32_t rangeStart, |
| + int32_t rangeEnd, |
| + UStack &foundBreaks ) const; |
| + |
| +}; |
| |
| U_NAMESPACE_END |
| |
| --- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700 |
| +++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800 |
| @@ -1555,10 +1555,12 @@ |
| int32_t endPos, |
| UBool reverse) { |
| // Reset the old break cache first. |
| - uint32_t dictionaryCount = fDictionaryCharCount; |
| reset(); |
| |
| - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { |
| + // Note: the code below assumes that the dictionary characters fall |
| + // within the startPos-endPos range; the value returned should be the |
| + // next position in the sequence. |
| + if ((endPos - startPos) <= 1) { |
| return (reverse ? startPos : endPos); |
| } |
| |
| @@ -1711,7 +1713,7 @@ |
| // proposed break by one of the breaks we found. Use following() and |
| // preceding() to do the work. They should never recurse in this case. |
| if (reverse) { |
| - return preceding(endPos - 1); |
| + return preceding(endPos); |
| } |
| else { |
| return following(startPos); |
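Side note (not part of the patch): the switch from preceding(endPos - 1) to preceding(endPos) above relies on BreakIterator::preceding(n) returning the last boundary strictly before offset n. A small illustration using only the public API (text and offsets chosen for illustration):

```cpp
// Illustrates the preceding()/following() semantics the fix above relies on:
// preceding(n) returns the last boundary strictly before offset n.
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#include <cstdio>

static void showNeighbours(icu::BreakIterator &bi, int32_t offset) {
    printf("preceding(%d) = %d, following(%d) = %d\n",
           (int)offset, (int)bi.preceding(offset),
           (int)offset, (int)bi.following(offset));
}

int main() {
    UErrorCode status = U_ZERO_ERROR;
    icu::BreakIterator *bi =
        icu::BreakIterator::createWordInstance(icu::Locale::getEnglish(), status);
    if (U_FAILURE(status)) return 1;
    icu::UnicodeString text("hello world");   // boundaries at 0, 5, 6, 11
    bi->setText(text);
    showNeighbours(*bi, 5);   // preceding(5) is 0, not 5, even though 5 is a boundary
    showNeighbours(*bi, 6);   // preceding(6) is 5, following(6) is 11
    delete bi;
    return 0;
}
```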
| --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 |
| +++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800 |
| @@ -20,6 +20,7 @@ |
| #include "uvector.h" |
| #include "uvectr32.h" |
| #include "uarrsort.h" |
| +#include "hash.h" |
| |
| //#define DEBUG_TRIE_DICT 1 |
| |
| @@ -27,6 +28,11 @@ |
| #include <sys/times.h> |
| #include <limits.h> |
| #include <stdio.h> |
| +#include <time.h> |
| +#ifndef CLK_TCK |
| +#define CLK_TCK CLOCKS_PER_SEC |
| +#endif |
| + |
| #endif |
| |
| U_NAMESPACE_BEGIN |
| @@ -45,6 +51,11 @@ |
| * MutableTrieDictionary |
| */ |
| |
| +//#define MAX_VALUE 65535 |
| + |
| +// forward declaration |
| +inline uint16_t scaleLogProbabilities(double logprob); |
| + |
| // Node structure for the ternary, uncompressed trie |
| struct TernaryNode : public UMemory { |
| UChar ch; // UTF-16 code unit |
| @@ -77,7 +88,8 @@ |
| delete high; |
| } |
| |
| -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { |
| +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, |
| + UBool containsValue /* = FALSE */ ) { |
| // Start the trie off with something. Having the root node already present |
| // cuts a special case out of the search/insertion functions. |
| // Making it a median character cuts the worse case for searches from |
| @@ -91,14 +103,19 @@ |
| if (U_SUCCESS(status) && fIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| + |
| + fValued = containsValue; |
| } |
| |
| -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { |
| +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, |
| + UBool containsValue /* = false */ ) { |
| fTrie = NULL; |
| fIter = utext_openUChars(NULL, NULL, 0, &status); |
| if (U_SUCCESS(status) && fIter == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| + |
| + fValued = containsValue; |
| } |
| |
| MutableTrieDictionary::~MutableTrieDictionary() { |
| @@ -108,12 +125,13 @@ |
| |
| int32_t |
| MutableTrieDictionary::search( UText *text, |
| - int32_t maxLength, |
| - int32_t *lengths, |
| - int &count, |
| - int limit, |
| - TernaryNode *&parent, |
| - UBool &pMatched ) const { |
| + int32_t maxLength, |
| + int32_t *lengths, |
| + int &count, |
| + int limit, |
| + TernaryNode *&parent, |
| + UBool &pMatched, |
| + uint16_t *values /*=NULL*/) const { |
| // TODO: current implementation works in UTF-16 space |
| const TernaryNode *up = NULL; |
| const TernaryNode *p = fTrie; |
| @@ -121,6 +139,10 @@ |
| pMatched = TRUE; |
| int i; |
| |
| + if (!fValued) { |
| + values = NULL; |
| + } |
| + |
| UChar uc = utext_current32(text); |
| for (i = 0; i < maxLength && p != NULL; ++i) { |
| while (p != NULL) { |
| @@ -141,7 +163,11 @@ |
| break; |
| } |
| // Must be equal to get here |
| - if (limit > 0 && (p->flags & kEndsWord)) { |
| + if (limit > 0 && (p->flags > 0)) { |
| + // is there a more efficient way to add values? i.e., remove the if statement |
| + if(values != NULL) { |
| + values[mycount] = p->flags; |
| + } |
| lengths[mycount++] = i+1; |
| --limit; |
| } |
| @@ -161,13 +187,14 @@ |
| void |
| MutableTrieDictionary::addWord( const UChar *word, |
| int32_t length, |
| - UErrorCode &status ) { |
| -#if 0 |
| - if (length <= 0) { |
| + UErrorCode &status, |
| + uint16_t value /* = 0 */ ) { |
| + // dictionary cannot store zero values, would interfere with flags |
| + if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| -#endif |
| + |
| TernaryNode *parent; |
| UBool pMatched; |
| int count; |
| @@ -177,7 +204,7 @@ |
| matched = search(fIter, length, NULL, count, 0, parent, pMatched); |
| |
| while (matched++ < length) { |
| - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? |
| + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? |
| U_ASSERT(uc != U_SENTINEL); |
| TernaryNode *newNode = new TernaryNode(uc); |
| if (newNode == NULL) { |
| @@ -199,30 +226,23 @@ |
| parent = newNode; |
| } |
| |
| - parent->flags |= kEndsWord; |
| -} |
| - |
| -#if 0 |
| -void |
| -MutableTrieDictionary::addWords( UEnumeration *words, |
| - UErrorCode &status ) { |
| - int32_t length; |
| - const UChar *word; |
| - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { |
| - addWord(word, length, status); |
| + if(fValued && value > 0){ |
| + parent->flags = value; |
| + } else { |
| + parent->flags |= kEndsWord; |
| } |
| } |
| -#endif |
| |
| int32_t |
| MutableTrieDictionary::matches( UText *text, |
| int32_t maxLength, |
| int32_t *lengths, |
| int &count, |
| - int limit ) const { |
| + int limit, |
| + uint16_t *values /*=NULL*/) const { |
| TernaryNode *parent; |
| UBool pMatched; |
| - return search(text, maxLength, lengths, count, limit, parent, pMatched); |
| + return search(text, maxLength, lengths, count, limit, parent, pMatched, values); |
| } |
| |
| // Implementation of iteration for MutableTrieDictionary |
| @@ -277,7 +297,7 @@ |
| break; |
| } |
| case kEqual: |
| - emit = (node->flags & kEndsWord) != 0; |
| + emit = node->flags > 0; |
| equal = (node->equal != NULL); |
| // If this node should be part of the next emitted string, append |
| // the UChar to the string, and make sure we pop it when we come |
| @@ -299,7 +319,7 @@ |
| } |
| case kGreaterThan: |
| // If this node's character is in the string, remove it. |
| - if (node->equal != NULL || (node->flags & kEndsWord)) { |
| + if (node->equal != NULL || node->flags > 0) { |
| unistr.truncate(unistr.length()-1); |
| } |
| if (node->high != NULL) { |
| @@ -354,12 +374,75 @@ |
| * CompactTrieDictionary |
| */ |
| |
| +//TODO further optimization: |
| +// minimise size of trie with logprobs by storing values |
| +// for terminal nodes directly in offsets[] |
| +// --> calculating from next offset *might* be simpler, but would have to add |
| +// one last offset for logprob of last node |
| +// --> if calculate from current offset, need to factor in possible overflow |
| +// as well. |
| +// idea: store in offset, set first bit to indicate logprob storage-->won't |
| +// have to access additional node |
| + |
| +// {'Dic', 1}, version 1: uses old header, no values |
| +#define COMPACT_TRIE_MAGIC_1 0x44696301 |
| +// version 2: uses new header (more than 2^16 nodes), no values |
| +#define COMPACT_TRIE_MAGIC_2 0x44696302 |
| +// version 3: uses new header, includes values |
| +#define COMPACT_TRIE_MAGIC_3 0x44696303 |
| + |
| struct CompactTrieHeader { |
| uint32_t size; // Size of the data in bytes |
| uint32_t magic; // Magic number (including version) |
| + uint32_t nodeCount; // Number of entries in offsets[] |
| + uint32_t root; // Node number of the root node |
| + uint32_t offsets[1]; // Offsets to nodes from start of data |
| +}; |
| + |
| +// old version of CompactTrieHeader kept for backwards compatibility |
| +struct CompactTrieHeaderV1 { |
| + uint32_t size; // Size of the data in bytes |
| + uint32_t magic; // Magic number (including version) |
| uint16_t nodeCount; // Number of entries in offsets[] |
| uint16_t root; // Node number of the root node |
| - uint32_t offsets[1]; // Offsets to nodes from start of data |
| + uint32_t offsets[1]; // Offsets to nodes from start of data |
| +}; |
| + |
| +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 |
| +struct CompactTrieInfo { |
| + uint32_t size; // Size of the data in bytes |
| + uint32_t magic; // Magic number (including version) |
| + uint32_t nodeCount; // Number of entries in offsets[] |
| + uint32_t root; // Node number of the root node |
| + uint32_t *offsets; // Offsets to nodes from start of data |
| + uint8_t *address; // pointer to header bytes in memory |
| + |
| + CompactTrieInfo(const void *data, UErrorCode &status){ |
| + CompactTrieHeader *header = (CompactTrieHeader *) data; |
| + if (header->magic != COMPACT_TRIE_MAGIC_1 && |
| + header->magic != COMPACT_TRIE_MAGIC_2 && |
| + header->magic != COMPACT_TRIE_MAGIC_3) { |
| + status = U_ILLEGAL_ARGUMENT_ERROR; |
| + } else { |
| + size = header->size; |
| + magic = header->magic; |
| + |
| + if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; |
| + nodeCount = headerV1->nodeCount; |
| + root = headerV1->root; |
| + offsets = &(headerV1->offsets[0]); |
| + address = (uint8_t *)headerV1; |
| + } else { |
| + nodeCount = header->nodeCount; |
| + root = header->root; |
| + offsets = &(header->offsets[0]); |
| + address = (uint8_t *)header; |
| + } |
| + } |
| + } |
| + |
| + ~CompactTrieInfo(){} |
| }; |
| |
| // Note that to avoid platform-specific alignment issues, all members of the node |
| @@ -375,10 +458,14 @@ |
| enum CompactTrieNodeFlags { |
| kVerticalNode = 0x1000, // This is a vertical node |
| kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word |
| - kReservedFlag1 = 0x4000, |
| - kReservedFlag2 = 0x8000, |
| + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1 |
| + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2 |
| kCountMask = 0x0FFF, // The count portion of flagscount |
| - kFlagMask = 0xF000 // The flags portion of flagscount |
| + kFlagMask = 0xF000, // The flags portion of flagscount |
| + kRootCountMask = 0x7FFF // The count portion of flagscount in the root node |
| + |
| + //offset flags: |
| + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node |
| }; |
| |
| // The two node types are distinguished by the kVerticalNode flag. |
| @@ -402,63 +489,177 @@ |
| uint16_t chars[1]; // Code units |
| }; |
| |
| -// {'Dic', 1}, version 1 |
| -#define COMPACT_TRIE_MAGIC_1 0x44696301 |
| - |
| CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, |
| UErrorCode &status ) |
| : fUData(dataObj) |
| { |
| - fData = (const CompactTrieHeader *) udata_getMemory(dataObj); |
| + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| + *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); |
| fOwnData = FALSE; |
| - if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
| - status = U_ILLEGAL_ARGUMENT_ERROR; |
| - fData = NULL; |
| - } |
| } |
| + |
| CompactTrieDictionary::CompactTrieDictionary( const void *data, |
| UErrorCode &status ) |
| : fUData(NULL) |
| { |
| - fData = (const CompactTrieHeader *) data; |
| + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| + *fInfo = CompactTrieInfo(data, status); |
| fOwnData = FALSE; |
| - if (fData->magic != COMPACT_TRIE_MAGIC_1) { |
| - status = U_ILLEGAL_ARGUMENT_ERROR; |
| - fData = NULL; |
| - } |
| } |
| |
| CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, |
| UErrorCode &status ) |
| : fUData(NULL) |
| { |
| - fData = compactMutableTrieDictionary(dict, status); |
| + const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status); |
| + if (U_SUCCESS(status)) { |
| + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); |
| + *fInfo = CompactTrieInfo(header, status); |
| + } |
| + |
| fOwnData = !U_FAILURE(status); |
| } |
| |
| CompactTrieDictionary::~CompactTrieDictionary() { |
| if (fOwnData) { |
| - uprv_free((void *)fData); |
| + uprv_free((void *)(fInfo->address)); |
| } |
| + uprv_free((void *)fInfo); |
| + |
| if (fUData) { |
| udata_close(fUData); |
| } |
| } |
| |
| +UBool CompactTrieDictionary::getValued() const{ |
| + return fInfo->magic == COMPACT_TRIE_MAGIC_3; |
| +} |
| + |
| uint32_t |
| CompactTrieDictionary::dataSize() const { |
| - return fData->size; |
| + return fInfo->size; |
| } |
| |
| const void * |
| CompactTrieDictionary::data() const { |
| - return fData; |
| + return fInfo->address; |
| +} |
| + |
| +//This function finds the address of a node for us, given its node ID |
| +static inline const CompactTrieNode * |
| +getCompactNode(const CompactTrieInfo *info, uint32_t node) { |
| + if(node < info->root-1) { |
| + return (const CompactTrieNode *)(&info->offsets[node]); |
| + } else { |
| + return (const CompactTrieNode *)(info->address + info->offsets[node]); |
| + } |
| } |
| |
| -// This function finds the address of a node for us, given its node ID |
| +//this version of getCompactNode is currently only used in compactMutableTrieDictionary() |
| static inline const CompactTrieNode * |
| -getCompactNode(const CompactTrieHeader *header, uint16_t node) { |
| - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); |
| +getCompactNode(const CompactTrieHeader *header, uint32_t node) { |
| + if(node < header->root-1) { |
| + return (const CompactTrieNode *)(&header->offsets[node]); |
| + } else { |
| + return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); |
| + } |
| +} |
| + |
| + |
| +/** |
| + * Calculates the number of links in a node |
| + * @param node The specified node |
| + */ |
| +static inline const uint16_t |
| +getCount(const CompactTrieNode *node){ |
| + return (node->flagscount & kCountMask); |
| + // use the code below if the number of links ever exceeds 4096 |
| + //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2); |
| +} |
| + |
| +/** |
| + * Calculates the equal link node ID of a vertical node |
| + * @param vnode The vertical node containing the equal link |
| + */ |
| +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ |
| + if(vnode->flagscount & kEqualOverflows){ |
| + // treat overflow bits as an extension of chars[] |
| + uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)]; |
| + return vnode->equal + (((uint32_t)*overflow) << 16); |
| + }else{ |
| + return vnode->equal; |
| + } |
| +} |
| + |
| +/** |
| + * calculates an equal link node ID of a horizontal node |
| + * @param hnode The horizontal node containing the equal link |
| + * @param index The index into hnode->entries[] |
| + * @param nodeCount The length of hnode->entries[] |
| + */ |
| +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){ |
| + if(hnode->flagscount & kEqualOverflows){ |
| + //set overflow to point to the uint16_t containing the overflow bits |
| + uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; |
| + overflow += index/4; |
| + uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; |
| + return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); |
| + } else { |
| + return hnode->entries[index].equal; |
| + } |
| +} |
| + |
| +/** |
| + * Returns the value stored in the specified node which is associated with its |
| + * parent node. |
| + * TODO: how to tell that value is stored in node or in offset? check whether |
| + * node ID < fInfo->root! |
| + */ |
| +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ |
| + uint16_t count = getCount((CompactTrieNode *)hnode); |
| + uint16_t overflowSize = 0; //size of node ID overflow storage in bytes |
| + |
| + if(hnode->flagscount & kEqualOverflows) |
| + overflowSize = (count + 3) / 4 * sizeof(uint16_t); |
| + return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); |
| +} |
| + |
| +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ |
| + // calculate size of total node ID overflow storage in bytes |
| + uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0; |
| + return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize)); |
| +} |
| + |
| +static inline uint16_t getValue(const CompactTrieNode *node){ |
| + if(node->flagscount & kVerticalNode) |
| + return getValue((const CompactTrieVerticalNode *)node); |
| + else |
| + return getValue((const CompactTrieHorizontalNode *)node); |
| +} |
| + |
| +//returns index of match in CompactTrieHorizontalNode.entries[] using binary search |
| +inline int16_t |
| +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, |
| + UChar uc, uint16_t nodeCount){ |
| + int low = 0; |
| + int high = nodeCount-1; |
| + int middle; |
| + while (high >= low) { |
| + middle = (high+low)/2; |
| + if (uc == entries[middle].ch) { |
| + return middle; |
| + } |
| + else if (uc < entries[middle].ch) { |
| + high = middle-1; |
| + } |
| + else { |
| + low = middle+1; |
| + } |
| + } |
| + |
| + return -1; |
| } |
| |
| int32_t |
| @@ -466,17 +667,38 @@ |
| int32_t maxLength, |
| int32_t *lengths, |
| int &count, |
| - int limit ) const { |
| + int limit, |
| + uint16_t *values /*= NULL*/) const { |
| + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) |
| + values = NULL; |
| + |
| // TODO: current implementation works in UTF-16 space |
| - const CompactTrieNode *node = getCompactNode(fData, fData->root); |
| + const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); |
| int mycount = 0; |
| |
| UChar uc = utext_current32(text); |
| int i = 0; |
| |
| + // handle root node with only kEqualOverflows flag: assume horizontal node without parent |
| + if(node != NULL){ |
| + const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node; |
| + int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask); |
| + if(index > -1){ |
| + node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask)); |
| + utext_next32(text); |
| + uc = utext_current32(text); |
| + ++i; |
| + }else{ |
| + node = NULL; |
| + } |
| + } |
| + |
| while (node != NULL) { |
| // Check if the node we just exited ends a word |
| if (limit > 0 && (node->flagscount & kParentEndsWord)) { |
| + if(values != NULL){ |
| + values[mycount] = getValue(node); |
| + } |
| lengths[mycount++] = i; |
| --limit; |
| } |
| @@ -487,7 +709,7 @@ |
| break; |
| } |
| |
| - int nodeCount = (node->flagscount & kCountMask); |
| + int nodeCount = getCount(node); |
| if (nodeCount == 0) { |
| // Special terminal node; return now |
| break; |
| @@ -507,35 +729,27 @@ |
| // To get here we must have come through the whole list successfully; |
| // go on to the next node. Note that a word cannot end in the middle |
| // of a vertical node. |
| - node = getCompactNode(fData, vnode->equal); |
| + node = getCompactNode(fInfo, calcEqualLink(vnode)); |
| } |
| else { |
| // Horizontal node; do binary search |
| const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; |
| - int low = 0; |
| - int high = nodeCount-1; |
| - int middle; |
| - node = NULL; // If we don't find a match, we'll fall out of the loop |
| - while (high >= low) { |
| - middle = (high+low)/2; |
| - if (uc == hnode->entries[middle].ch) { |
| - // We hit a match; get the next node and next character |
| - node = getCompactNode(fData, hnode->entries[middle].equal); |
| - utext_next32(text); |
| - uc = utext_current32(text); |
| - ++i; |
| - break; |
| - } |
| - else if (uc < hnode->entries[middle].ch) { |
| - high = middle-1; |
| - } |
| - else { |
| - low = middle+1; |
| - } |
| + const CompactTrieHorizontalEntry *entries; |
| + entries = hnode->entries; |
| + |
| + int index = searchHorizontalEntries(entries, uc, nodeCount); |
| + if(index > -1){ |
| + // We hit a match; get the next node and next character |
| + node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount)); |
| + utext_next32(text); |
| + uc = utext_current32(text); |
| + ++i; |
| + }else{ |
| + node = NULL; // If we don't find a match, we'll fall out of the loop |
| } |
| } |
| } |
| -exit: |
| + exit: |
| count = mycount; |
| return i; |
| } |
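Reading aid (not part of the patch): calcEqualLink() above reconstructs node IDs wider than 16 bits by combining the 16-bit equal field with a 4-bit high nibble packed after entries[], four entries per uint16_t with the earliest entry in the most significant nibble (see getLeftmostBits() in the build code further down). The toy round-trip below mirrors that packing with plain vectors instead of the on-disk node layout; the struct and function names are invented for illustration.

```cpp
// Toy round-trip of the 4-bits-per-entry overflow packing: node IDs up to
// 20 bits are split into a 16-bit "equal" field plus a packed high nibble.
#include <cstdint>
#include <vector>
#include <cassert>

struct PackedLinks {
    std::vector<uint16_t> equal;     // low 16 bits, one per entry
    std::vector<uint16_t> overflow;  // high nibbles, 4 entries per uint16_t
};

static PackedLinks pack(const std::vector<uint32_t> &nodeIDs) {
    PackedLinks p;
    p.overflow.assign((nodeIDs.size() + 3) / 4, 0);
    for (size_t i = 0; i < nodeIDs.size(); ++i) {
        p.equal.push_back((uint16_t)(nodeIDs[i] & 0xFFFF));
        uint16_t nibble = (uint16_t)((nodeIDs[i] >> 16) & 0xF);
        // Entry i's nibble lives in word i/4; entry i%4 == 0 occupies the
        // most significant nibble, mirroring the writer's getLeftmostBits().
        p.overflow[i / 4] |= (uint16_t)(nibble << ((3 - (i % 4)) * 4));
    }
    return p;
}

static uint32_t unpack(const PackedLinks &p, size_t i) {
    uint16_t word = p.overflow[i / 4];
    uint16_t extraBits = (word >> ((3 - (i % 4)) * 4)) & 0xF;  // as in calcEqualLink
    return p.equal[i] + ((uint32_t)extraBits << 16);
}

int main() {
    std::vector<uint32_t> ids = {0x00012, 0x1FFFF, 0x2ABCD, 0x0FFFF, 0x30001};
    PackedLinks p = pack(ids);
    for (size_t i = 0; i < ids.size(); ++i)
        assert(unpack(p, i) == ids[i]);
    return 0;
}
```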
| @@ -545,16 +759,16 @@ |
| private: |
| UVector32 fNodeStack; // Stack of nodes to process |
| UVector32 fIndexStack; // Stack of where in node we are |
| - const CompactTrieHeader *fHeader; // Trie data |
| + const CompactTrieInfo *fInfo; // Trie data |
| |
| public: |
| static UClassID U_EXPORT2 getStaticClassID(void); |
| virtual UClassID getDynamicClassID(void) const; |
| public: |
| - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) |
| + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) |
| : fNodeStack(status), fIndexStack(status) { |
| - fHeader = header; |
| - fNodeStack.push(header->root, status); |
| + fInfo = info; |
| + fNodeStack.push(info->root, status); |
| fIndexStack.push(0, status); |
| unistr.remove(); |
| } |
| @@ -564,14 +778,14 @@ |
| |
| virtual StringEnumeration *clone() const { |
| UErrorCode status = U_ZERO_ERROR; |
| - return new CompactTrieEnumeration(fHeader, status); |
| + return new CompactTrieEnumeration(fInfo, status); |
| } |
| |
| virtual const UnicodeString * snext(UErrorCode &status); |
| |
| // Very expensive, but this should never be used. |
| virtual int32_t count(UErrorCode &status) const { |
| - CompactTrieEnumeration counter(fHeader, status); |
| + CompactTrieEnumeration counter(fInfo, status); |
| int32_t result = 0; |
| while (counter.snext(status) != NULL && U_SUCCESS(status)) { |
| ++result; |
| @@ -582,7 +796,7 @@ |
| virtual void reset(UErrorCode &status) { |
| fNodeStack.removeAllElements(); |
| fIndexStack.removeAllElements(); |
| - fNodeStack.push(fHeader->root, status); |
| + fNodeStack.push(fInfo->root, status); |
| fIndexStack.push(0, status); |
| unistr.remove(); |
| } |
| @@ -595,26 +809,34 @@ |
| if (fNodeStack.empty() || U_FAILURE(status)) { |
| return NULL; |
| } |
| - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); |
| + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); |
| int where = fIndexStack.peeki(); |
| while (!fNodeStack.empty() && U_SUCCESS(status)) { |
| - int nodeCount = (node->flagscount & kCountMask); |
| + int nodeCount; |
| + |
| + bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); |
| + if(isRoot){ |
| + nodeCount = node->flagscount & kRootCountMask; |
| + } else { |
| + nodeCount = getCount(node); |
| + } |
| + |
| UBool goingDown = FALSE; |
| if (nodeCount == 0) { |
| // Terminal node; go up immediately |
| fNodeStack.popi(); |
| fIndexStack.popi(); |
| - node = getCompactNode(fHeader, fNodeStack.peeki()); |
| + node = getCompactNode(fInfo, fNodeStack.peeki()); |
| where = fIndexStack.peeki(); |
| } |
| - else if (node->flagscount & kVerticalNode) { |
| + else if ((node->flagscount & kVerticalNode) && !isRoot) { |
| // Vertical node |
| const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; |
| if (where == 0) { |
| // Going down |
| - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); |
| + unistr.append((const UChar *)vnode->chars, nodeCount); |
| fIndexStack.setElementAt(1, fIndexStack.size()-1); |
| - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status)); |
| + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status)); |
| where = fIndexStack.push(0, status); |
| goingDown = TRUE; |
| } |
| @@ -623,7 +845,7 @@ |
| unistr.truncate(unistr.length()-nodeCount); |
| fNodeStack.popi(); |
| fIndexStack.popi(); |
| - node = getCompactNode(fHeader, fNodeStack.peeki()); |
| + node = getCompactNode(fInfo, fNodeStack.peeki()); |
| where = fIndexStack.peeki(); |
| } |
| } |
| @@ -638,7 +860,7 @@ |
| // Push on next node |
| unistr.append((UChar)hnode->entries[where].ch); |
| fIndexStack.setElementAt(where+1, fIndexStack.size()-1); |
| - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status)); |
| + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status)); |
| where = fIndexStack.push(0, status); |
| goingDown = TRUE; |
| } |
| @@ -646,12 +868,14 @@ |
| // Going up |
| fNodeStack.popi(); |
| fIndexStack.popi(); |
| - node = getCompactNode(fHeader, fNodeStack.peeki()); |
| + node = getCompactNode(fInfo, fNodeStack.peeki()); |
| where = fIndexStack.peeki(); |
| } |
| } |
| + |
| // Check if the parent of the node we've just gone down to ends a |
| // word. If so, return it. |
| + // The root node should never end up here. |
| if (goingDown && (node->flagscount & kParentEndsWord)) { |
| return &unistr; |
| } |
| @@ -664,7 +888,7 @@ |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| - return new CompactTrieEnumeration(fData, status); |
| + return new CompactTrieEnumeration(fInfo, status); |
| } |
| |
| // |
| @@ -672,21 +896,36 @@ |
| // and back again |
| // |
| |
| -// Helper classes to construct the compact trie |
| +enum CompactTrieNodeType { |
| + kHorizontalType = 0, |
| + kVerticalType = 1, |
| + kValueType = 2 |
| +}; |
| + |
| +/** |
| + * The following classes (i.e. BuildCompactTrie*Node) are helper classes to |
| + * construct the compact trie by storing information for each node and later |
| + * writing the node to memory in a sequential format. |
| + */ |
| class BuildCompactTrieNode: public UMemory { |
| - public: |
| +public: |
| UBool fParentEndsWord; |
| - UBool fVertical; |
| + CompactTrieNodeType fNodeType; |
| UBool fHasDuplicate; |
| + UBool fEqualOverflows; |
| int32_t fNodeID; |
| UnicodeString fChars; |
| + uint16_t fValue; |
| |
| - public: |
| - BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) { |
| +public: |
| + BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, |
| + UStack &nodes, UErrorCode &status, uint16_t value = 0) { |
| fParentEndsWord = parentEndsWord; |
| fHasDuplicate = FALSE; |
| - fVertical = vertical; |
| + fNodeType = nodeType; |
| + fEqualOverflows = FALSE; |
| fNodeID = nodes.size(); |
| + fValue = parentEndsWord? value : 0; |
| nodes.push(this, status); |
| } |
| |
| @@ -694,87 +933,225 @@ |
| } |
| |
| virtual uint32_t size() { |
| - return sizeof(uint16_t); |
| + if(fValue > 0) |
| + return sizeof(uint16_t) * 2; |
| + else |
| + return sizeof(uint16_t); |
| } |
| |
| virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) { |
| // Write flag/count |
| - *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) |
| - | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 ); |
| + |
| + // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be |
| + // used as a 5th MSB. |
| + U_ASSERT(fChars.length() < 4096 || fNodeID == 2); |
| + |
| + *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) | |
| + ((fNodeID == 2)? (fChars.length() & kRootCountMask): |
| + ( |
| + (fChars.length() & kCountMask) | |
| + //((fChars.length() << 2) & kExceedsCount) | |
| + (fNodeType == kVerticalType ? kVerticalNode : 0) | |
| + (fParentEndsWord ? kParentEndsWord : 0 ) |
| + ) |
| + ); |
| offset += sizeof(uint16_t); |
| } |
| + |
| + virtual void writeValue(uint8_t *bytes, uint32_t &offset) { |
| + if(fValue > 0){ |
| + *((uint16_t *)(bytes+offset)) = fValue; |
| + offset += sizeof(uint16_t); |
| + } |
| + } |
| + |
| +}; |
| + |
| +/** |
| + * Stores value of parent terminating nodes that have no more subtries. |
| + */ |
| +class BuildCompactTrieValueNode: public BuildCompactTrieNode { |
| +public: |
| + BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) |
| + : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ |
| + } |
| + |
| + virtual ~BuildCompactTrieValueNode(){ |
| + } |
| + |
| + virtual uint32_t size() { |
| + return sizeof(uint16_t) * 2; |
| + } |
| + |
| + virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { |
| + // don't write value directly to memory but store it in offset to be written later |
| + //offset = fValue & kOffsetContainsValue; |
| + BuildCompactTrieNode::write(bytes, offset, translate); |
| + BuildCompactTrieNode::writeValue(bytes, offset); |
| + } |
| }; |
| |
| class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { |
| public: |
| UStack fLinks; |
| + UBool fMayOverflow; //intermediate value for fEqualOverflows |
| |
| public: |
| - BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) |
| - : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) { |
| + BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) |
| + : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) { |
| + fMayOverflow = FALSE; |
| } |
| |
| virtual ~BuildCompactTrieHorizontalNode() { |
| } |
| |
| + // It is impossible to know beforehand exactly how much space the node will |
| + // need in memory before being written, because the node IDs in the equal |
| + // links may or may not overflow after node coalescing. Therefore, this method |
| + // returns the maximum size possible for the node. |
| virtual uint32_t size() { |
| - return offsetof(CompactTrieHorizontalNode,entries) + |
| - (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
| + uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + |
| + (fChars.length()*sizeof(CompactTrieHorizontalEntry)); |
| + |
| + if(fValue > 0) |
| + estimatedSize += sizeof(uint16_t); |
| + |
| + //estimate extra space needed to store overflow for node ID links |
| + //may be more than what is actually needed |
| + for(int i=0; i < fChars.length(); i++){ |
| + if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ |
| + fMayOverflow = TRUE; |
| + break; |
| + } |
| + } |
| + if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t) |
| + estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; |
| + |
| + return estimatedSize; |
| } |
| |
| virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { |
| - BuildCompactTrieNode::write(bytes, offset, translate); |
| int32_t count = fChars.length(); |
| + |
| + //if largest nodeID > 2^16, set flag |
| + //large node IDs are more likely to be at the back of the array |
| + for (int32_t i = count-1; i >= 0; --i) { |
| + if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){ |
| + fEqualOverflows = TRUE; |
| + break; |
| + } |
| + } |
| + |
| + BuildCompactTrieNode::write(bytes, offset, translate); |
| + |
| + // write entries[] to memory |
| for (int32_t i = 0; i < count; ++i) { |
| CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset); |
| entry->ch = fChars[i]; |
| entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| #ifdef DEBUG_TRIE_DICT |
| - if (entry->equal == 0) { |
| + |
| + if ((entry->equal == 0) && !fEqualOverflows) { |
| fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", |
| i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| } |
| #endif |
| offset += sizeof(CompactTrieHorizontalEntry); |
| } |
| + |
| + // append extra bits of equal nodes to end if fEqualOverflows |
| + if (fEqualOverflows) { |
| + uint16_t leftmostBits = 0; |
| + for (int16_t i = 0; i < count; i++) { |
| + leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i); |
| + |
| + // write filled uint16_t to memory |
| + if(i % 4 == 3){ |
| + *((uint16_t *)(bytes+offset)) = leftmostBits; |
| + leftmostBits = 0; |
| + offset += sizeof(uint16_t); |
| + } |
| + } |
| + |
| + // pad last uint16_t with zeroes if necessary |
| + int remainder = count % 4; |
| + if (remainder > 0) { |
| + *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder)); |
| + offset += sizeof(uint16_t); |
| + } |
| + } |
| + |
| + BuildCompactTrieNode::writeValue(bytes, offset); |
| + } |
| + |
| + // returns leftmost bits of physical node link |
| + uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ |
| + uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16); |
| +#ifdef DEBUG_TRIE_DICT |
| + if (leftmostBits > 0xF) { |
| + fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n", |
| + i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); |
| + } |
| +#endif |
| + return leftmostBits; |
| } |
| |
| void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { |
| fChars.append(ch); |
| fLinks.push(link, status); |
| } |
| + |
| }; |
| |
| class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { |
| - public: |
| +public: |
| BuildCompactTrieNode *fEqual; |
| |
| - public: |
| - BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) |
| - : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { |
| +public: |
| + BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) |
| + : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) { |
| fEqual = NULL; |
| } |
| |
| virtual ~BuildCompactTrieVerticalNode() { |
| } |
| |
| + // Returns the maximum possible size of this node. See comment in |
| + // BuildCompactTrieHorizontalNode for more information. |
| virtual uint32_t size() { |
| - return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); |
| + uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); |
| + if(fValue > 0){ |
| + estimatedSize += sizeof(uint16_t); |
| + } |
| + |
| + if(fEqual->fNodeID > 0xFFFF){ |
| + estimatedSize += sizeof(uint16_t); |
| + } |
| + return estimatedSize; |
| } |
| |
| virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { |
| CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset); |
| + fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); |
| BuildCompactTrieNode::write(bytes, offset, translate); |
| node->equal = translate.elementAti(fEqual->fNodeID); |
| offset += sizeof(node->equal); |
| #ifdef DEBUG_TRIE_DICT |
| - if (node->equal == 0) { |
| + if ((node->equal == 0) && !fEqualOverflows) { |
| fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n", |
| fEqual->fNodeID); |
| } |
| #endif |
| fChars.extract(0, fChars.length(), (UChar *)node->chars); |
| - offset += sizeof(uint16_t)*fChars.length(); |
| + offset += sizeof(UChar)*fChars.length(); |
| + |
| + // append the high-order 16 bits of the equal node ID to the end if fEqualOverflows |
| + if (fEqualOverflows) { |
| + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16); |
| + offset += sizeof(uint16_t); |
| + } |
| + |
| + BuildCompactTrieNode::writeValue(bytes, offset); |
| } |
| |
| void addChar(UChar ch) { |
| @@ -784,60 +1161,85 @@ |
| void setLink(BuildCompactTrieNode *node) { |
| fEqual = node; |
| } |
| + |
| }; |
| |
| // Forward declaration |
| static void walkHorizontal(const TernaryNode *node, |
| BuildCompactTrieHorizontalNode *building, |
| UStack &nodes, |
| - UErrorCode &status); |
| + UErrorCode &status, |
| + Hashtable *values); |
| |
| -// Convert one node. Uses recursion. |
| +// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. |
| |
| static BuildCompactTrieNode * |
| -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) { |
| +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, |
| + UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) { |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| BuildCompactTrieNode *result = NULL; |
| UBool horizontal = (node->low != NULL || node->high != NULL); |
| if (horizontal) { |
| - BuildCompactTrieHorizontalNode *hResult = |
| - new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); |
| + BuildCompactTrieHorizontalNode *hResult; |
| + if(values != NULL){ |
| + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue); |
| + } else { |
| + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); |
| + } |
| + |
| if (hResult == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| if (U_SUCCESS(status)) { |
| - walkHorizontal(node, hResult, nodes, status); |
| + walkHorizontal(node, hResult, nodes, status, values); |
| result = hResult; |
| } |
| } |
| else { |
| - BuildCompactTrieVerticalNode *vResult = |
| - new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); |
| + BuildCompactTrieVerticalNode *vResult; |
| + if(values != NULL){ |
| + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue); |
| + } else { |
| + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); |
| + } |
| + |
| if (vResult == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| + return NULL; |
| } |
| else if (U_SUCCESS(status)) { |
| - UBool endsWord = FALSE; |
| + uint16_t value = 0; |
| + UBool endsWord = FALSE; |
| // Take up nodes until we end a word, or hit a node with < or > links |
| do { |
| vResult->addChar(node->ch); |
| - endsWord = (node->flags & kEndsWord) != 0; |
| + value = node->flags; |
| + endsWord = value > 0; |
| node = node->equal; |
| } |
| while(node != NULL && !endsWord && node->low == NULL && node->high == NULL); |
| + |
| if (node == NULL) { |
| if (!endsWord) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie |
| } |
| - else { |
| + else if(values != NULL){ |
| + UnicodeString key(value); //store value as a single-char UnicodeString |
| + BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key); |
| + if(link == NULL){ |
| + link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes? |
| + values->put(key, link, status); |
| + } |
| + vResult->setLink(link); |
| + } else { |
| vResult->setLink((BuildCompactTrieNode *)nodes[1]); |
| } |
| } |
| else { |
| - vResult->setLink(compactOneNode(node, endsWord, nodes, status)); |
| + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value)); |
| } |
| result = vResult; |
| } |
| @@ -849,19 +1251,28 @@ |
| // Uses recursion. |
| |
| static void walkHorizontal(const TernaryNode *node, |
| - BuildCompactTrieHorizontalNode *building, |
| - UStack &nodes, |
| - UErrorCode &status) { |
| + BuildCompactTrieHorizontalNode *building, |
| + UStack &nodes, |
| + UErrorCode &status, Hashtable *values = NULL) { |
| while (U_SUCCESS(status) && node != NULL) { |
| if (node->low != NULL) { |
| - walkHorizontal(node->low, building, nodes, status); |
| + walkHorizontal(node->low, building, nodes, status, values); |
| } |
| BuildCompactTrieNode *link = NULL; |
| if (node->equal != NULL) { |
| - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status); |
| + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags); |
| } |
| - else if (node->flags & kEndsWord) { |
| - link = (BuildCompactTrieNode *)nodes[1]; |
| + else if (node->flags > 0) { |
| + if(values != NULL) { |
| + UnicodeString key(node->flags); //store value as a single-char UnicodeString |
| + link = (BuildCompactTrieValueNode *) values->get(key); |
| + if(link == NULL) { |
| + link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes? |
| + values->put(key, link, status); |
| + } |
| + } else { |
| + link = (BuildCompactTrieNode *)nodes[1]; |
| + } |
| } |
| if (U_SUCCESS(status) && link != NULL) { |
| building->addNode(node->ch, link, status); |
| @@ -881,13 +1292,15 @@ |
| _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { |
| BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; |
| BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; |
| + |
| // Check for comparing a node to itself, to avoid spurious duplicates |
| if (left == right) { |
| return 0; |
| } |
| + |
| // Most significant is type of node. Can never coalesce. |
| - if (left->fVertical != right->fVertical) { |
| - return left->fVertical - right->fVertical; |
| + if (left->fNodeType != right->fNodeType) { |
| + return left->fNodeType - right->fNodeType; |
| } |
| // Next, the "parent ends word" flag. If that differs, we cannot coalesce. |
| if (left->fParentEndsWord != right->fParentEndsWord) { |
| @@ -898,12 +1311,19 @@ |
| if (result != 0) { |
| return result; |
| } |
| + |
| + // If the node value differs, we should not coalesce. |
| + // If values aren't stored, all fValues should be 0. |
| + if (left->fValue != right->fValue) { |
| + return left->fValue - right->fValue; |
| + } |
| + |
| // We know they're both the same node type, so branch for the two cases. |
| - if (left->fVertical) { |
| + if (left->fNodeType == kVerticalType) { |
| result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID |
| - - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
| + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; |
| } |
| - else { |
| + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ |
| // We need to compare the links vectors. They should be the |
| // same size because the strings were equal. |
| // We compare the node IDs instead of the pointers, to handle |
| @@ -914,9 +1334,10 @@ |
| int32_t count = hleft->fLinks.size(); |
| for (int32_t i = 0; i < count && result == 0; ++i) { |
| result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - |
| - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; |
| } |
| } |
| + |
| // If they are equal to each other, mark them (speeds coalescing) |
| if (result == 0) { |
| left->fHasDuplicate = TRUE; |
| @@ -1031,20 +1452,25 @@ |
| // Add node 0, used as the NULL pointer/sentinel. |
| nodes.addElement((int32_t)0, status); |
| |
| + Hashtable *values = NULL; // Index of (unique) values |
| + if (dict.fValued) { |
| + values = new Hashtable(status); |
| + } |
| + |
| // Start by creating the special empty node we use to indicate that the parent |
| // terminates a word. This must be node 1, because the builder assumes |
| - // that. |
| + // that. This node will never be used for tries storing numerical values. |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status); |
| + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status); |
| if (terminal == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| |
| // This call does all the work of building the new trie structure. The root |
| - // will be node 2. |
| - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status); |
| + // will have node ID 2 before writing to memory. |
| + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values); |
| #ifdef DEBUG_TRIE_DICT |
| (void) ::times(&timing); |
| fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", |
| @@ -1077,21 +1503,37 @@ |
| return NULL; |
| } |
| |
| + //map terminal value nodes |
| + int valueCount = 0; |
| + UVector valueNodes(status); |
| + if(values != NULL) { |
| + valueCount = values->count(); //number of unique terminal value nodes |
| + } |
| + |
| + // map non-terminal nodes |
| + int valuePos = 1;//, nodePos = valueCount + valuePos; |
| + nodeCount = valueCount + valuePos; |
| for (i = 1; i < count; ++i) { |
| node = (BuildCompactTrieNode *)nodes[i]; |
| if (node->fNodeID == i) { |
| // Only one node out of each duplicate set is used |
| - if (i >= translate.size()) { |
| + if (node->fNodeID >= translate.size()) { |
| // Logically extend the mapping table |
| - translate.setSize(i+1); |
| + translate.setSize(i + 1); |
| + } |
| + // note the argument order: UVector::setElementAt(value, index) |
| + if(node->fNodeType == kValueType) { |
| + valueNodes.addElement(node, status); |
| + translate.setElementAt(valuePos++, i); |
| + } else { |
| + translate.setElementAt(nodeCount++, i); |
| } |
| - translate.setElementAt(nodeCount++, i); |
| totalSize += node->size(); |
| } |
| } |
| - |
| - // Check for overflowing 16 bits worth of nodes. |
| - if (nodeCount > 0x10000) { |
| + |
| + // Check for overflowing 20 bits worth of nodes. |
| + if (nodeCount > 0x100000) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| @@ -1111,9 +1553,14 @@ |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| - |
| + |
| CompactTrieHeader *header = (CompactTrieHeader *)bytes; |
| - header->size = totalSize; |
| + //header->size = totalSize; |
| + if(dict.fValued){ |
| + header->magic = COMPACT_TRIE_MAGIC_3; |
| + } else { |
| + header->magic = COMPACT_TRIE_MAGIC_2; |
| + } |
| header->nodeCount = nodeCount; |
| header->offsets[0] = 0; // Sentinel |
| header->root = translate.elementAti(root->fNodeID); |
| @@ -1123,23 +1570,40 @@ |
| } |
| #endif |
| uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t)); |
| - nodeCount = 1; |
| + nodeCount = valueCount + 1; |
| + |
| + // Write terminal value nodes to memory |
| + for (i=0; i < valueNodes.size(); i++) { |
| + //header->offsets[i + 1] = offset; |
| + uint32_t tmpOffset = 0; |
| + node = (BuildCompactTrieNode *) valueNodes.elementAt(i); |
| + //header->offsets[i + 1] = (uint32_t)node->fValue; |
| + node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); |
| + } |
| + |
| // Now write the data |
| for (i = 1; i < count; ++i) { |
| node = (BuildCompactTrieNode *)nodes[i]; |
| - if (node->fNodeID == i) { |
| + if (node->fNodeID == i && node->fNodeType != kValueType) { |
| header->offsets[nodeCount++] = offset; |
| node->write(bytes, offset, translate); |
| } |
| } |
| + |
| + // shrink the buffer to the space actually used; uprv_realloc may move the block |
| + bytes = (uint8_t *)uprv_realloc(bytes, offset); |
| + header = (CompactTrieHeader *)bytes; |
| + header->size = offset; |
| + |
| #ifdef DEBUG_TRIE_DICT |
| + fprintf(stdout, "Space freed: %d\n", totalSize-offset); |
| + |
| (void) ::times(&timing); |
| fprintf(stderr, "Trie built, time user %f system %f\n", |
| (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, |
| (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); |
| previous = timing; |
| fprintf(stderr, "Final offset is %d\n", offset); |
| - |
| + |
| // Collect statistics on node types and sizes |
| int hCount = 0; |
| int vCount = 0; |
| @@ -1148,68 +1612,85 @@ |
| size_t hItemCount = 0; |
| size_t vItemCount = 0; |
| uint32_t previousOff = offset; |
| - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| + uint32_t numOverflow = 0; |
| + uint32_t valueSpace = 0; |
| + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { |
| const CompactTrieNode *node = getCompactNode(header, nodeIdx); |
| - if (node->flagscount & kVerticalNode) { |
| + int itemCount; |
| + if(nodeIdx == header->root) |
| + itemCount = node->flagscount & kRootCountMask; |
| + else |
| + itemCount = getCount(node); |
| + if(node->flagscount & kEqualOverflows){ |
| + numOverflow++; |
| + } |
| + if (node->flagscount & kVerticalNode && nodeIdx != header->root) { |
| vCount += 1; |
| - vItemCount += (node->flagscount & kCountMask); |
| + vItemCount += itemCount; |
| vSize += previousOff-header->offsets[nodeIdx]; |
| } |
| else { |
| hCount += 1; |
| - hItemCount += (node->flagscount & kCountMask); |
| - hSize += previousOff-header->offsets[nodeIdx]; |
| + hItemCount += itemCount; |
| + if(nodeIdx >= header->root) { |
| + hSize += previousOff-header->offsets[nodeIdx]; |
| + } |
| } |
| + |
| + if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord) |
| + valueSpace += sizeof(uint16_t); |
| previousOff = header->offsets[nodeIdx]; |
| } |
| fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount, |
| (double)hSize/hCount, (double)hItemCount/hCount); |
| fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount, |
| (double)vSize/vCount, (double)vItemCount/vCount); |
| + fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow); |
| + fprintf(stderr, "Space taken up by values: %d \n", valueSpace); |
| #endif |
| |
| if (U_FAILURE(status)) { |
| uprv_free(bytes); |
| header = NULL; |
| } |
| - else { |
| - header->magic = COMPACT_TRIE_MAGIC_1; |
| - } |
| return header; |
| } |
| |
| // Forward declaration |
| static TernaryNode * |
| -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ); |
| - |
| +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ); |
| |
| // Convert a horizontal node (or subarray thereof) into a ternary subtrie |
| static TernaryNode * |
| -unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array, |
| - int low, int high, UErrorCode &status ) { |
| +unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode, |
| + int low, int high, int nodeCount, UErrorCode &status) { |
| if (U_FAILURE(status) || low > high) { |
| return NULL; |
| } |
| int middle = (low+high)/2; |
| - TernaryNode *result = new TernaryNode(array[middle].ch); |
| + TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); |
| if (result == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| - const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); |
| + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount)); |
| if (equal->flagscount & kParentEndsWord) { |
| - result->flags |= kEndsWord; |
| + if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| + result->flags = getValue(equal); |
| + }else{ |
| + result->flags |= kEndsWord; |
| + } |
| } |
| - result->low = unpackHorizontalArray(header, array, low, middle-1, status); |
| - result->high = unpackHorizontalArray(header, array, middle+1, high, status); |
| - result->equal = unpackOneNode(header, equal, status); |
| + result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status); |
| + result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status); |
| + result->equal = unpackOneNode(info, equal, status); |
| return result; |
| } |
| |
| // Convert one compact trie node into a ternary subtrie |
| static TernaryNode * |
| -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) { |
| - int nodeCount = (node->flagscount & kCountMask); |
| +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) { |
| + int nodeCount = getCount(node); |
| if (nodeCount == 0 || U_FAILURE(status)) { |
| // Failure, or terminal node |
| return NULL; |
| @@ -1234,29 +1715,41 @@ |
| previous = latest; |
| } |
| if (latest != NULL) { |
| - const CompactTrieNode *equal = getCompactNode(header, vnode->equal); |
| + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode)); |
| if (equal->flagscount & kParentEndsWord) { |
| - latest->flags |= kEndsWord; |
| + if(info->magic == COMPACT_TRIE_MAGIC_3){ |
| + latest->flags = getValue(equal); |
| + } else { |
| + latest->flags |= kEndsWord; |
| + } |
| } |
| - latest->equal = unpackOneNode(header, equal, status); |
| + latest->equal = unpackOneNode(info, equal, status); |
| } |
| return head; |
| } |
| else { |
| // Horizontal node |
| const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; |
| - return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status); |
| + return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status); |
| } |
| } |
| |
| +// returns a MutableTrieDictionary generated from the CompactTrieDictionary |
| MutableTrieDictionary * |
| CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { |
| - MutableTrieDictionary *result = new MutableTrieDictionary( status ); |
| + MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 ); |
| if (result == NULL) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| - TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status); |
| + // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly |
| + // because only kEqualOverflows flag should be checked in root's flagscount |
| + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *) |
| + getCompactNode(fInfo, fInfo->root); |
| + uint16_t nodeCount = hnode->flagscount & kRootCountMask; |
| + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, |
| + nodeCount, status); |
| + |
| if (U_FAILURE(status)) { |
| delete root; // Clean up |
| delete result; |
| @@ -1270,8 +1763,8 @@ |
| |
| U_CAPI int32_t U_EXPORT2 |
| triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, |
| - UErrorCode *status) { |
| - |
| + UErrorCode *status) { |
| + |
| if (status == NULL || U_FAILURE(*status)) { |
| return 0; |
| } |
| @@ -1286,14 +1779,14 @@ |
| // |
| const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); |
| if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ |
| - pInfo->dataFormat[1]==0x72 && |
| - pInfo->dataFormat[2]==0x44 && |
| - pInfo->dataFormat[3]==0x63 && |
| - pInfo->formatVersion[0]==1 )) { |
| + pInfo->dataFormat[1]==0x72 && |
| + pInfo->dataFormat[2]==0x44 && |
| + pInfo->dataFormat[3]==0x63 && |
| + pInfo->formatVersion[0]==1 )) { |
| udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", |
| - pInfo->dataFormat[0], pInfo->dataFormat[1], |
| - pInfo->dataFormat[2], pInfo->dataFormat[3], |
| - pInfo->formatVersion[0]); |
| + pInfo->dataFormat[0], pInfo->dataFormat[1], |
| + pInfo->dataFormat[2], pInfo->dataFormat[3], |
| + pInfo->formatVersion[0]); |
| *status=U_UNSUPPORTED_ERROR; |
| return 0; |
| } |
| @@ -1311,8 +1804,10 @@ |
| // |
| const uint8_t *inBytes =(const uint8_t *)inData+headerSize; |
| const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; |
| - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 |
| - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) |
| + uint32_t magic = ds->readUInt32(header->magic); |
| + if ((magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3) |
| + || (magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)) |
| + || (magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))) |
| { |
| udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"); |
| *status=U_UNSUPPORTED_ERROR; |
| @@ -1333,10 +1828,10 @@ |
| // |
| if (length < sizeWithUData) { |
| udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", |
| - totalSize); |
| + totalSize); |
| *status=U_INDEX_OUTOFBOUNDS_ERROR; |
| return 0; |
| - } |
| + } |
| |
| // |
| // Swap the Data. Do the data itself first, then the CompactTrieHeader, because |
| @@ -1355,20 +1850,38 @@ |
| } |
| |
| // We need to loop through all the nodes in the offset table, and swap each one. |
| - uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
| + uint32_t nodeCount, rootId; |
| + if(header->magic == COMPACT_TRIE_MAGIC_1) { |
| + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); |
| + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); |
| + } else { |
| + nodeCount = ds->readUInt32(header->nodeCount); |
| + rootId = ds->readUInt32(header->root); |
| + } |
| + |
| // Skip node 0, which should always be 0. |
| - for (int i = 1; i < nodeCount; ++i) { |
| + for (uint32_t i = 1; i < nodeCount; ++i) { |
| uint32_t nodeOff = ds->readUInt32(header->offsets[i]); |
| const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff); |
| CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); |
| uint16_t flagscount = ds->readUInt16(inNode->flagscount); |
| - uint16_t itemCount = flagscount & kCountMask; |
| + uint16_t itemCount = getCount(inNode); |
| + //uint16_t itemCount = flagscount & kCountMask; |
| ds->writeUInt16(&outNode->flagscount, flagscount); |
| if (itemCount > 0) { |
| - if (flagscount & kVerticalNode) { |
| + uint16_t overflow = 0; //number of extra uint16_t units that need to be swapped |
| + if (flagscount & kVerticalNode && i != rootId) { |
| + if(flagscount & kEqualOverflows){ |
| + // include overflow bits |
| + overflow += 1; |
| + } |
| + if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) { |
| + //include values |
| + overflow += 1; |
| + } |
| ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), |
| - itemCount*sizeof(uint16_t), |
| - outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); |
| + (itemCount + overflow)*sizeof(uint16_t), |
| + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); |
| uint16_t equal = ds->readUInt16(*(uint16_t *)(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal))); |
| ds->writeUInt16((uint16_t *)(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal)), equal); |
| } |
| @@ -1381,26 +1894,62 @@ |
| word = ds->readUInt16(inHNode->entries[j].equal); |
| ds->writeUInt16(&outHNode->entries[j].equal, word); |
| } |
| + |
| + // swap overflow/value information |
| + if(flagscount & kEqualOverflows){ |
| + overflow += (itemCount + 3) / 4; |
| + } |
| + |
| + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) { |
| + //include values |
| + overflow += 1; |
| + } |
| + |
| + uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount]; |
| + uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount]; |
| + for(int j = 0; j<overflow; j++){ |
| + uint16_t extraInfo = ds->readUInt16(*inOverflow); |
| + ds->writeUInt16(outOverflow, extraInfo); |
| + |
| + inOverflow++; |
| + outOverflow++; |
| + } |
| } |
| } |
| } |
| #endif |
| |
| - // All the data in all the nodes consist of 16 bit items. Swap them all at once. |
| - uint16_t nodeCount = ds->readUInt16(header->nodeCount); |
| - uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t)); |
| - ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); |
| - |
| // Swap the header |
| ds->writeUInt32(&outputHeader->size, totalSize); |
| - uint32_t magic = ds->readUInt32(header->magic); |
| ds->writeUInt32(&outputHeader->magic, magic); |
| - ds->writeUInt16(&outputHeader->nodeCount, nodeCount); |
| - uint16_t root = ds->readUInt16(header->root); |
| - ds->writeUInt16(&outputHeader->root, root); |
| - ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), |
| - sizeof(uint32_t)*(int32_t)nodeCount, |
| - outBytes+offsetof(CompactTrieHeader,offsets), status); |
| + |
| + uint32_t nodeCount; |
| + uint32_t offsetPos; |
| + if (header->magic == COMPACT_TRIE_MAGIC_1) { |
| + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; |
| + CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader; |
| + |
| + nodeCount = ds->readUInt16(headerV1->nodeCount); |
| + ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); |
| + uint16_t root = ds->readUInt16(headerV1->root); |
| + ds->writeUInt16(&outputHeaderV1->root, root); |
| + offsetPos = offsetof(CompactTrieHeaderV1,offsets); |
| + } else { |
| + nodeCount = ds->readUInt32(header->nodeCount); |
| + ds->writeUInt32(&outputHeader->nodeCount, nodeCount); |
| + uint32_t root = ds->readUInt32(header->root); |
| + ds->writeUInt32(&outputHeader->root, root); |
| + offsetPos = offsetof(CompactTrieHeader,offsets); |
| + } |
| + |
| + // All the data in all the nodes consist of 16 bit items. Swap them all at once. |
| + uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); |
| + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); |
| + |
| + //swap offsets |
| + ds->swapArray32(ds, inBytes+offsetPos, |
| + sizeof(uint32_t)*(uint32_t)nodeCount, |
| + outBytes+offsetPos, status); |
| |
| return sizeWithUData; |
| } |
| --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 |
| +++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800 |
| @@ -47,7 +47,6 @@ |
| U_NAMESPACE_BEGIN |
| |
| class StringEnumeration; |
| -struct CompactTrieHeader; |
| |
| /******************************************************************* |
| * TrieWordDictionary |
| @@ -72,23 +71,29 @@ |
| */ |
| virtual ~TrieWordDictionary(); |
| |
| + /** |
| + * <p>Returns true if the dictionary contains values associated with each word.</p> |
| + */ |
| + virtual UBool getValued() const = 0; |
| + |
| /** |
| * <p>Find dictionary words that match the text.</p> |
| * |
| * @param text A UText representing the text. The |
| * iterator is left after the longest prefix match in the dictionary. |
| - * @param start The current position in text. |
| * @param maxLength The maximum number of code units to match. |
| * @param lengths An array that is filled with the lengths of words that matched. |
| * @param count Filled with the number of elements output in lengths. |
| * @param limit The size of the lengths array; this limits the number of words output. |
| + * @param values An array that is filled with the values associated with the matched words. |
| * @return The number of characters in text that were matched. |
| */ |
| virtual int32_t matches( UText *text, |
| int32_t maxLength, |
| int32_t *lengths, |
| int &count, |
| - int limit ) const = 0; |
| + int limit, |
| + uint16_t *values = NULL) const = 0; |
| |
| /** |
| * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> |
| @@ -128,6 +133,12 @@ |
| |
| UText *fIter; |
| |
| + /** |
| + * True if the dictionary stores a value associated with each word |
| + * @internal |
| + */ |
| + UBool fValued; |
| + |
| friend class CompactTrieDictionary; // For fast conversion |
| |
| public: |
| @@ -138,14 +149,29 @@ |
| * @param median A UChar around which to balance the trie. Ideally, it should |
| * begin at least one word that is near the median of the set in the dictionary |
| * @param status A status code recording the success of the call. |
| + * @param containsValue True if the dictionary stores values associated with each word. |
| */ |
| - MutableTrieDictionary( UChar median, UErrorCode &status ); |
| + MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE ); |
| |
| /** |
| * <p>Virtual destructor.</p> |
| */ |
| virtual ~MutableTrieDictionary(); |
| |
| + /** |
| + * Indicate whether the MutableTrieDictionary stores values associated with each word |
| + */ |
| + void setValued(UBool valued){ |
| + fValued = valued; |
| + } |
| + |
| + /** |
| + * <p>Returns true if the dictionary contains values associated with each word.</p> |
| + */ |
| + virtual UBool getValued() const { |
| + return fValued; |
| + } |
| + |
| /** |
| * <p>Find dictionary words that match the text.</p> |
| * |
| @@ -155,13 +181,15 @@ |
| * @param lengths An array that is filled with the lengths of words that matched. |
| * @param count Filled with the number of elements output in lengths. |
| * @param limit The size of the lengths array; this limits the number of words output. |
| + * @param values An array that is filled with the values associated with the matched words. |
| * @return The number of characters in text that were matched. |
| */ |
| virtual int32_t matches( UText *text, |
| int32_t maxLength, |
| int32_t *lengths, |
| int &count, |
| - int limit ) const; |
| + int limit, |
| + uint16_t *values = NULL) const; |
| |
| /** |
| * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> |
| @@ -173,15 +201,17 @@ |
| virtual StringEnumeration *openWords( UErrorCode &status ) const; |
| |
| /** |
| - * <p>Add one word to the dictionary.</p> |
| + * <p>Add one word to the dictionary with an optional associated value.</p> |
| * |
| * @param word A UChar buffer containing the word. |
| * @param length The length of the word. |
| - * @param status The resultant status |
| + * @param status The resultant status. |
| + * @param value The nonzero value associated with this word. |
| */ |
| virtual void addWord( const UChar *word, |
| int32_t length, |
| - UErrorCode &status); |
| + UErrorCode &status, |
| + uint16_t value = 0); |
| |
| #if 0 |
| /** |
| @@ -203,8 +233,9 @@ |
| * @param lengths An array that is filled with the lengths of words that matched. |
| * @param count Filled with the number of elements output in lengths. |
| * @param limit The size of the lengths array; this limits the number of words output. |
| - * @param parent The parent of the current node |
| - * @param pMatched The returned parent node matched the input |
| + * @param parent The parent of the current node. |
| + * @param pMatched The returned parent node matched the input. |
| + * @param values An array that is filled with the values associated with the matched words. |
| * @return The number of characters in text that were matched. |
| */ |
| virtual int32_t search( UText *text, |
| @@ -213,40 +244,46 @@ |
| int &count, |
| int limit, |
| TernaryNode *&parent, |
| - UBool &pMatched ) const; |
| + UBool &pMatched, |
| + uint16_t *values = NULL) const; |
| |
| private: |
| /** |
| * <p>Private constructor. The root node it not allocated.</p> |
| * |
| * @param status A status code recording the success of the call. |
| + * @param containsValues True if the dictionary will store a value associated |
| + * with each word added. |
| */ |
| - MutableTrieDictionary( UErrorCode &status ); |
| + MutableTrieDictionary( UErrorCode &status, UBool containsValues = FALSE ); |
| }; |
| |
| /******************************************************************* |
| * CompactTrieDictionary |
| */ |
| |
| +//forward declarations |
| +struct CompactTrieHeader; |
| +struct CompactTrieInfo; |
| + |
| /** |
| * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted |
| * to save space.</p> |
| */ |
| class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary { |
| private: |
| - /** |
| - * The root node of the trie |
| - */ |
| + /** |
| + * The header of the CompactTrieDictionary, which contains all the trie information |
| + */ |
| |
| - const CompactTrieHeader *fData; |
| - |
| - /** |
| - * A UBool indicating whether or not we own the fData. |
| - */ |
| + CompactTrieInfo *fInfo; |
| |
| + /** |
| + * A UBool indicating whether or not we own the compact trie data. |
| + */ |
| UBool fOwnData; |
| |
| - UDataMemory *fUData; |
| + UDataMemory *fUData; |
| public: |
| /** |
| * <p>Construct a dictionary from a UDataMemory.</p> |
| @@ -277,6 +314,11 @@ |
| */ |
| virtual ~CompactTrieDictionary(); |
| |
| + /** |
| + * <p>Returns true if the dictionary contains values associated with each word.</p> |
| + */ |
| + virtual UBool getValued() const; |
| + |
| /** |
| * <p>Find dictionary words that match the text.</p> |
| * |
| @@ -286,13 +328,15 @@ |
| * @param lengths An array that is filled with the lengths of words that matched. |
| * @param count Filled with the number of elements output in lengths. |
| * @param limit The size of the lengths array; this limits the number of words output. |
| + * @param values An array that is filled with the values associated with the matched words. |
| * @return The number of characters in text that were matched. |
| */ |
| virtual int32_t matches( UText *text, |
| - int32_t rangeEnd, |
| + int32_t maxLength, |
| int32_t *lengths, |
| int &count, |
| - int limit ) const; |
| + int limit, |
| + uint16_t *values = NULL) const; |
| |
| /** |
| * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> |
| @@ -311,7 +355,7 @@ |
| virtual uint32_t dataSize() const; |
| |
| /** |
| - * <p>Return a void * pointer to the compact data, platform-endian.</p> |
| + * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p> |
| * |
| * @return The data for the compact dictionary, suitable for passing to the |
| * constructor. |
| @@ -342,5 +386,5 @@ |
| |
| U_NAMESPACE_END |
| |
| - /* TRIEDICT_H */ |
| +/* TRIEDICT_H */ |
| #endif |
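| The triedict.h hunks above add an optional per-word value to the dictionary API |
| (the value parameter on addWord() and the values out-array on matches()). The sketch |
| below is not part of the patch; it only illustrates how those new parameters might be |
| used. The include lines, the 0x4E00 median, the sample word, and the value 100 are |
| illustrative assumptions, and an ICU-internal compilation unit is assumed. |
| |
| #include "unicode/utext.h" |
| #include "triedict.h" // ICU-internal header patched above |
| U_NAMESPACE_USE |
| |
| // Hypothetical helper (not in the patch): store one valued word and read it back. |
| static UBool checkValuedLookup(UErrorCode &status) { |
|     MutableTrieDictionary mutableDict(0x4E00, status, TRUE);  // TRUE: store values |
|     static const UChar word[] = {0x65E5, 0x672C, 0x8A9E};     // illustrative 3-unit word |
|     mutableDict.addWord(word, 3, status, 100);                 // associated value = 100 |
|     CompactTrieDictionary compactDict(mutableDict, status);    // compact form keeps values |
|     if (U_FAILURE(status)) { return FALSE; } |
| |
|     UText *ut = utext_openUChars(NULL, word, 3, &status); |
|     int32_t lengths[3]; |
|     uint16_t values[3]; |
|     int count = 0; |
|     compactDict.matches(ut, 3, lengths, count, 3, values);     // values filled in parallel |
|     utext_close(ut); |
|     return count > 0 && values[count - 1] == 100; |
| } |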
| --- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700 |
| +++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800 |
| @@ -509,8 +520,9 @@ |
| #################################################### CTD |
| # CTD FILES |
| |
| -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) |
| - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< |
| +# The .ctd file is now generated regardless of whether the dictionary source file exists |
| +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) |
| + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt |
| |
| #################################################### CFU |
| # CFU FILES |
| --- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700 |
| +++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800 |
| @@ -17,5 +17,8 @@ |
| } |
| dictionaries{ |
| Thai:process(dependency){"thaidict.ctd"} |
| + Hani:process(dependency){"cjdict.ctd"} |
| + Hira:process(dependency){"cjdict.ctd"} |
| + Kata:process(dependency){"cjdict.ctd"} |
| } |
| } |
| --- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800 |
| +++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800 |
| @@ -25,6 +25,9 @@ |
| </icu:boundaries> |
| <icu:dictionaries> |
| <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> |
| + <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> |
| + <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> |
| + <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> |
| </icu:dictionaries> |
| </icu:breakIteratorData> |
| </special> |
| --- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700 |
| +++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800 |
| @@ -2188,21 +2188,21 @@ |
| |
| |
| { |
| - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); |
| + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); |
| const UChar *got = NULL, *exp=NULL; |
| int32_t gotLen = 0, expLen=0; |
| - ja = ures_getByKey(ja, "boundaries", ja, &status); |
| - exp = tres_getString(ja, -1, "word", &expLen, &status); |
| + th = ures_getByKey(th, "boundaries", th, &status); |
| + exp = tres_getString(th, -1, "grapheme", &expLen, &status); |
| |
| tb = ures_getByKey(aliasB, "boundaries", tb, &status); |
| - got = tres_getString(tb, -1, "word", &gotLen, &status); |
| + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); |
| |
| if(U_FAILURE(status)) { |
| log_err("%s trying to read str boundaries\n", u_errorName(status)); |
| } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { |
| log_err("Referencing alias didn't get the right data\n"); |
| } |
| - ures_close(ja); |
| + ures_close(th); |
| status = U_ZERO_ERROR; |
| } |
| /* simple alias */ |
| --- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700 |
| +++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800 |
| @@ -156,9 +156,13 @@ |
| if(*a!=*b){ |
| errln("Failed: boilerplate method operator!= does not return correct results"); |
| } |
| - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); |
| - if(a && c){ |
| - if(*c==*a){ |
| + // With dictionary-based word breaking, the Japanese word break iterator |
| + // is identical to root, but the Thai character break iterator is still |
| + // different from root. |
| + BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status); |
| + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status); |
| + if(c && d){ |
| + if(*c==*d){ |
| errln("Failed: boilerplate method opertator== does not return correct results"); |
| } |
| }else{ |
| @@ -167,6 +171,7 @@ |
| delete a; |
| delete b; |
| delete c; |
| + delete d; |
| } |
| |
| void RBBIAPITest::TestgetRules() |
| @@ -635,21 +640,21 @@ |
| // |
| void RBBIAPITest::TestRuleStatus() { |
| UChar str[30]; |
| - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094", |
| - // 012345678901234567 8 9 0 1 2 3 4 5 6 |
| - // Ideographic Katakana Hiragana |
| + // No longer test Han or Hiragana breaking here: ruleStatusVec would return nothing. |
| + // Changed UBRK_WORD_KANA to UBRK_WORD_IDEO. |
| + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", |
| + // 012345678901234567 8 9 0 |
| + // Katakana |
| str, 30); |
| UnicodeString testString1(str); |
| - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; |
| + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; |
| int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, |
| UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, |
| - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, |
| - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; |
| + UBRK_WORD_IDEO, UBRK_WORD_NONE}; |
| |
| int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, |
| UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, |
| - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, |
| - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; |
| + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; |
| |
| UErrorCode status=U_ZERO_ERROR; |
| |
| @@ -888,9 +893,11 @@ |
| |
| URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status); |
| { |
| +#if 0 // With dictionary-based word breaking, ja_word is identical to root. |
| if (ja_word && *ja_word == *root_word) { |
| errln("japan not different from root"); |
| } |
| +#endif |
| } |
| |
| { |
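| As context for the revised TestRuleStatus expectations above, the snippet below shows |
| roughly how the UBRK_WORD_* rule-status tags are queried through the public C API. It is |
| not part of the patch; the sample text, buffer size, and root locale are assumptions, and |
| the behavior noted in the comments is what the revised test expects. |
| |
| #include "unicode/ubrk.h" |
| #include "unicode/ustring.h" |
| |
| // Illustrative only: does the Katakana run report a tag in the ideographic range? |
| static UBool katakanaGetsIdeoTag(void) { |
|     UErrorCode status = U_ZERO_ERROR; |
|     UChar text[16]; |
|     int32_t len = u_unescape("123.45 \\u30a1\\u30a2", text, 16); |
|     UBreakIterator *bi = ubrk_open(UBRK_WORD, "", text, len, &status); |
|     UBool sawIdeo = FALSE; |
|     if (U_FAILURE(status)) { return FALSE; } |
|     for (int32_t pos = ubrk_next(bi); pos != UBRK_DONE; pos = ubrk_next(bi)) { |
|         int32_t tag = ubrk_getRuleStatus(bi); /* rule status for this boundary */ |
|         if (tag >= UBRK_WORD_IDEO && tag < UBRK_WORD_IDEO_LIMIT) { |
|             sawIdeo = TRUE; /* per the revised test, Katakana now falls in the IDEO range */ |
|         } |
|     } |
|     ubrk_close(bi); |
|     return sawIdeo; |
| } |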
| --- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700 |
| +++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800 |
| @@ -35,6 +35,8 @@ |
| #include <string.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| +#include "unicode/numfmt.h" |
| +#include "unicode/uscript.h" |
| |
| #define TEST_ASSERT(x) {if (!(x)) { \ |
| errln("Failure in file %s, line %d", __FILE__, __LINE__);}} |
| @@ -138,11 +140,13 @@ |
| if (exec) TestThaiBreaks(); break; |
| case 23: name = "TestTailoredBreaks"; |
| if (exec) TestTailoredBreaks(); break; |
| + case 24: name = "TestTrieDictWithValue"; |
| + if(exec) TestTrieDictWithValue(); break; |
| #else |
| - case 21: case 22: case 23: name = "skip"; |
| + case 21: case 22: case 23: case 24: name = "skip"; |
| break; |
| #endif |
| - case 24: name = "TestDictRules"; |
| + case 25: name = "TestDictRules"; |
| if (exec) TestDictRules(); break; |
| case 25: name = "TestBug5532"; |
| if (exec) TestBug5532(); break; |
| @@ -607,6 +611,8 @@ |
| |
| |
| void RBBITest::TestJapaneseWordBreak() { |
| +// TODO: Rewrite this test for dictionary-based word breaking. |
| +#if 0 |
| UErrorCode status = U_ZERO_ERROR; |
| BITestData japaneseWordSelection(status); |
| |
| @@ -628,6 +634,7 @@ |
| |
| generalIteratorTest(*e, japaneseWordSelection); |
| delete e; |
| +#endif |
| } |
| |
| void RBBITest::TestTrieDict() { |
| @@ -849,6 +856,372 @@ |
| delete compact2; |
| } |
| |
| +/* Debugging helper: dumps a StringEnumeration to a UTF-8 file. TODO: delete later */ |
| +inline void writeEnumerationToFile(StringEnumeration *enumer, const char *filename){ |
| + UErrorCode status = U_ZERO_ERROR; |
| + FILE *outfile = fopen(filename,"w"); |
| + UConverter *cvt = ucnv_open("UTF-8", &status); |
| + if (U_FAILURE(status)) |
| + return; |
| + if(outfile != NULL){ |
| + status = U_ZERO_ERROR; |
| + const UnicodeString *word = enumer->snext(status); |
| + while (word != NULL && U_SUCCESS(status)) { |
| + char u8word[500]; |
| + status = U_ZERO_ERROR; |
| + ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(), |
| + &status); |
| + fprintf(outfile,"%s\n", u8word); |
| + status = U_ZERO_ERROR; |
| + word = enumer->snext(status); |
| + } |
| + fclose(outfile); |
| + } |
| + ucnv_close(cvt); |
| +} |
| + |
| +// A very simple helper class to streamline the buffer handling in |
| +// TestTrieDictWithValue |
| +template<class T, size_t N> |
| +class AutoBuffer { |
| + public: |
| + AutoBuffer(size_t size) : buffer(stackBuffer) { |
| + if (size > N) |
| + buffer = new T[size]; |
| + } |
| + ~AutoBuffer() { |
| + if (buffer != stackBuffer) |
| + delete [] buffer; |
| + } |
| + T* elems() { |
| + return buffer; |
| + } |
| + const T& operator[] (size_t i) const { |
| + return buffer[i]; |
| + } |
| + T& operator[] (size_t i) { |
| + return buffer[i]; |
| + } |
| + private: |
| + T stackBuffer[N]; |
| + T* buffer; |
| + AutoBuffer(); |
| +}; |
| + |
| +//---------------------------------------------------------------------------- |
| +// |
| +// TestTrieDictWithValue Test trie dictionaries with logprob values and |
| +// more than 2^16 nodes after compaction. |
| +// |
| +//---------------------------------------------------------------------------- |
| +void RBBITest::TestTrieDictWithValue() { |
| + UErrorCode status = U_ZERO_ERROR; |
| + |
| + // |
| + // Open and read the test data file. |
| + // |
| + const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| + const char *filename = "cjdict-truncated.txt"; |
| + char testFileName[1000]; |
| + if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) { |
| + errln("Can't open test data. Path too long."); |
| + return; |
| + } |
| + strcpy(testFileName, testDataDirectory); |
| + strcat(testFileName, filename); |
| + |
| + // Items needing deleting at the end |
| + MutableTrieDictionary *mutableDict = NULL; |
| + CompactTrieDictionary *compactDict = NULL; |
| + UnicodeSet *breaks = NULL; |
| + UChar *testFile = NULL; |
| + StringEnumeration *enumer1 = NULL; |
| + StringEnumeration *enumer2 = NULL; |
| + MutableTrieDictionary *mutable2 = NULL; |
| + StringEnumeration *cloneEnum = NULL; |
| + CompactTrieDictionary *compact2 = NULL; |
| + NumberFormat *nf = NULL; |
| + UText *originalText = NULL, *cloneText = NULL; |
| + |
| + const UnicodeString *originalWord = NULL; |
| + const UnicodeString *cloneWord = NULL; |
| + UChar *current; |
| + UChar *word; |
| + UChar uc; |
| + int32_t wordLen; |
| + int32_t wordCount; |
| + int32_t testCount; |
| + int32_t valueLen; |
| + int counter = 0; |
| + |
| + int len; |
| + testFile = ReadAndConvertFile(testFileName, len, NULL, status); |
| + if (U_FAILURE(status)) { |
| + goto cleanup; /* something went wrong, error already output */ |
| + } |
| + |
| + mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); |
| + if (U_FAILURE(status)) { |
| + errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + breaks = new UnicodeSet; |
| + breaks->add(0x000A); // Line Feed |
| + breaks->add(0x000D); // Carriage Return |
| + breaks->add(0x2028); // Line Separator |
| + breaks->add(0x2029); // Paragraph Separator |
| + breaks->add(0x0009); // Tab character |
| + |
| + // Now add each non-comment line of the file as a word. |
| + current = testFile; |
| + word = current; |
| + uc = *current++; |
| + wordLen = 0; |
| + wordCount = 0; |
| + nf = NumberFormat::createInstance(status); |
| + |
| + while (uc) { |
| + UnicodeString ucharValue; |
| + valueLen = 0; |
| + |
| + if (uc == 0x0023) { // #comment line, skip |
| + while (uc && !breaks->contains(uc)) { |
| + uc = *current++; |
| + } |
| + } |
| + else{ |
| + while (uc && !breaks->contains(uc)) { |
| + ++wordLen; |
| + uc = *current++; |
| + } |
| + if(uc == 0x0009){ //separator is a tab char, read in num after tab |
| + uc = *current++; |
| + while (uc && !breaks->contains(uc)) { |
| + ucharValue.append(uc); |
| + uc = *current++; |
| + } |
| + } |
| + } |
| + if (wordLen > 0) { |
| + Formattable value((int32_t)0); |
| + nf->parse(ucharValue.getTerminatedBuffer(), value, status); |
| + |
| + if(U_FAILURE(status)){ |
| + errln("parsing of value failed when reading in dictionary\n"); |
| + goto cleanup; |
| + } |
| + mutableDict->addWord(word, wordLen, status, value.getLong()); |
| + if (U_FAILURE(status)) { |
| + errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + wordCount += 1; |
| + } |
| + |
| + // Find beginning of next line |
| + while (uc && breaks->contains(uc)) { |
| + uc = *current++; |
| + } |
| + word = current-1; |
| + wordLen = 0; |
| + } |
| + |
| + if (wordCount < 50) { |
| + errln("Word count (%d) unreasonably small\n", wordCount); |
| + goto cleanup; |
| + } |
| + |
| + enumer1 = mutableDict->openWords(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + testCount = 0; |
| + if (wordCount != (testCount = enumer1->count(status))) { |
| + errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", |
| + testCount, wordCount, u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + // Now compact it |
| + compactDict = new CompactTrieDictionary(*mutableDict, status); |
| + if (U_FAILURE(status)) { |
| + errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + enumer2 = compactDict->openWords(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + |
| + // Debugging aid (disabled): dump both enumerations to files for comparison. |
| +// writeEnumerationToFile(enumer1, "mutable.txt"); |
| +// writeEnumerationToFile(enumer2, "compact.txt"); |
| + |
| + enumer1->reset(status); |
| + enumer2->reset(status); |
| + |
| + originalWord = enumer1->snext(status); |
| + cloneWord = enumer2->snext(status); |
| + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { |
| + if (*originalWord != *cloneWord) { |
| + errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", |
| + counter, originalWord->length(), cloneWord->length()); |
| + goto cleanup; |
| + } |
| + |
| + // check if attached values of the same word in both dictionaries tally |
| +#if 0 |
| + int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; |
| + uint16_t values1[originalWord->length()], values2[cloneWord->length()]; |
| +#endif |
| + AutoBuffer<int32_t, 20> lengths1(originalWord->length()); |
| + AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); |
| + AutoBuffer<uint16_t, 20> values1(originalWord->length()); |
| + AutoBuffer<uint16_t, 20> values2(cloneWord->length()); |
| + |
| + originalText = utext_openConstUnicodeString(originalText, originalWord, &status); |
| + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); |
| + |
| + int count1, count2; |
| + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); |
| + compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); |
| + |
| + if(values1[count1-1] != values2[count2-1]){ |
| + errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n", |
| + counter, values1[count1-1], values2[count2-1]); |
| + goto cleanup; |
| + } |
| + |
| + counter++; |
| + originalWord = enumer1->snext(status); |
| + cloneWord = enumer2->snext(status); |
| + } |
| + if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { |
| + errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); |
| + } |
| + |
| + delete enumer1; |
| + enumer1 = NULL; |
| + delete enumer2; |
| + enumer2 = NULL; |
| + |
| + // Now un-compact it |
| + mutable2 = compactDict->cloneMutable(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + cloneEnum = mutable2->openWords(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + if (wordCount != (testCount = cloneEnum->count(status))) { |
| + errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", |
| + testCount, wordCount, u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + // Compare original dictionary to clone. Note that we can only compare the same kind of |
| + // dictionary as the order of the enumerators is not guaranteed to be the same between |
| + // different kinds |
| + enumer1 = mutableDict->openWords(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + counter = 0; |
| + originalWord = enumer1->snext(status); |
| + cloneWord = cloneEnum->snext(status); |
| + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { |
| + if (*originalWord != *cloneWord) { |
| + errln("Original and cloned MutableTrieDictionary word mismatch\n"); |
| + goto cleanup; |
| + } |
| + |
| + // check if attached values of the same word in both dictionaries tally |
| + AutoBuffer<int32_t, 20> lengths1(originalWord->length()); |
| + AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); |
| + AutoBuffer<uint16_t, 20> values1(originalWord->length()); |
| + AutoBuffer<uint16_t, 20> values2(cloneWord->length()); |
| + originalText = utext_openConstUnicodeString(originalText, originalWord, &status); |
| + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); |
| + |
| + int count1, count2; |
| + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); |
| + mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); |
| + |
| + if(values1[count1-1] != values2[count2-1]){ |
| + errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", |
| + counter, values1[count1-1], values2[count2-1]); |
| + goto cleanup; |
| + } |
| + |
| + counter++; |
| + |
| + originalWord = enumer1->snext(status); |
| + cloneWord = cloneEnum->snext(status); |
| + } |
| + |
| + if (U_FAILURE(status)) { |
| + errln("Enumeration failed: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + if (originalWord != cloneWord) { |
| + errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); |
| + goto cleanup; |
| + } |
| + |
| + // Test the data copying constructor for CompactTrieDict, and the data access APIs. |
| + compact2 = new CompactTrieDictionary(compactDict->data(), status); |
| + if (U_FAILURE(status)) { |
| + errln("CompactTrieDictionary(const void *,...) failed\n"); |
| + goto cleanup; |
| + } |
| + |
| + if (compact2->dataSize() == 0) { |
| + errln("CompactTrieDictionary->dataSize() == 0\n"); |
| + goto cleanup; |
| + } |
| + |
| + // Now count the words via the second dictionary |
| + delete enumer1; |
| + enumer1 = compact2->openWords(status); |
| + if (U_FAILURE(status)) { |
| + errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + if (wordCount != (testCount = enumer1->count(status))) { |
| + errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", |
| + testCount, wordCount, u_errorName(status)); |
| + goto cleanup; |
| + } |
| + |
| + cleanup: |
| + delete compactDict; |
| + delete mutableDict; |
| + delete breaks; |
| + delete[] testFile; |
| + delete enumer1; |
| + delete mutable2; |
| + delete cloneEnum; |
| + delete compact2; |
| + utext_close(originalText); |
| + utext_close(cloneText); |
| + |
| + |
| +} |
| |
| //---------------------------------------------------------------------------- |
| // |
| @@ -1870,8 +2243,15 @@ |
| // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). |
| static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" |
| "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; |
| +#if 0 |
| static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; |
| static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; |
| +#endif |
| +// There's no separate Japanese word break iterator. Root is the same as Japanese. |
| +// Our dictionary-based iterator has to be tweaked to better handle U+3005, |
| +// U+3007, U+300B and some other cases. |
| +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
| +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
| |
| // UBreakIteratorType UBRK_SENTENCE, Locale "el" |
| // Add break after Greek question mark (cldrbug #2069). |
| @@ -2672,6 +3052,8 @@ |
| UnicodeSet *fNewlineSet; |
| UnicodeSet *fKatakanaSet; |
| UnicodeSet *fALetterSet; |
| + // TODO(jungshik): Do we still need this change? |
| + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt |
| UnicodeSet *fMidNumLetSet; |
| UnicodeSet *fMidLetterSet; |
| UnicodeSet *fMidNumSet; |
| @@ -2680,6 +3062,7 @@ |
| UnicodeSet *fOtherSet; |
| UnicodeSet *fExtendSet; |
| UnicodeSet *fExtendNumLetSet; |
| + UnicodeSet *fDictionaryCjkSet; |
| |
| RegexMatcher *fMatcher; |
| |
| @@ -2696,12 +3079,24 @@ |
| fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); |
| fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); |
| fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); |
| - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); |
| + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); |
| + // Exclude Hangul syllables from ALetterSet during testing. |
| + // Leave CJK dictionary characters out from the monkey tests! |
| +#if 0 |
| + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" |
| + "[\\p{Line_Break = Complex_Context}" |
| + "-\\p{Grapheme_Cluster_Break = Extend}" |
| + "-\\p{Grapheme_Cluster_Break = Control}" |
| + "]]", |
| + status); |
| +#endif |
| + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); |
| + fALetterSet->removeAll(*fDictionaryCjkSet); |
| fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); |
| fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); |
| fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); |
| fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); |
| - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); |
| + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); |
| fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); |
| fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); |
| fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); |
| @@ -2725,13 +3120,14 @@ |
| fOtherSet->removeAll(*fFormatSet); |
| fOtherSet->removeAll(*fExtendSet); |
| // Inhibit dictionary characters from being tested at all. |
| + fOtherSet->removeAll(*fDictionaryCjkSet); |
| fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); |
| |
| fSets->addElement(fCRSet, status); |
| fSets->addElement(fLFSet, status); |
| fSets->addElement(fNewlineSet, status); |
| fSets->addElement(fALetterSet, status); |
| - fSets->addElement(fKatakanaSet, status); |
| + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana |
| fSets->addElement(fMidLetterSet, status); |
| fSets->addElement(fMidNumLetSet, status); |
| fSets->addElement(fMidNumSet, status); |
| @@ -3978,6 +4374,7 @@ |
| for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { |
| count --; |
| if (forward[count] != i) { |
| + printStringBreaks(ustr, expected, expectedcount); |
| test->errln("happy break test previous() failed: expected %d but got %d", |
| forward[count], i); |
| break; |
| @@ -4011,23 +4408,25 @@ |
| UErrorCode status = U_ZERO_ERROR; |
| // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); |
| BreakIterator *bi = BreakIterator::createWordInstance(locale, status); |
| + // Replaced any run of C+J characters with a random sequence of characters |
| + // of the same length so that our C+J segmentation does not get in the way. |
| static const char *strlist[] = |
| { |
| "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", |
| - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", |
| + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", |
| "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", |
| "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", |
| - "\\u90ca\\u3588\\u009c\\u0953\\u194b", |
| + "\\uac00\\u3588\\u009c\\u0953\\u194b", |
| "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", |
| "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", |
| - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", |
| + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", |
| "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", |
| "\\u003b\\u024a\\u102e\\U000e0071\\u0600", |
| "\\u2027\\U000e0067\\u0a47\\u00b7", |
| "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", |
| "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", |
| "\\u0589\\U000e006e\\u0a42\\U000104a5", |
| - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", |
| + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", |
| "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", |
| "\\u0027\\u11af\\U000e0057\\u0602", |
| "\\U0001d7f2\\U000e007\\u0004\\u0589", |
| @@ -4039,7 +4438,7 @@ |
| "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
| "\\u0233\\U000e0020\\u0a69\\u0d6a", |
| "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
| - "\\u58f4\\U000e0049\\u20e7\\u2027", |
| + "\\u18f4\\U000e0049\\u20e7\\u2027", |
| "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| "\\ua183\\u102d\\u0bec\\u003a", |
| "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
| @@ -4049,7 +4448,7 @@ |
| "\\U000e005d\\u2044\\u0731\\u0650\\u0061", |
| "\\u003a\\u0664\\u00b7\\u1fba", |
| "\\u003b\\u0027\\u00b7\\u47a3", |
| - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", |
| + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", |
| "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", |
| "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", |
| }; |
| @@ -4104,12 +4503,12 @@ |
| "\\U0001d7f2\\U000e007d\\u0004\\u0589", |
| "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", |
| "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", |
| - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", |
| + "\\U000e0065\\u302c\\u09ee\\U000e0068", |
| "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
| "\\u0233\\U000e0020\\u0a69\\u0d6a", |
| "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
| "\\u58f4\\U000e0049\\u20e7\\u2027", |
| - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
| "\\ua183\\u102d\\u0bec\\u003a", |
| "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
| "\\u003a\\u0e57\\u0fad\\u002e", |
| --- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700 |
| +++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800 |
| @@ -70,6 +70,7 @@ |
| void TestBug5775(); |
| void TestThaiBreaks(); |
| void TestTailoredBreaks(); |
| + void TestTrieDictWithValue(); |
| void TestDictRules(); |
| void TestBug5532(); |
| |
| --- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700 |
| +++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800 |
| @@ -161,7 +161,23 @@ |
| <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> |
| |
| # Hiragana & Katakana stay together, but separates from each other and Latin. |
| -<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> |
| +# TODO: decide how to handle theoretical character combinations, e.g. hiragana + combining accent |
| +#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> |
| + |
| +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth |
| +<data>•芽キャベツ<400>芽キャベツ<400></data> |
| + |
| +# more Japanese tests |
| +# TODO: U+30FC and other script=Common characters in the Hiragana and Katakana |
| +# blocks are not yet handled correctly. Enable this test once they are. |
| +#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> |
| +<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> |
| + |
| +# Test word boundaries for dictionary words that contain both kanji and kana |
| +<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> |
| + |
| +# Test Chinese segmentation (text taken from a Chinese news article) |
| +<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data> |
| |
| # Words with interior formatting characters |
| <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data> |
| @@ -169,6 +185,8 @@ |
| # to test for bug #4097779 |
| <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> |
| |
| +# Fullwidth numeric, MidLetter, etc. characters should be treated like their halfwidth counterparts. |
| +<data>•ISN'T<200> •19<100>日<400></data> |
| |
| # to test for bug #4098467 |
| # What follows is a string of Korean characters (I found it in the Yellow Pages |
| @@ -178,9 +196,15 @@ |
| # precomposed syllables... |
| <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> |
| |
| -<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data> |
| +# More Korean tests (conjoining jamo are not tested here; they are not dictionary characters) |
| +# Disabled for now because no Korean dictionary is included. |
| +#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> |
| +#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> |
| + |
| +<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data> |
| + |
| +<data>•\u06c9<200>\uc799<200>\ufffa•</data> |
| |
| -<data>•\u06c9\uc799\ufffa<200></data> |
| |
| # |
| # Try some words from other scripts. |
| @@ -491,8 +515,7 @@ |
| <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data> |
| |
| # conjoining jamo... |
| -# TODO: rules update needed |
| -#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> |
| +<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> |
| |
| # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd |
| <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> |
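| |
| In the rbbitst.txt data above, • marks an expected word boundary and the <NNN> tags correspond to the break-rule status of the word that ends there (100 = UBRK_WORD_NUMBER, 200 = UBRK_WORD_LETTER, 300 = UBRK_WORD_KANA, 400 = UBRK_WORD_IDEO). The following is only a sketch, not the actual rbbitst harness, of how such expectations can be checked through ICU's C API; the sample text here uses ordinary halfwidth characters plus an ideograph, loosely modeled on the "ISN'T 19日" case added above. |
| |
| /* Sketch only: print each word boundary and its rule status; the status |
|  * numbers are the same values used in the <NNN> tags of rbbitst.txt. */ |
| #include "unicode/ubrk.h" |
| #include "unicode/utypes.h" |
| #include <stdio.h> |
| |
| int main() { |
|     UErrorCode status = U_ZERO_ERROR; |
|     /* I S N ' T space 1 9 U+65E5, NUL-terminated */ |
|     UChar text[] = {0x0049, 0x0053, 0x004E, 0x0027, 0x0054, 0x0020, |
|                     0x0031, 0x0039, 0x65E5, 0x0000}; |
|     UBreakIterator *bi = ubrk_open(UBRK_WORD, "ja", text, -1, &status); |
|     if (U_FAILURE(status)) { return 1; } |
|     int32_t end; |
|     for (end = ubrk_next(bi); end != UBRK_DONE; end = ubrk_next(bi)) { |
|         /* ubrk_getRuleStatus() reports the UWordBreak value for the word |
|            that ends at 'end'. */ |
|         printf("boundary %d, status %d\n", (int)end, (int)ubrk_getRuleStatus(bi)); |
|     } |
|     ubrk_close(bi); |
|     return 0; |
| } |
| |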
| --- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800 |
| +++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800 |
| @@ -28,7 +28,7 @@ |
| LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } |
| |
| // aliasing using position |
| - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle |
| + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle |
| |
| // aliasing arrays |
| zoneTests { |
| --- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700 |
| +++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800 |
| @@ -1,6 +1,6 @@ |
| /* |
| ********************************************************************** |
| -* Copyright (C) 2002-2009, International Business Machines |
| +* Copyright (C) 2002-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * |
| @@ -34,12 +34,15 @@ |
| #include "unicode/udata.h" |
| #include "unicode/putil.h" |
| |
| #include "uoptions.h" |
| #include "unewdata.h" |
| #include "ucmndata.h" |
| #include "rbbidata.h" |
| #include "triedict.h" |
| #include "cmemory.h" |
| +#include "uassert.h" |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| @@ -199,147 +202,191 @@ |
| long wordFileSize; |
| FILE *file; |
| char *wordBufferC; |
| - |
| + MutableTrieDictionary *mtd = NULL; |
| + |
| file = fopen(wordFileName, "rb"); |
| - if( file == 0 ) { |
| - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); |
| - exit(-1); |
| - } |
| - fseek(file, 0, SEEK_END); |
| - wordFileSize = ftell(file); |
| - fseek(file, 0, SEEK_SET); |
| - wordBufferC = new char[wordFileSize+10]; |
| - |
| - result = (long)fread(wordBufferC, 1, wordFileSize, file); |
| - if (result != wordFileSize) { |
| - fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); |
| - exit (-1); |
| - } |
| - wordBufferC[wordFileSize]=0; |
| - fclose(file); |
| - |
| - // |
| - // Look for a Unicode Signature (BOM) on the word file |
| - // |
| - int32_t signatureLength; |
| - const char * wordSourceC = wordBufferC; |
| - const char* encoding = ucnv_detectUnicodeSignature( |
| - wordSourceC, wordFileSize, &signatureLength, &status); |
| - if (U_FAILURE(status)) { |
| - exit(status); |
| - } |
| - if(encoding!=NULL ){ |
| - wordSourceC += signatureLength; |
| - wordFileSize -= signatureLength; |
| - } |
| - |
| - // |
| - // Open a converter to take the rule file to UTF-16 |
| - // |
| - UConverter* conv; |
| - conv = ucnv_open(encoding, &status); |
| - if (U_FAILURE(status)) { |
| - fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); |
| - exit(status); |
| - } |
| - |
| - // |
| - // Convert the words to UChar. |
| - // Preflight first to determine required buffer size. |
| - // |
| - uint32_t destCap = ucnv_toUChars(conv, |
| - NULL, // dest, |
| - 0, // destCapacity, |
| - wordSourceC, |
| - wordFileSize, |
| - &status); |
| - if (status != U_BUFFER_OVERFLOW_ERROR) { |
| - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| - exit(status); |
| - }; |
| - |
| - status = U_ZERO_ERROR; |
| - UChar *wordSourceU = new UChar[destCap+1]; |
| - ucnv_toUChars(conv, |
| - wordSourceU, // dest, |
| - destCap+1, |
| - wordSourceC, |
| - wordFileSize, |
| - &status); |
| - if (U_FAILURE(status)) { |
| - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| - exit(status); |
| - }; |
| - ucnv_close(conv); |
| - |
| - // Get rid of the original file buffer |
| - delete[] wordBufferC; |
| - |
| - // Create a MutableTrieDictionary, and loop through all the lines, inserting |
| - // words. |
| - |
| - // First, pick a median character. |
| - UChar *current = wordSourceU + (destCap/2); |
| - UChar uc = *current++; |
| - UnicodeSet breaks; |
| - breaks.add(0x000A); // Line Feed |
| - breaks.add(0x000D); // Carriage Return |
| - breaks.add(0x2028); // Line Separator |
| - breaks.add(0x2029); // Paragraph Separator |
| - |
| - do { |
| - // Look for line break |
| - while (uc && !breaks.contains(uc)) { |
| - uc = *current++; |
| - } |
| - // Now skip to first non-line-break |
| - while (uc && breaks.contains(uc)) { |
| - uc = *current++; |
| + if( file == 0 ) { // cannot open the word file: fall back to a dummy dictionary |
| + // Build a minimal one-word dictionary (one character, value 1) so that a |
| + // data file can still be written. |
| + |
| + /* warn that only a dummy dictionary will be generated */ |
| + fprintf(stderr, "%s not found, genctd writes dummy %s\n", wordFileName, outFileName); |
| + |
| + UChar c = 0x0020; |
| + mtd = new MutableTrieDictionary(c, status, TRUE); |
| + mtd->addWord(&c, 1, status, 1); |
| + |
| + } else { //read words in from input file |
| + fseek(file, 0, SEEK_END); |
| + wordFileSize = ftell(file); |
| + fseek(file, 0, SEEK_SET); |
| + wordBufferC = new char[wordFileSize+10]; |
| + |
| + result = (long)fread(wordBufferC, 1, wordFileSize, file); |
| + if (result != wordFileSize) { |
| + fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); |
| + exit (-1); |
| } |
| - } |
| - while (uc && (breaks.contains(uc) || u_isspace(uc))); |
| - |
| - MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); |
| + wordBufferC[wordFileSize]=0; |
| + fclose(file); |
| |
| - if (U_FAILURE(status)) { |
| - fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); |
| - exit(status); |
| - } |
| + // |
| + // Look for a Unicode Signature (BOM) on the word file |
| + // |
| + int32_t signatureLength; |
| + const char * wordSourceC = wordBufferC; |
| + const char* encoding = ucnv_detectUnicodeSignature( |
| + wordSourceC, wordFileSize, &signatureLength, &status); |
| + if (U_FAILURE(status)) { |
| + exit(status); |
| + } |
| + if(encoding!=NULL ){ |
| + wordSourceC += signatureLength; |
| + wordFileSize -= signatureLength; |
| + } |
| |
| - // Now add the words. Words are non-space characters at the beginning of |
| - // lines, and must be at least one UChar. |
| - current = wordSourceU; |
| - UChar *candidate = current; |
| - uc = *current++; |
| - int32_t length = 0; |
| - |
| - while (uc) { |
| - while (uc && !u_isspace(uc)) { |
| - ++length; |
| - uc = *current++; |
| + // |
| + // Open a converter to take the rule file to UTF-16 |
| + // |
| + UConverter* conv; |
| + conv = ucnv_open(encoding, &status); |
| + if (U_FAILURE(status)) { |
| + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); |
| + exit(status); |
| } |
| - if (length > 0) { |
| - mtd->addWord(candidate, length, status); |
| - if (U_FAILURE(status)) { |
| - fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", |
| - u_errorName(status)); |
| - exit(status); |
| + |
| + // |
| + // Convert the words to UChar. |
| + // Preflight first to determine required buffer size. |
| + // |
| + uint32_t destCap = ucnv_toUChars(conv, |
| + NULL, // dest, |
| + 0, // destCapacity, |
| + wordSourceC, |
| + wordFileSize, |
| + &status); |
| + if (status != U_BUFFER_OVERFLOW_ERROR) { |
| + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| + exit(status); |
| + }; |
| + |
| + status = U_ZERO_ERROR; |
| + UChar *wordSourceU = new UChar[destCap+1]; |
| + ucnv_toUChars(conv, |
| + wordSourceU, // dest, |
| + destCap+1, |
| + wordSourceC, |
| + wordFileSize, |
| + &status); |
| + if (U_FAILURE(status)) { |
| + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
| + exit(status); |
| + }; |
| + ucnv_close(conv); |
| + |
| + // Get rid of the original file buffer |
| + delete[] wordBufferC; |
| + |
| + // Create a MutableTrieDictionary, and loop through all the lines, inserting |
| + // words. |
| + |
| + // First, pick a median character. |
| + UChar *current = wordSourceU + (destCap/2); |
| + UChar uc = *current++; |
| + UnicodeSet breaks; |
| + breaks.add(0x000A); // Line Feed |
| + breaks.add(0x000D); // Carriage Return |
| + breaks.add(0x2028); // Line Separator |
| + breaks.add(0x2029); // Paragraph Separator |
| + |
| + do { |
| + // Look for line break |
| + while (uc && !breaks.contains(uc)) { |
| + uc = *current++; |
| + } |
| + // Now skip to first non-line-break |
| + while (uc && breaks.contains(uc)) { |
| + uc = *current++; |
| } |
| } |
| - // Find beginning of next line |
| - while (uc && !breaks.contains(uc)) { |
| - uc = *current++; |
| + while (uc && (breaks.contains(uc) || u_isspace(uc))); |
| + |
| + mtd = new MutableTrieDictionary(uc, status); |
| + |
| + if (U_FAILURE(status)) { |
| + fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); |
| + exit(status); |
| } |
| - while (uc && breaks.contains(uc)) { |
| - uc = *current++; |
| + |
| + // Now add the words. Words are non-space characters at the beginning of |
| + // lines, and must be at least one UChar. If a word has an associated value, |
| + // the value should follow the word on the same line after a tab character. |
| + current = wordSourceU; |
| + UChar *candidate = current; |
| + uc = *current++; |
| + int32_t length = 0; |
| + int count = 0; // number of words processed so far, used in error messages |
| + |
| + while (uc) { |
| + while (uc && !u_isspace(uc)) { |
| + ++length; |
| + uc = *current++; |
| + } |
| + |
| + UnicodeString valueString; |
| + UChar candidateValue; |
| + if(uc == 0x0009){ // a tab separates the word from its value; read the value that follows |
| + while (uc && u_isspace(uc)) { |
| + uc = *current++; |
| + } |
| + while (uc && !u_isspace(uc)) { |
| + valueString.append(uc); |
| + uc = *current++; |
| + } |
| + } |
| + |
| + if (length > 0) { |
| + count++; |
| + if(valueString.length() > 0){ |
| + mtd->setValued(TRUE); |
| + |
| + uint32_t value = 0; |
| + char* s = new char[valueString.length() + 1]; // +1 for the NUL terminator |
| + valueString.extract(0, valueString.length(), s, valueString.length() + 1); |
| + int n = sscanf(s, "%u", &value); |
| + U_ASSERT(n == 1); |
| + U_ASSERT(value <= 0xFFFF); // the value is stored as a uint16_t |
| + mtd->addWord(candidate, length, status, (uint16_t)value); |
| + delete[] s; |
| + } else { |
| + mtd->addWord(candidate, length, status); |
| + } |
| + |
| + if (U_FAILURE(status)) { |
| + fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", |
| + u_errorName(status), count); |
| + exit(status); |
| + } |
| + } |
| + |
| + // Find beginning of next line |
| + while (uc && !breaks.contains(uc)) { |
| + uc = *current++; |
| + } |
| + // Find next non-line-breaking character |
| + while (uc && breaks.contains(uc)) { |
| + uc = *current++; |
| + } |
| + candidate = current-1; |
| + length = 0; |
| } |
| - candidate = current-1; |
| - length = 0; |
| + |
| + // Get rid of the Unicode text buffer |
| + delete[] wordSourceU; |
| } |
| |
| - // Get rid of the Unicode text buffer |
| - delete[] wordSourceU; |
| - |
| // Now, create a CompactTrieDictionary from the mutable dictionary |
| CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); |
| if (U_FAILURE(status)) { |
| @@ -393,4 +440,3 @@ |
| |
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| } |
| - |
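| |
| The genctd changes above accept an optional per-word value, written after a tab on the same line as the word, and store it in the dictionary. Below is a minimal sketch of building such a valued dictionary, using only the calls visible in this patch (the MutableTrieDictionary constructor with the containsValue flag, addWord with a value, and CompactTrieDictionary); the helper name, sample word, and value are illustrative, not part of the tool. |
| |
| // Sketch only: build a small valued dictionary the way the patched genctd does |
| // for a word list whose lines look like |
| //     <word><TAB><value> |
| // e.g. "芽キャベツ<TAB>400". |
| #include "triedict.h"        // MutableTrieDictionary, CompactTrieDictionary |
| #include "unicode/unistr.h" |
| |
| U_NAMESPACE_USE |
| |
| static CompactTrieDictionary *buildValuedDict(UErrorCode &status) { |
|     // Median character for the trie; genctd picks one from the middle of its data. |
|     UChar median = 0x82BD;                      // first UChar of the sample word |
|     MutableTrieDictionary *mtd = new MutableTrieDictionary(median, status, TRUE); |
|     if (U_FAILURE(status)) { return NULL; } |
| |
|     UnicodeString word = UnicodeString::fromUTF8("芽キャベツ"); |
|     // The added fourth argument is the value parsed from after the tab. |
|     mtd->addWord(word.getBuffer(), word.length(), status, (uint16_t)400); |
| |
|     // Compact the mutable trie, as genctd does before writing the output data. |
|     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); |
|     delete mtd; |
|     return ctd; |
| } |
| |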
| --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 |
| +++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800 |
| @@ -23,13 +23,13 @@ |
| ## Extra files to remove for 'make clean' |
| CLEANFILES = *~ $(DEPS) $(MAN_FILES) |
| |
| -## Target information |
| +## Target information |
| TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) |
| |
| ifneq ($(top_builddir),$(top_srcdir)) |
| CPPFLAGS += -I$(top_builddir)/common |
| endif |
| -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil |
| +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n |
| LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) |
| |
| OBJECTS = genctd.o |