blob: fcd792506f699e19810884ad44612d49cae9824e [file] [log] [blame]
--- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800
+++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800
@@ -226,6 +226,30 @@
case USCRIPT_THAI:
engine = new ThaiBreakEngine(dict, status);
break;
+
+ case USCRIPT_HANGUL:
+ engine = new CjkBreakEngine(dict, kKorean, status);
+ break;
+
+ // use same BreakEngine and dictionary for both Chinese and Japanese
+ case USCRIPT_HIRAGANA:
+ case USCRIPT_KATAKANA:
+ case USCRIPT_HAN:
+ engine = new CjkBreakEngine(dict, kChineseJapanese, status);
+ break;
+#if 0
+ // TODO: Have to get some characters with script=common handled
+ // by CjkBreakEngine (e.g. U+309B). Simply subjecting
+ // them to CjkBreakEngine does not work. The engine has to
+ // special-case them.
+ case USCRIPT_COMMON:
+ {
+ UBlockCode block = ublock_getCode(code);
+ if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
+ engine = new CjkBreakEngine(dict, kChineseJapanese, status);
+ break;
+ }
+#endif
default:
break;
}
@@ -281,6 +305,13 @@
dict = NULL;
}
return dict;
+ } else if (dictfname != NULL){
+ //create dummy dict if dictionary filename not valid
+ UChar c = 0x0020;
+ status = U_ZERO_ERROR;
+ MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
+ mtd->addWord(&c, 1, status, 1);
+ return new CompactTrieDictionary(*mtd, status);
}
return NULL;
}
--- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700
+++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800
@@ -16,6 +16,9 @@
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
+#include "uassert.h"
+#include "unicode/normlzr.h"
+#include "cmemory.h"
U_NAMESPACE_BEGIN
@@ -422,6 +425,294 @@
return wordsFound;
}
+/*
+ ******************************************************************
+ * CjkBreakEngine
+ */
+static const uint32_t kuint32max = 0xFFFFFFFF;
+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
+ if (!adoptDictionary->getValued()) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ // Korean dictionary only includes Hangul syllables
+ fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
+ fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
+ fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
+ fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
+
+ if (U_SUCCESS(status)) {
+ // handle Korean and Japanese/Chinese using different dictionaries
+ if (type == kKorean) {
+ setCharacters(fHangulWordSet);
+ } else { //Chinese and Japanese
+ UnicodeSet cjSet;
+ cjSet.addAll(fHanWordSet);
+ cjSet.addAll(fKatakanaWordSet);
+ cjSet.addAll(fHiraganaWordSet);
+ cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));
+ setCharacters(cjSet);
+ }
+ }
+}
+
+CjkBreakEngine::~CjkBreakEngine(){
+ delete fDictionary;
+}
+
+// The katakanaCost values below are based on the length frequencies of all
+// katakana phrases in the dictionary
+static const int kMaxKatakanaLength = 8;
+static const int kMaxKatakanaGroupLength = 20;
+static const uint32_t maxSnlp = 255;
+
+static inline uint32_t getKatakanaCost(int wordLength){
+ //TODO: fill array with actual values from dictionary!
+ static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
+ = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
+ return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
+}
+
+static inline bool isKatakana(uint16_t value) {
+ return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
+ (value >= 0xFF66u && value <= 0xFF9fu);
+}
+
+// A very simple helper class to streamline the buffer handling in
+// divideUpDictionaryRange.
+template<class T, size_t N>
+class AutoBuffer {
+ public:
+ AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
+ if (size > N) {
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
+ capacity = size;
+ }
+ }
+ ~AutoBuffer() {
+ if (buffer != stackBuffer)
+ uprv_free(buffer);
+ }
+#if 0
+ T* operator& () {
+ return buffer;
+ }
+#endif
+ T* elems() {
+ return buffer;
+ }
+ const T& operator[] (size_t i) const {
+ return buffer[i];
+ }
+ T& operator[] (size_t i) {
+ return buffer[i];
+ }
+
+ // resize without copy
+ void resize(size_t size) {
+ if (size <= capacity)
+ return;
+ if (buffer != stackBuffer)
+ uprv_free(buffer);
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
+ capacity = size;
+ }
+ private:
+ T stackBuffer[N];
+ T* buffer;
+ AutoBuffer();
+ size_t capacity;
+};
+
+
+/*
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @return The number of breaks found
+ */
+int32_t
+CjkBreakEngine::divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UStack &foundBreaks ) const {
+ if (rangeStart >= rangeEnd) {
+ return 0;
+ }
+
+ const size_t defaultInputLength = 80;
+ size_t inputLength = rangeEnd - rangeStart;
+ AutoBuffer<UChar, defaultInputLength> charString(inputLength);
+
+ // Normalize the input string and put it in normalizedText.
+ // The map from the indices of the normalized input to the raw
+ // input is kept in charPositions.
+ UErrorCode status = U_ZERO_ERROR;
+ utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
+ if (U_FAILURE(status))
+ return 0;
+
+ UnicodeString inputString(charString.elems(), inputLength);
+ UNormalizationMode norm_mode = UNORM_NFKC;
+ UBool isNormalized =
+ Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
+ Normalizer::isNormalized(inputString, norm_mode, status);
+
+ AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
+ int numChars = 0;
+ UText normalizedText = UTEXT_INITIALIZER;
+ // Needs to be declared here because normalizedText holds onto its buffer.
+ UnicodeString normalizedString;
+ if (isNormalized) {
+ int32_t index = 0;
+ charPositions[0] = 0;
+ while(index < inputString.length()) {
+ index = inputString.moveIndex32(index, 1);
+ charPositions[++numChars] = index;
+ }
+ utext_openUnicodeString(&normalizedText, &inputString, &status);
+ }
+ else {
+ Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
+ if (U_FAILURE(status))
+ return 0;
+ charPositions.resize(normalizedString.length() + 1);
+ Normalizer normalizer(charString.elems(), inputLength, norm_mode);
+ int32_t index = 0;
+ charPositions[0] = 0;
+ while(index < normalizer.endIndex()){
+ UChar32 uc = normalizer.next();
+ charPositions[++numChars] = index = normalizer.getIndex();
+ }
+ utext_openUnicodeString(&normalizedText, &normalizedString, &status);
+ }
+
+ if (U_FAILURE(status))
+ return 0;
+
+ // From this point on, all the indices refer to the indices of
+ // the normalized input string.
+
+ // bestSnlp[i] is the snlp of the best segmentation of the first i
+ // characters in the range to be matched.
+ AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
+ bestSnlp[0] = 0;
+ for(int i=1; i<=numChars; i++){
+ bestSnlp[i] = kuint32max;
+ }
+
+ // prev[i] is the index of the last CJK character in the previous word in
+ // the best segmentation of the first i characters.
+ AutoBuffer<int, defaultInputLength> prev(numChars + 1);
+ for(int i=0; i<=numChars; i++){
+ prev[i] = -1;
+ }
+
+ const size_t maxWordSize = 20;
+ AutoBuffer<uint16_t, maxWordSize> values(numChars);
+ AutoBuffer<int32_t, maxWordSize> lengths(numChars);
+
+ // Dynamic programming to find the best segmentation.
+ bool is_prev_katakana = false;
+ for (int i = 0; i < numChars; ++i) {
+ //utext_setNativeIndex(text, rangeStart + i);
+ utext_setNativeIndex(&normalizedText, i);
+ if (bestSnlp[i] == kuint32max)
+ continue;
+
+ int count;
+ // limit maximum word length matched to size of current substring
+ int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;
+
+ fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
+
+ // if there are no single character matches found in the dictionary
+    // starting with this character, treat character as a 1-character word
+ // with the highest value possible, i.e. the least likely to occur.
+ // Exclude Korean characters from this treatment, as they should be left
+ // together by default.
+ if((count == 0 || lengths[0] != 1) &&
+ !fHangulWordSet.contains(utext_current32(&normalizedText))){
+ values[count] = maxSnlp;
+ lengths[count++] = 1;
+ }
+
+ for (int j = 0; j < count; j++){
+ //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
+ uint32_t newSnlp = bestSnlp[i] + values[j];
+ if (newSnlp < bestSnlp[lengths[j] + i]) {
+ bestSnlp[lengths[j] + i] = newSnlp;
+ prev[lengths[j] + i] = i;
+ }
+ }
+
+ // In Japanese,
+ // Katakana word in single character is pretty rare. So we apply
+ // the following heuristic to Katakana: any continuous run of Katakana
+ // characters is considered a candidate word with a default cost
+ // specified in the katakanaCost table according to its length.
+ //utext_setNativeIndex(text, rangeStart + i);
+ utext_setNativeIndex(&normalizedText, i);
+ bool is_katakana = isKatakana(utext_current32(&normalizedText));
+ if (!is_prev_katakana && is_katakana) {
+ int j = i + 1;
+ utext_next32(&normalizedText);
+ // Find the end of the continuous run of Katakana characters
+ while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
+ isKatakana(utext_current32(&normalizedText))) {
+ utext_next32(&normalizedText);
+ ++j;
+ }
+ if ((j - i) < kMaxKatakanaGroupLength) {
+ uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
+ if (newSnlp < bestSnlp[j]) {
+ bestSnlp[j] = newSnlp;
+ prev[j] = i;
+ }
+ }
+ }
+ is_prev_katakana = is_katakana;
+ }
+
+ // Start pushing the optimal offset index into t_boundary (t for tentative).
+ // prev[numChars] is guaranteed to be meaningful.
+ // We'll first push in the reverse order, i.e.,
+ // t_boundary[0] = numChars, and afterwards do a swap.
+ AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
+
+ int numBreaks = 0;
+ // No segmentation found, set boundary to end of range
+ if (bestSnlp[numChars] == kuint32max) {
+ t_boundary[numBreaks++] = numChars;
+ } else {
+ for (int i = numChars; i > 0; i = prev[i]){
+ t_boundary[numBreaks++] = i;
+
+ }
+ U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
+ }
+
+ // Reverse offset index in t_boundary.
+ // Don't add a break for the start of the dictionary range if there is one
+ // there already.
+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
+ t_boundary[numBreaks++] = 0;
+ }
+
+ // Now that we're done, convert positions in t_bdry[] (indices in
+ // the normalized input string) back to indices in the raw input string
+ // while reversing t_bdry and pushing values to foundBreaks.
+ for (int i = numBreaks-1; i >= 0; i--) {
+ foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
+ }
+
+ utext_close(&normalizedText);
+ return numBreaks;
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700
+++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800
@@ -1,8 +1,8 @@
/**
- *******************************************************************************
- * Copyright (C) 2006, International Business Machines Corporation and others. *
- * All Rights Reserved. *
- *******************************************************************************
+ **********************************************************************************
+ * Copyright (C) 2006-2010, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ **********************************************************************************
*/
#ifndef DICTBE_H
@@ -65,31 +65,31 @@
*/
virtual ~DictionaryBreakEngine();
- /**
- * <p>Indicate whether this engine handles a particular character for
- * a particular kind of break.</p>
- *
- * @param c A character which begins a run that the engine might handle
- * @param breakType The type of text break which the caller wants to determine
- * @return TRUE if this engine handles the particular character and break
- * type.
- */
+ /**
+ * <p>Indicate whether this engine handles a particular character for
+ * a particular kind of break.</p>
+ *
+ * @param c A character which begins a run that the engine might handle
+ * @param breakType The type of text break which the caller wants to determine
+ * @return TRUE if this engine handles the particular character and break
+ * type.
+ */
virtual UBool handles( UChar32 c, int32_t breakType ) const;
- /**
- * <p>Find any breaks within a run in the supplied text.</p>
- *
- * @param text A UText representing the text. The
- * iterator is left at the end of the run of characters which the engine
- * is capable of handling.
- * @param startPos The start of the run within the supplied text.
- * @param endPos The end of the run within the supplied text.
- * @param reverse Whether the caller is looking for breaks in a reverse
- * direction.
- * @param breakType The type of break desired, or -1.
- * @param foundBreaks An allocated C array of the breaks found, if any
- * @return The number of breaks found.
- */
+ /**
+ * <p>Find any breaks within a run in the supplied text.</p>
+ *
+ * @param text A UText representing the text. The iterator is left at
+ * the end of the run of characters which the engine is capable of handling
+ * that starts from the first (or last) character in the range.
+ * @param startPos The start of the run within the supplied text.
+ * @param endPos The end of the run within the supplied text.
+ * @param reverse Whether the caller is looking for breaks in a reverse
+ * direction.
+ * @param breakType The type of break desired, or -1.
+ * @param foundBreaks An allocated C array of the breaks found, if any
+ * @return The number of breaks found.
+ */
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
@@ -114,7 +114,7 @@
// virtual void setBreakTypes( uint32_t breakTypes );
/**
- * <p>Divide up a range of known dictionary characters.</p>
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
@@ -171,7 +171,7 @@
protected:
/**
- * <p>Divide up a range of known dictionary characters.</p>
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
*
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
@@ -186,6 +186,66 @@
};
+/*******************************************************************
+ * CjkBreakEngine
+ */
+
+//indicates language/script that the CjkBreakEngine will handle
+enum LanguageType {
+ kKorean,
+ kChineseJapanese
+};
+
+/**
+ * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * TrieWordDictionary with costs associated with each word and
+ * Viterbi decoding to determine CJK-specific breaks.</p>
+ */
+class CjkBreakEngine : public DictionaryBreakEngine {
+ protected:
+ /**
+ * The set of characters handled by this engine
+ * @internal
+ */
+ UnicodeSet fHangulWordSet;
+ UnicodeSet fHanWordSet;
+ UnicodeSet fKatakanaWordSet;
+ UnicodeSet fHiraganaWordSet;
+
+ const TrieWordDictionary *fDictionary;
+
+ public:
+
+ /**
+ * <p>Default constructor.</p>
+ *
+ * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
+ * engine is deleted. The TrieWordDictionary must contain costs for each word
+ * in order for the dictionary to work properly.
+ */
+ CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);
+
+ /**
+ * <p>Virtual destructor.</p>
+ */
+ virtual ~CjkBreakEngine();
+
+ protected:
+ /**
+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
+ *
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output of C array of int32_t break positions, or 0
+ * @return The number of breaks found
+ */
+ virtual int32_t divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UStack &foundBreaks ) const;
+
+};
U_NAMESPACE_END
--- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700
+++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800
@@ -1555,10 +1555,12 @@
int32_t endPos,
UBool reverse) {
// Reset the old break cache first.
- uint32_t dictionaryCount = fDictionaryCharCount;
reset();
- if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
+ // note: code segment below assumes that dictionary chars are in the
+ // startPos-endPos range
+ // value returned should be next character in sequence
+ if ((endPos - startPos) <= 1) {
return (reverse ? startPos : endPos);
}
@@ -1711,7 +1713,7 @@
// proposed break by one of the breaks we found. Use following() and
// preceding() to do the work. They should never recurse in this case.
if (reverse) {
- return preceding(endPos - 1);
+ return preceding(endPos);
}
else {
return following(startPos);
--- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800
+++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800
@@ -20,6 +20,7 @@
#include "uvector.h"
#include "uvectr32.h"
#include "uarrsort.h"
+#include "hash.h"
//#define DEBUG_TRIE_DICT 1
@@ -27,6 +28,11 @@
#include <sys/times.h>
#include <limits.h>
#include <stdio.h>
+#include <time.h>
+#ifndef CLK_TCK
+#define CLK_TCK CLOCKS_PER_SEC
+#endif
+
#endif
U_NAMESPACE_BEGIN
@@ -45,6 +51,11 @@
* MutableTrieDictionary
*/
+//#define MAX_VALUE 65535
+
+// forward declaration
+inline uint16_t scaleLogProbabilities(double logprob);
+
// Node structure for the ternary, uncompressed trie
struct TernaryNode : public UMemory {
UChar ch; // UTF-16 code unit
@@ -77,7 +88,8 @@
delete high;
}
-MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
+ UBool containsValue /* = FALSE */ ) {
// Start the trie off with something. Having the root node already present
// cuts a special case out of the search/insertion functions.
// Making it a median character cuts the worse case for searches from
@@ -91,14 +103,19 @@
if (U_SUCCESS(status) && fIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
+
+ fValued = containsValue;
}
-MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
+ UBool containsValue /* = false */ ) {
fTrie = NULL;
fIter = utext_openUChars(NULL, NULL, 0, &status);
if (U_SUCCESS(status) && fIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
+
+ fValued = containsValue;
}
MutableTrieDictionary::~MutableTrieDictionary() {
@@ -108,12 +125,13 @@
int32_t
MutableTrieDictionary::search( UText *text,
- int32_t maxLength,
- int32_t *lengths,
- int &count,
- int limit,
- TernaryNode *&parent,
- UBool &pMatched ) const {
+ int32_t maxLength,
+ int32_t *lengths,
+ int &count,
+ int limit,
+ TernaryNode *&parent,
+ UBool &pMatched,
+ uint16_t *values /*=NULL*/) const {
// TODO: current implementation works in UTF-16 space
const TernaryNode *up = NULL;
const TernaryNode *p = fTrie;
@@ -121,6 +139,10 @@
pMatched = TRUE;
int i;
+ if (!fValued) {
+ values = NULL;
+ }
+
UChar uc = utext_current32(text);
for (i = 0; i < maxLength && p != NULL; ++i) {
while (p != NULL) {
@@ -141,7 +163,11 @@
break;
}
// Must be equal to get here
- if (limit > 0 && (p->flags & kEndsWord)) {
+ if (limit > 0 && (p->flags > 0)) {
+ //is there a more efficient way to add values? ie. remove if stmt
+ if(values != NULL) {
+ values[mycount] = p->flags;
+ }
lengths[mycount++] = i+1;
--limit;
}
@@ -161,13 +187,14 @@
void
MutableTrieDictionary::addWord( const UChar *word,
int32_t length,
- UErrorCode &status ) {
-#if 0
- if (length <= 0) {
+ UErrorCode &status,
+ uint16_t value /* = 0 */ ) {
+ // dictionary cannot store zero values, would interfere with flags
+ if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
-#endif
+
TernaryNode *parent;
UBool pMatched;
int count;
@@ -177,7 +204,7 @@
matched = search(fIter, length, NULL, count, 0, parent, pMatched);
while (matched++ < length) {
- UChar32 uc = utext_next32(fIter); // TODO: supplemetary support?
+ UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
U_ASSERT(uc != U_SENTINEL);
TernaryNode *newNode = new TernaryNode(uc);
if (newNode == NULL) {
@@ -199,30 +226,23 @@
parent = newNode;
}
- parent->flags |= kEndsWord;
-}
-
-#if 0
-void
-MutableTrieDictionary::addWords( UEnumeration *words,
- UErrorCode &status ) {
- int32_t length;
- const UChar *word;
- while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
- addWord(word, length, status);
+ if(fValued && value > 0){
+ parent->flags = value;
+ } else {
+ parent->flags |= kEndsWord;
}
}
-#endif
int32_t
MutableTrieDictionary::matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
- int limit ) const {
+ int limit,
+ uint16_t *values /*=NULL*/) const {
TernaryNode *parent;
UBool pMatched;
- return search(text, maxLength, lengths, count, limit, parent, pMatched);
+ return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
}
// Implementation of iteration for MutableTrieDictionary
@@ -277,7 +297,7 @@
break;
}
case kEqual:
- emit = (node->flags & kEndsWord) != 0;
+ emit = node->flags > 0;
equal = (node->equal != NULL);
// If this node should be part of the next emitted string, append
// the UChar to the string, and make sure we pop it when we come
@@ -299,7 +319,7 @@
}
case kGreaterThan:
// If this node's character is in the string, remove it.
- if (node->equal != NULL || (node->flags & kEndsWord)) {
+ if (node->equal != NULL || node->flags > 0) {
unistr.truncate(unistr.length()-1);
}
if (node->high != NULL) {
@@ -354,12 +374,75 @@
* CompactTrieDictionary
*/
+//TODO further optimization:
+// minimise size of trie with logprobs by storing values
+// for terminal nodes directly in offsets[]
+// --> calculating from next offset *might* be simpler, but would have to add
+// one last offset for logprob of last node
+// --> if calculate from current offset, need to factor in possible overflow
+// as well.
+// idea: store in offset, set first bit to indicate logprob storage-->won't
+// have to access additional node
+
+// {'Dic', 1}, version 1: uses old header, no values
+#define COMPACT_TRIE_MAGIC_1 0x44696301
+// version 2: uses new header (more than 2^16 nodes), no values
+#define COMPACT_TRIE_MAGIC_2 0x44696302
+// version 3: uses new header, includes values
+#define COMPACT_TRIE_MAGIC_3 0x44696303
+
struct CompactTrieHeader {
uint32_t size; // Size of the data in bytes
uint32_t magic; // Magic number (including version)
+ uint32_t nodeCount; // Number of entries in offsets[]
+ uint32_t root; // Node number of the root node
+ uint32_t offsets[1]; // Offsets to nodes from start of data
+};
+
+// old version of CompactTrieHeader kept for backwards compatibility
+struct CompactTrieHeaderV1 {
+ uint32_t size; // Size of the data in bytes
+ uint32_t magic; // Magic number (including version)
uint16_t nodeCount; // Number of entries in offsets[]
uint16_t root; // Node number of the root node
- uint32_t offsets[1]; // Offsets to nodes from start of data
+ uint32_t offsets[1]; // Offsets to nodes from start of data
+};
+
+// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
+struct CompactTrieInfo {
+ uint32_t size; // Size of the data in bytes
+ uint32_t magic; // Magic number (including version)
+ uint32_t nodeCount; // Number of entries in offsets[]
+ uint32_t root; // Node number of the root node
+ uint32_t *offsets; // Offsets to nodes from start of data
+ uint8_t *address; // pointer to header bytes in memory
+
+ CompactTrieInfo(const void *data, UErrorCode &status){
+ CompactTrieHeader *header = (CompactTrieHeader *) data;
+ if (header->magic != COMPACT_TRIE_MAGIC_1 &&
+ header->magic != COMPACT_TRIE_MAGIC_2 &&
+ header->magic != COMPACT_TRIE_MAGIC_3) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ } else {
+ size = header->size;
+ magic = header->magic;
+
+ if (header->magic == COMPACT_TRIE_MAGIC_1) {
+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
+ nodeCount = headerV1->nodeCount;
+ root = headerV1->root;
+ offsets = &(headerV1->offsets[0]);
+ address = (uint8_t *)headerV1;
+ } else {
+ nodeCount = header->nodeCount;
+ root = header->root;
+ offsets = &(header->offsets[0]);
+ address = (uint8_t *)header;
+ }
+ }
+ }
+
+ ~CompactTrieInfo(){}
};
// Note that to avoid platform-specific alignment issues, all members of the node
@@ -375,10 +458,14 @@
enum CompactTrieNodeFlags {
kVerticalNode = 0x1000, // This is a vertical node
kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
- kReservedFlag1 = 0x4000,
- kReservedFlag2 = 0x8000,
+ kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1
+ kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2
kCountMask = 0x0FFF, // The count portion of flagscount
- kFlagMask = 0xF000 // The flags portion of flagscount
+ kFlagMask = 0xF000, // The flags portion of flagscount
+ kRootCountMask = 0x7FFF // The count portion of flagscount in the root node
+
+ //offset flags:
+ //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node
};
// The two node types are distinguished by the kVerticalNode flag.
@@ -402,63 +489,177 @@
uint16_t chars[1]; // Code units
};
-// {'Dic', 1}, version 1
-#define COMPACT_TRIE_MAGIC_1 0x44696301
-
CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
UErrorCode &status )
: fUData(dataObj)
{
- fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
+ *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
fOwnData = FALSE;
- if (fData->magic != COMPACT_TRIE_MAGIC_1) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- fData = NULL;
- }
}
+
CompactTrieDictionary::CompactTrieDictionary( const void *data,
UErrorCode &status )
: fUData(NULL)
{
- fData = (const CompactTrieHeader *) data;
+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
+ *fInfo = CompactTrieInfo(data, status);
fOwnData = FALSE;
- if (fData->magic != COMPACT_TRIE_MAGIC_1) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- fData = NULL;
- }
}
CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
UErrorCode &status )
: fUData(NULL)
{
- fData = compactMutableTrieDictionary(dict, status);
+ const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
+ if (U_SUCCESS(status)) {
+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
+ *fInfo = CompactTrieInfo(header, status);
+ }
+
fOwnData = !U_FAILURE(status);
}
CompactTrieDictionary::~CompactTrieDictionary() {
if (fOwnData) {
- uprv_free((void *)fData);
+ uprv_free((void *)(fInfo->address));
}
+ uprv_free((void *)fInfo);
+
if (fUData) {
udata_close(fUData);
}
}
+UBool CompactTrieDictionary::getValued() const{
+ return fInfo->magic == COMPACT_TRIE_MAGIC_3;
+}
+
uint32_t
CompactTrieDictionary::dataSize() const {
- return fData->size;
+ return fInfo->size;
}
const void *
CompactTrieDictionary::data() const {
- return fData;
+ return fInfo->address;
+}
+
+//This function finds the address of a node for us, given its node ID
+static inline const CompactTrieNode *
+getCompactNode(const CompactTrieInfo *info, uint32_t node) {
+ if(node < info->root-1) {
+ return (const CompactTrieNode *)(&info->offsets[node]);
+ } else {
+ return (const CompactTrieNode *)(info->address + info->offsets[node]);
+ }
}
-// This function finds the address of a node for us, given its node ID
+//this version of getCompactNode is currently only used in compactMutableTrieDictionary()
static inline const CompactTrieNode *
-getCompactNode(const CompactTrieHeader *header, uint16_t node) {
- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
+getCompactNode(const CompactTrieHeader *header, uint32_t node) {
+ if(node < header->root-1) {
+ return (const CompactTrieNode *)(&header->offsets[node]);
+ } else {
+ return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
+ }
+}
+
+
+/**
+ * Calculates the number of links in a node
+ * @param node The specified node
+ */
+static inline const uint16_t
+getCount(const CompactTrieNode *node){
+ return (node->flagscount & kCountMask);
+    //use the code below if the number of links ever exceeds 4096
+ //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
+}
+
+/**
+ * calculates the equal link node ID of a vertical node
+ * @param vnode The vertical node containing the equal link
+ * If kEqualOverflows is set, the high bits of the link are taken
+ * from the uint16_t stored immediately after vnode->chars[]
+ */
+static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
+ if(vnode->flagscount & kEqualOverflows){
+ // treat overflow bits as an extension of chars[]
+ uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
+ return vnode->equal + (((uint32_t)*overflow) << 16);
+ }else{
+ return vnode->equal;
+ }
+}
+
+/**
+ * calculates an equal link node ID of a horizontal node
+ * @param hnode The horizontal node containing the equal link
+ * @param index The index into hnode->entries[]
+ * @param nodeCount The length of hnode->entries[]
+ */
+static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
+ if(hnode->flagscount & kEqualOverflows){
+ //set overflow to point to the uint16_t containing the overflow bits
+ uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
+ overflow += index/4;
+ uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
+ return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
+ } else {
+ return hnode->entries[index].equal;
+ }
+}
+
+/**
+ * Returns the value stored in the specified node which is associated with its
+ * parent node.
+ * TODO: how to tell that value is stored in node or in offset? check whether
+ * node ID < fInfo->root!
+ */
+static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
+ uint16_t count = getCount((CompactTrieNode *)hnode);
+ uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
+
+ if(hnode->flagscount & kEqualOverflows)
+ overflowSize = (count + 3) / 4 * sizeof(uint16_t);
+ return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
+}
+
+static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
+ // calculate size of total node ID overflow storage in bytes
+ uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
+ return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));
+}
+
+static inline uint16_t getValue(const CompactTrieNode *node){
+ if(node->flagscount & kVerticalNode)
+ return getValue((const CompactTrieVerticalNode *)node);
+ else
+ return getValue((const CompactTrieHorizontalNode *)node);
+}
+
+//returns index of match in CompactTrieHorizontalNode.entries[] using binary search
+inline int16_t
+searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
+ UChar uc, uint16_t nodeCount){
+ int low = 0;
+ int high = nodeCount-1;
+ int middle;
+ while (high >= low) {
+ middle = (high+low)/2;
+ if (uc == entries[middle].ch) {
+ return middle;
+ }
+ else if (uc < entries[middle].ch) {
+ high = middle-1;
+ }
+ else {
+ low = middle+1;
+ }
+ }
+
+ return -1;
}
int32_t
@@ -466,17 +667,38 @@
int32_t maxLength,
int32_t *lengths,
int &count,
- int limit ) const {
+ int limit,
+ uint16_t *values /*= NULL*/) const {
+ if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
+ values = NULL;
+
// TODO: current implementation works in UTF-16 space
- const CompactTrieNode *node = getCompactNode(fData, fData->root);
+ const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
int mycount = 0;
UChar uc = utext_current32(text);
int i = 0;
+ // handle root node with only kEqualOverflows flag: assume horizontal node without parent
+ if(node != NULL){
+ const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
+ int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
+ if(index > -1){
+ node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
+ utext_next32(text);
+ uc = utext_current32(text);
+ ++i;
+ }else{
+ node = NULL;
+ }
+ }
+
while (node != NULL) {
// Check if the node we just exited ends a word
if (limit > 0 && (node->flagscount & kParentEndsWord)) {
+ if(values != NULL){
+ values[mycount] = getValue(node);
+ }
lengths[mycount++] = i;
--limit;
}
@@ -487,7 +709,7 @@
break;
}
- int nodeCount = (node->flagscount & kCountMask);
+ int nodeCount = getCount(node);
if (nodeCount == 0) {
// Special terminal node; return now
break;
@@ -507,35 +729,27 @@
// To get here we must have come through the whole list successfully;
// go on to the next node. Note that a word cannot end in the middle
// of a vertical node.
- node = getCompactNode(fData, vnode->equal);
+ node = getCompactNode(fInfo, calcEqualLink(vnode));
}
else {
// Horizontal node; do binary search
const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
- int low = 0;
- int high = nodeCount-1;
- int middle;
- node = NULL; // If we don't find a match, we'll fall out of the loop
- while (high >= low) {
- middle = (high+low)/2;
- if (uc == hnode->entries[middle].ch) {
- // We hit a match; get the next node and next character
- node = getCompactNode(fData, hnode->entries[middle].equal);
- utext_next32(text);
- uc = utext_current32(text);
- ++i;
- break;
- }
- else if (uc < hnode->entries[middle].ch) {
- high = middle-1;
- }
- else {
- low = middle+1;
- }
+ const CompactTrieHorizontalEntry *entries;
+ entries = hnode->entries;
+
+ int index = searchHorizontalEntries(entries, uc, nodeCount);
+ if(index > -1){ //
+ // We hit a match; get the next node and next character
+ node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
+ utext_next32(text);
+ uc = utext_current32(text);
+ ++i;
+ }else{
+ node = NULL; // If we don't find a match, we'll fall out of the loop
}
}
}
-exit:
+ exit:
count = mycount;
return i;
}
@@ -545,16 +759,16 @@
private:
UVector32 fNodeStack; // Stack of nodes to process
UVector32 fIndexStack; // Stack of where in node we are
- const CompactTrieHeader *fHeader; // Trie data
+ const CompactTrieInfo *fInfo; // Trie data
public:
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
public:
- CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
+ CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
: fNodeStack(status), fIndexStack(status) {
- fHeader = header;
- fNodeStack.push(header->root, status);
+ fInfo = info;
+ fNodeStack.push(info->root, status);
fIndexStack.push(0, status);
unistr.remove();
}
@@ -564,14 +778,14 @@
virtual StringEnumeration *clone() const {
UErrorCode status = U_ZERO_ERROR;
- return new CompactTrieEnumeration(fHeader, status);
+ return new CompactTrieEnumeration(fInfo, status);
}
virtual const UnicodeString * snext(UErrorCode &status);
// Very expensive, but this should never be used.
virtual int32_t count(UErrorCode &status) const {
- CompactTrieEnumeration counter(fHeader, status);
+ CompactTrieEnumeration counter(fInfo, status);
int32_t result = 0;
while (counter.snext(status) != NULL && U_SUCCESS(status)) {
++result;
@@ -582,7 +796,7 @@
virtual void reset(UErrorCode &status) {
fNodeStack.removeAllElements();
fIndexStack.removeAllElements();
- fNodeStack.push(fHeader->root, status);
+ fNodeStack.push(fInfo->root, status);
fIndexStack.push(0, status);
unistr.remove();
}
@@ -595,26 +809,34 @@
if (fNodeStack.empty() || U_FAILURE(status)) {
return NULL;
}
- const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
+ const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
int where = fIndexStack.peeki();
while (!fNodeStack.empty() && U_SUCCESS(status)) {
- int nodeCount = (node->flagscount & kCountMask);
+ int nodeCount;
+
+ bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
+ if(isRoot){
+ nodeCount = node->flagscount & kRootCountMask;
+ } else {
+ nodeCount = getCount(node);
+ }
+
UBool goingDown = FALSE;
if (nodeCount == 0) {
// Terminal node; go up immediately
fNodeStack.popi();
fIndexStack.popi();
- node = getCompactNode(fHeader, fNodeStack.peeki());
+ node = getCompactNode(fInfo, fNodeStack.peeki());
where = fIndexStack.peeki();
}
- else if (node->flagscount & kVerticalNode) {
+ else if ((node->flagscount & kVerticalNode) && !isRoot) {
// Vertical node
const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
if (where == 0) {
// Going down
- unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
+ unistr.append((const UChar *)vnode->chars, nodeCount);
fIndexStack.setElementAt(1, fIndexStack.size()-1);
- node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
where = fIndexStack.push(0, status);
goingDown = TRUE;
}
@@ -623,7 +845,7 @@
unistr.truncate(unistr.length()-nodeCount);
fNodeStack.popi();
fIndexStack.popi();
- node = getCompactNode(fHeader, fNodeStack.peeki());
+ node = getCompactNode(fInfo, fNodeStack.peeki());
where = fIndexStack.peeki();
}
}
@@ -638,7 +860,7 @@
// Push on next node
unistr.append((UChar)hnode->entries[where].ch);
fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
- node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
where = fIndexStack.push(0, status);
goingDown = TRUE;
}
@@ -646,12 +868,14 @@
// Going up
fNodeStack.popi();
fIndexStack.popi();
- node = getCompactNode(fHeader, fNodeStack.peeki());
+ node = getCompactNode(fInfo, fNodeStack.peeki());
where = fIndexStack.peeki();
}
}
+
// Check if the parent of the node we've just gone down to ends a
// word. If so, return it.
+ // The root node should never end up here.
if (goingDown && (node->flagscount & kParentEndsWord)) {
return &unistr;
}
@@ -664,7 +888,7 @@
if (U_FAILURE(status)) {
return NULL;
}
- return new CompactTrieEnumeration(fData, status);
+ return new CompactTrieEnumeration(fInfo, status);
}
//
@@ -672,21 +896,36 @@
// and back again
//
-// Helper classes to construct the compact trie
+enum CompactTrieNodeType {
+ kHorizontalType = 0,
+ kVerticalType = 1,
+ kValueType = 2
+};
+
+/**
+ * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
+ * construct the compact trie by storing information for each node and later
+ * writing the node to memory in a sequential format.
+ */
class BuildCompactTrieNode: public UMemory {
- public:
+public:
UBool fParentEndsWord;
- UBool fVertical;
+ CompactTrieNodeType fNodeType;
UBool fHasDuplicate;
+ UBool fEqualOverflows;
int32_t fNodeID;
UnicodeString fChars;
+ uint16_t fValue;
- public:
- BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
+public:
+ BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
+ UStack &nodes, UErrorCode &status, uint16_t value = 0) {
fParentEndsWord = parentEndsWord;
fHasDuplicate = FALSE;
- fVertical = vertical;
+ fNodeType = nodeType;
+ fEqualOverflows = FALSE;
fNodeID = nodes.size();
+ fValue = parentEndsWord? value : 0;
nodes.push(this, status);
}
@@ -694,87 +933,225 @@
}
virtual uint32_t size() {
- return sizeof(uint16_t);
+ if(fValue > 0)
+ return sizeof(uint16_t) * 2;
+ else
+ return sizeof(uint16_t);
}
virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
// Write flag/count
- *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
- | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
+
+ // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
+ // used as a 5th MSB.
+ U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
+
+ *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
+ ((fNodeID == 2)? (fChars.length() & kRootCountMask):
+ (
+ (fChars.length() & kCountMask) |
+ //((fChars.length() << 2) & kExceedsCount) |
+ (fNodeType == kVerticalType ? kVerticalNode : 0) |
+ (fParentEndsWord ? kParentEndsWord : 0 )
+ )
+ );
offset += sizeof(uint16_t);
}
+
+ virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
+ if(fValue > 0){
+ *((uint16_t *)(bytes+offset)) = fValue;
+ offset += sizeof(uint16_t);
+ }
+ }
+
+};
+
+/**
+ * Stores value of parent terminating nodes that have no more subtries.
+ */
+class BuildCompactTrieValueNode: public BuildCompactTrieNode {
+public:
+ BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
+ : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
+ }
+
+ virtual ~BuildCompactTrieValueNode(){
+ }
+
+ virtual uint32_t size() {
+ return sizeof(uint16_t) * 2;
+ }
+
+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
+ // don't write value directly to memory but store it in offset to be written later
+ //offset = fValue & kOffsetContainsValue;
+ BuildCompactTrieNode::write(bytes, offset, translate);
+ BuildCompactTrieNode::writeValue(bytes, offset);
+ }
};
class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
public:
UStack fLinks;
+ UBool fMayOverflow; //intermediate value for fEqualOverflows
public:
- BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
- : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
+ BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
+ : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
+ fMayOverflow = FALSE;
}
virtual ~BuildCompactTrieHorizontalNode() {
}
+ // It is impossible to know beforehand exactly how much space the node will
+ // need in memory before being written, because the node IDs in the equal
+ // links may or may not overflow after node coalescing. Therefore, this method
+ // returns the maximum size possible for the node.
virtual uint32_t size() {
- return offsetof(CompactTrieHorizontalNode,entries) +
- (fChars.length()*sizeof(CompactTrieHorizontalEntry));
+ uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
+ (fChars.length()*sizeof(CompactTrieHorizontalEntry));
+
+ if(fValue > 0)
+ estimatedSize += sizeof(uint16_t);
+
+ //estimate extra space needed to store overflow for node ID links
+ //may be more than what is actually needed
+ for(int i=0; i < fChars.length(); i++){
+ if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
+ fMayOverflow = TRUE;
+ break;
+ }
+ }
+ if(fMayOverflow) // NOTE(review): overflow space must equal ceil(fChars.length()/4) * sizeof(uint16_t) (matching the write path), but the expression below computes (2*len + 2)/4 bytes, which under-reserves (e.g. len==1 needs 2 bytes, gets 1) — size() is supposed to be a maximum; verify before applying
+ estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
+
+ return estimatedSize;
}
virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
- BuildCompactTrieNode::write(bytes, offset, translate);
int32_t count = fChars.length();
+
+ //if largest nodeID > 2^16, set flag
+ //large node IDs are more likely to be at the back of the array
+ for (int32_t i = count-1; i >= 0; --i) {
+ if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
+ fEqualOverflows = TRUE;
+ break;
+ }
+ }
+
+ BuildCompactTrieNode::write(bytes, offset, translate);
+
+ // write entries[] to memory
for (int32_t i = 0; i < count; ++i) {
CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
entry->ch = fChars[i];
entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
#ifdef DEBUG_TRIE_DICT
- if (entry->equal == 0) {
+
+ if ((entry->equal == 0) && !fEqualOverflows) {
fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
}
#endif
offset += sizeof(CompactTrieHorizontalEntry);
}
+
+ // append extra bits of equal nodes to end if fEqualOverflows
+ if (fEqualOverflows) {
+ uint16_t leftmostBits = 0;
+ for (int16_t i = 0; i < count; i++) {
+ leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
+
+ // write filled uint16_t to memory
+ if(i % 4 == 3){
+ *((uint16_t *)(bytes+offset)) = leftmostBits;
+ leftmostBits = 0;
+ offset += sizeof(uint16_t);
+ }
+ }
+
+ // pad last uint16_t with zeroes if necessary
+ int remainder = count % 4;
+ if (remainder > 0) {
+ *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
+ offset += sizeof(uint16_t);
+ }
+ }
+
+ BuildCompactTrieNode::writeValue(bytes, offset);
+ }
+
+ // returns leftmost bits of physical node link
+ uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
+ uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
+#ifdef DEBUG_TRIE_DICT
+ if (leftmostBits > 0xF) {
+ fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
+ i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
+ }
+#endif
+ return leftmostBits;
}
void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
fChars.append(ch);
fLinks.push(link, status);
}
+
};
class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
- public:
+public:
BuildCompactTrieNode *fEqual;
- public:
- BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
- : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
+public:
+ BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
+ : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
fEqual = NULL;
}
virtual ~BuildCompactTrieVerticalNode() {
}
+ // Returns the maximum possible size of this node. See comment in
+ // BuildCompactTrieHorizontal node for more information.
virtual uint32_t size() {
- return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
+ uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
+ if(fValue > 0){
+ estimatedSize += sizeof(uint16_t);
+ }
+
+ if(fEqual->fNodeID > 0xFFFF){
+ estimatedSize += sizeof(uint16_t);
+ }
+ return estimatedSize;
}
virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
+ fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
BuildCompactTrieNode::write(bytes, offset, translate);
node->equal = translate.elementAti(fEqual->fNodeID);
offset += sizeof(node->equal);
#ifdef DEBUG_TRIE_DICT
- if (node->equal == 0) {
+ if ((node->equal == 0) && !fEqualOverflows) {
fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
fEqual->fNodeID);
}
#endif
fChars.extract(0, fChars.length(), (UChar *)node->chars);
- offset += sizeof(uint16_t)*fChars.length();
+ offset += sizeof(UChar)*fChars.length();
+
+ // append the upper 16 bits of the equal-node link to the end if fEqualOverflows
+ if (fEqualOverflows) {
+ *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
+ offset += sizeof(uint16_t);
+ }
+
+ BuildCompactTrieNode::writeValue(bytes, offset);
}
void addChar(UChar ch) {
@@ -784,60 +1161,85 @@
void setLink(BuildCompactTrieNode *node) {
fEqual = node;
}
+
};
// Forward declaration
static void walkHorizontal(const TernaryNode *node,
BuildCompactTrieHorizontalNode *building,
UStack &nodes,
- UErrorCode &status);
+ UErrorCode &status,
+ Hashtable *values);
-// Convert one node. Uses recursion.
+// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
static BuildCompactTrieNode *
-compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
+compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
+ UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
if (U_FAILURE(status)) {
return NULL;
}
BuildCompactTrieNode *result = NULL;
UBool horizontal = (node->low != NULL || node->high != NULL);
if (horizontal) {
- BuildCompactTrieHorizontalNode *hResult =
- new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
+ BuildCompactTrieHorizontalNode *hResult;
+ if(values != NULL){
+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
+ } else {
+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
+ }
+
if (hResult == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
if (U_SUCCESS(status)) {
- walkHorizontal(node, hResult, nodes, status);
+ walkHorizontal(node, hResult, nodes, status, values);
result = hResult;
}
}
else {
- BuildCompactTrieVerticalNode *vResult =
- new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
+ BuildCompactTrieVerticalNode *vResult;
+ if(values != NULL){
+ vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
+ } else {
+ vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
+ }
+
if (vResult == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
}
else if (U_SUCCESS(status)) {
- UBool endsWord = FALSE;
+ uint16_t value = 0;
+ UBool endsWord = FALSE;
// Take up nodes until we end a word, or hit a node with < or > links
do {
vResult->addChar(node->ch);
- endsWord = (node->flags & kEndsWord) != 0;
+ value = node->flags;
+ endsWord = value > 0;
node = node->equal;
}
while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
+
if (node == NULL) {
if (!endsWord) {
status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie
}
- else {
+ else if(values != NULL){
+ UnicodeString key(value); //store value as a single-char UnicodeString
+ BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
+ if(link == NULL){
+ link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
+ values->put(key, link, status);
+ }
+ vResult->setLink(link);
+ } else {
vResult->setLink((BuildCompactTrieNode *)nodes[1]);
}
}
else {
- vResult->setLink(compactOneNode(node, endsWord, nodes, status));
+ vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
}
result = vResult;
}
@@ -849,19 +1251,28 @@
// Uses recursion.
static void walkHorizontal(const TernaryNode *node,
- BuildCompactTrieHorizontalNode *building,
- UStack &nodes,
- UErrorCode &status) {
+ BuildCompactTrieHorizontalNode *building,
+ UStack &nodes,
+ UErrorCode &status, Hashtable *values = NULL) {
while (U_SUCCESS(status) && node != NULL) {
if (node->low != NULL) {
- walkHorizontal(node->low, building, nodes, status);
+ walkHorizontal(node->low, building, nodes, status, values);
}
BuildCompactTrieNode *link = NULL;
if (node->equal != NULL) {
- link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
+ link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
}
- else if (node->flags & kEndsWord) {
- link = (BuildCompactTrieNode *)nodes[1];
+ else if (node->flags > 0) {
+ if(values != NULL) {
+ UnicodeString key(node->flags); //store value as a single-char UnicodeString
+ link = (BuildCompactTrieValueNode *) values->get(key);
+ if(link == NULL) {
+ link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
+ values->put(key, link, status);
+ }
+ } else {
+ link = (BuildCompactTrieNode *)nodes[1];
+ }
}
if (U_SUCCESS(status) && link != NULL) {
building->addNode(node->ch, link, status);
@@ -881,13 +1292,15 @@
_sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
+
// Check for comparing a node to itself, to avoid spurious duplicates
if (left == right) {
return 0;
}
+
// Most significant is type of node. Can never coalesce.
- if (left->fVertical != right->fVertical) {
- return left->fVertical - right->fVertical;
+ if (left->fNodeType != right->fNodeType) {
+ return left->fNodeType - right->fNodeType;
}
// Next, the "parent ends word" flag. If that differs, we cannot coalesce.
if (left->fParentEndsWord != right->fParentEndsWord) {
@@ -898,12 +1311,19 @@
if (result != 0) {
return result;
}
+
+ // If the node value differs, we should not coalesce.
+ // If values aren't stored, all fValues should be 0.
+ if (left->fValue != right->fValue) {
+ return left->fValue - right->fValue;
+ }
+
// We know they're both the same node type, so branch for the two cases.
- if (left->fVertical) {
+ if (left->fNodeType == kVerticalType) {
result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
- - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
+ - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
}
- else {
+ else if(left->fChars.length() > 0 && right->fChars.length() > 0){
// We need to compare the links vectors. They should be the
// same size because the strings were equal.
// We compare the node IDs instead of the pointers, to handle
@@ -914,9 +1334,10 @@
int32_t count = hleft->fLinks.size();
for (int32_t i = 0; i < count && result == 0; ++i) {
result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
- ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
+ ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
}
}
+
// If they are equal to each other, mark them (speeds coalescing)
if (result == 0) {
left->fHasDuplicate = TRUE;
@@ -1031,20 +1452,25 @@
// Add node 0, used as the NULL pointer/sentinel.
nodes.addElement((int32_t)0, status);
+ Hashtable *values = NULL; // Index of (unique) values
+ if (dict.fValued) {
+ values = new Hashtable(status);
+ }
+
// Start by creating the special empty node we use to indicate that the parent
// terminates a word. This must be node 1, because the builder assumes
- // that.
+ // that. This node will never be used for tries storing numerical values.
if (U_FAILURE(status)) {
return NULL;
}
- BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
+ BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
if (terminal == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
// This call does all the work of building the new trie structure. The root
- // will be node 2.
- BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
+ // will have node ID 2 before writing to memory.
+ BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
#ifdef DEBUG_TRIE_DICT
(void) ::times(&timing);
fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
@@ -1077,21 +1503,37 @@
return NULL;
}
+ //map terminal value nodes
+ int valueCount = 0;
+ UVector valueNodes(status);
+ if(values != NULL) {
+ valueCount = values->count(); //number of unique terminal value nodes
+ }
+
+ // map non-terminal nodes
+ int valuePos = 1;//, nodePos = valueCount + valuePos;
+ nodeCount = valueCount + valuePos;
for (i = 1; i < count; ++i) {
node = (BuildCompactTrieNode *)nodes[i];
if (node->fNodeID == i) {
// Only one node out of each duplicate set is used
- if (i >= translate.size()) {
+ if (node->fNodeID >= translate.size()) {
// Logically extend the mapping table
- translate.setSize(i+1);
+ translate.setSize(i + 1);
+ }
+ //translate.setElementAt(object, index)!
+ if(node->fNodeType == kValueType) {
+ valueNodes.addElement(node, status);
+ translate.setElementAt(valuePos++, i);
+ } else {
+ translate.setElementAt(nodeCount++, i);
}
- translate.setElementAt(nodeCount++, i);
totalSize += node->size();
}
}
-
- // Check for overflowing 16 bits worth of nodes.
- if (nodeCount > 0x10000) {
+
+ // Check for overflowing 20 bits worth of nodes.
+ if (nodeCount > 0x100000) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
@@ -1111,9 +1553,14 @@
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
-
+
CompactTrieHeader *header = (CompactTrieHeader *)bytes;
- header->size = totalSize;
+ //header->size = totalSize;
+ if(dict.fValued){
+ header->magic = COMPACT_TRIE_MAGIC_3;
+ } else {
+ header->magic = COMPACT_TRIE_MAGIC_2;
+ }
header->nodeCount = nodeCount;
header->offsets[0] = 0; // Sentinel
header->root = translate.elementAti(root->fNodeID);
@@ -1123,23 +1570,40 @@
}
#endif
uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
- nodeCount = 1;
+ nodeCount = valueCount + 1;
+
+ // Write terminal value nodes to memory
+ for (i=0; i < valueNodes.size(); i++) {
+ //header->offsets[i + 1] = offset;
+ uint32_t tmpOffset = 0;
+ node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
+ //header->offsets[i + 1] = (uint32_t)node->fValue;
+ node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
+ }
+
// Now write the data
for (i = 1; i < count; ++i) {
node = (BuildCompactTrieNode *)nodes[i];
- if (node->fNodeID == i) {
+ if (node->fNodeID == i && node->fNodeType != kValueType) {
header->offsets[nodeCount++] = offset;
node->write(bytes, offset, translate);
}
}
+
+ //free all extra space — NOTE(review): the uprv_realloc() return value below is discarded; if the allocator moves the block, 'bytes' and 'header' dangle and the following header->size store writes freed memory. Capture the result (bytes = uprv_realloc(...)) and recompute 'header'.
+ uprv_realloc(bytes, offset);
+ header->size = offset;
+
#ifdef DEBUG_TRIE_DICT
+ fprintf(stdout, "Space freed: %d\n", totalSize-offset);
+
(void) ::times(&timing);
fprintf(stderr, "Trie built, time user %f system %f\n",
(double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
(double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
previous = timing;
fprintf(stderr, "Final offset is %d\n", offset);
-
+
// Collect statistics on node types and sizes
int hCount = 0;
int vCount = 0;
@@ -1148,68 +1612,85 @@
size_t hItemCount = 0;
size_t vItemCount = 0;
uint32_t previousOff = offset;
- for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
+ uint32_t numOverflow = 0;
+ uint32_t valueSpace = 0;
+ for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
const CompactTrieNode *node = getCompactNode(header, nodeIdx);
- if (node->flagscount & kVerticalNode) {
+ int itemCount;
+ if(nodeIdx == header->root)
+ itemCount = node->flagscount & kRootCountMask;
+ else
+ itemCount = getCount(node);
+ if(node->flagscount & kEqualOverflows){
+ numOverflow++;
+ }
+ if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
vCount += 1;
- vItemCount += (node->flagscount & kCountMask);
+ vItemCount += itemCount;
vSize += previousOff-header->offsets[nodeIdx];
}
else {
hCount += 1;
- hItemCount += (node->flagscount & kCountMask);
- hSize += previousOff-header->offsets[nodeIdx];
+ hItemCount += itemCount;
+ if(nodeIdx >= header->root) {
+ hSize += previousOff-header->offsets[nodeIdx];
+ }
}
+
+ if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
+ valueSpace += sizeof(uint16_t);
previousOff = header->offsets[nodeIdx];
}
fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
(double)hSize/hCount, (double)hItemCount/hCount);
fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
(double)vSize/vCount, (double)vItemCount/vCount);
+ fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
+ fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
#endif
if (U_FAILURE(status)) {
uprv_free(bytes);
header = NULL;
}
- else {
- header->magic = COMPACT_TRIE_MAGIC_1;
- }
return header;
}
// Forward declaration
static TernaryNode *
-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );
-
+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
// Convert a horizontal node (or subarray thereof) into a ternary subtrie
static TernaryNode *
-unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
- int low, int high, UErrorCode &status ) {
+unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
+ int low, int high, int nodeCount, UErrorCode &status) {
if (U_FAILURE(status) || low > high) {
return NULL;
}
int middle = (low+high)/2;
- TernaryNode *result = new TernaryNode(array[middle].ch);
+ TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
if (result == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
- const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
if (equal->flagscount & kParentEndsWord) {
- result->flags |= kEndsWord;
+ if(info->magic == COMPACT_TRIE_MAGIC_3){
+ result->flags = getValue(equal);
+ }else{
+ result->flags |= kEndsWord;
+ }
}
- result->low = unpackHorizontalArray(header, array, low, middle-1, status);
- result->high = unpackHorizontalArray(header, array, middle+1, high, status);
- result->equal = unpackOneNode(header, equal, status);
+ result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
+ result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
+ result->equal = unpackOneNode(info, equal, status);
return result;
}
// Convert one compact trie node into a ternary subtrie
static TernaryNode *
-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
- int nodeCount = (node->flagscount & kCountMask);
+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
+ int nodeCount = getCount(node);
if (nodeCount == 0 || U_FAILURE(status)) {
// Failure, or terminal node
return NULL;
@@ -1234,29 +1715,41 @@
previous = latest;
}
if (latest != NULL) {
- const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
if (equal->flagscount & kParentEndsWord) {
- latest->flags |= kEndsWord;
+ if(info->magic == COMPACT_TRIE_MAGIC_3){
+ latest->flags = getValue(equal);
+ } else {
+ latest->flags |= kEndsWord;
+ }
}
- latest->equal = unpackOneNode(header, equal, status);
+ latest->equal = unpackOneNode(info, equal, status);
}
return head;
}
else {
// Horizontal node
const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
- return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
+ return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
}
}
+// returns a MutableTrieDictionary generated from the CompactTrieDictionary
MutableTrieDictionary *
CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
- MutableTrieDictionary *result = new MutableTrieDictionary( status );
+ MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
if (result == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
- TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
+ // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly
+ // because only kEqualOverflows flag should be checked in root's flagscount
+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
+ getCompactNode(fInfo, fInfo->root);
+ uint16_t nodeCount = hnode->flagscount & kRootCountMask;
+ TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
+ nodeCount, status);
+
if (U_FAILURE(status)) {
delete root; // Clean up
delete result;
@@ -1270,8 +1763,8 @@
U_CAPI int32_t U_EXPORT2
triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
- UErrorCode *status) {
-
+ UErrorCode *status) {
+
if (status == NULL || U_FAILURE(*status)) {
return 0;
}
@@ -1286,14 +1779,14 @@
//
const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
- pInfo->dataFormat[1]==0x72 &&
- pInfo->dataFormat[2]==0x44 &&
- pInfo->dataFormat[3]==0x63 &&
- pInfo->formatVersion[0]==1 )) {
+ pInfo->dataFormat[1]==0x72 &&
+ pInfo->dataFormat[2]==0x44 &&
+ pInfo->dataFormat[3]==0x63 &&
+ pInfo->formatVersion[0]==1 )) {
udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
- pInfo->dataFormat[0], pInfo->dataFormat[1],
- pInfo->dataFormat[2], pInfo->dataFormat[3],
- pInfo->formatVersion[0]);
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
*status=U_UNSUPPORTED_ERROR;
return 0;
}
@@ -1311,8 +1804,10 @@
//
const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
- if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
- || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
+ uint32_t magic = ds->readUInt32(header->magic);
+    if ((magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3)
+            || (magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1))
+            || (magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader)))
{
udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
*status=U_UNSUPPORTED_ERROR;
@@ -1333,10 +1828,10 @@
//
if (length < sizeWithUData) {
udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
- totalSize);
+ totalSize);
*status=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
- }
+ }
//
// Swap the Data. Do the data itself first, then the CompactTrieHeader, because
@@ -1355,20 +1850,38 @@
}
// We need to loop through all the nodes in the offset table, and swap each one.
- uint16_t nodeCount = ds->readUInt16(header->nodeCount);
+ uint32_t nodeCount, rootId;
+    if(magic == COMPACT_TRIE_MAGIC_1) {
+ nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
+ rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
+ } else {
+ nodeCount = ds->readUInt32(header->nodeCount);
+ rootId = ds->readUInt32(header->root);
+ }
+
// Skip node 0, which should always be 0.
- for (int i = 1; i < nodeCount; ++i) {
+ for (uint32_t i = 1; i < nodeCount; ++i) {
uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
uint16_t flagscount = ds->readUInt16(inNode->flagscount);
- uint16_t itemCount = flagscount & kCountMask;
+ uint16_t itemCount = getCount(inNode);
+ //uint16_t itemCount = flagscount & kCountMask;
ds->writeUInt16(&outNode->flagscount, flagscount);
if (itemCount > 0) {
- if (flagscount & kVerticalNode) {
+ uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
+            if ((flagscount & kVerticalNode) && i != rootId) {
+ if(flagscount & kEqualOverflows){
+ // include overflow bits
+ overflow += 1;
+ }
+                if (magic == COMPACT_TRIE_MAGIC_3 && (flagscount & kEndsParentWord)) {
+ //include values
+ overflow += 1;
+ }
ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
- itemCount*sizeof(uint16_t),
- outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
+ (itemCount + overflow)*sizeof(uint16_t),
+ outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
}
@@ -1381,26 +1894,62 @@
word = ds->readUInt16(inHNode->entries[j].equal);
ds->writeUInt16(&outHNode->entries[j].equal, word);
}
+
+ // swap overflow/value information
+ if(flagscount & kEqualOverflows){
+ overflow += (itemCount + 3) / 4;
+ }
+
+                if (magic == COMPACT_TRIE_MAGIC_3 && i != rootId && (flagscount & kEndsParentWord)) {
+ //include values
+ overflow += 1;
+ }
+
+ uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
+ uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
+ for(int j = 0; j<overflow; j++){
+ uint16_t extraInfo = ds->readUInt16(*inOverflow);
+ ds->writeUInt16(outOverflow, extraInfo);
+
+ inOverflow++;
+ outOverflow++;
+ }
}
}
}
#endif
- // All the data in all the nodes consist of 16 bit items. Swap them all at once.
- uint16_t nodeCount = ds->readUInt16(header->nodeCount);
- uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
- ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
-
// Swap the header
ds->writeUInt32(&outputHeader->size, totalSize);
- uint32_t magic = ds->readUInt32(header->magic);
ds->writeUInt32(&outputHeader->magic, magic);
- ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
- uint16_t root = ds->readUInt16(header->root);
- ds->writeUInt16(&outputHeader->root, root);
- ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
- sizeof(uint32_t)*(int32_t)nodeCount,
- outBytes+offsetof(CompactTrieHeader,offsets), status);
+
+ uint32_t nodeCount;
+ uint32_t offsetPos;
+    if (magic == COMPACT_TRIE_MAGIC_1) {
+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
+ CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
+
+ nodeCount = ds->readUInt16(headerV1->nodeCount);
+ ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
+ uint16_t root = ds->readUInt16(headerV1->root);
+ ds->writeUInt16(&outputHeaderV1->root, root);
+ offsetPos = offsetof(CompactTrieHeaderV1,offsets);
+ } else {
+ nodeCount = ds->readUInt32(header->nodeCount);
+ ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
+ uint32_t root = ds->readUInt32(header->root);
+ ds->writeUInt32(&outputHeader->root, root);
+ offsetPos = offsetof(CompactTrieHeader,offsets);
+ }
+
+ // All the data in all the nodes consist of 16 bit items. Swap them all at once.
+ uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
+ ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
+
+ //swap offsets
+ ds->swapArray32(ds, inBytes+offsetPos,
+ sizeof(uint32_t)*(uint32_t)nodeCount,
+ outBytes+offsetPos, status);
return sizeWithUData;
}
--- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700
+++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800
@@ -47,7 +47,6 @@
U_NAMESPACE_BEGIN
class StringEnumeration;
-struct CompactTrieHeader;
/*******************************************************************
* TrieWordDictionary
@@ -72,23 +71,29 @@
*/
virtual ~TrieWordDictionary();
+ /**
+ * <p>Returns true if the dictionary contains values associated with each word.</p>
+ */
+ virtual UBool getValued() const = 0;
+
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A UText representing the text. The
* iterator is left after the longest prefix match in the dictionary.
- * @param start The current position in text.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
+ * @param values An array that is filled with the values associated with the matched words.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
- int limit ) const = 0;
+ int limit,
+ uint16_t *values = NULL) const = 0;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
@@ -128,6 +133,12 @@
UText *fIter;
+ /**
+ * A UText for internal use
+ * @internal
+ */
+ UBool fValued;
+
friend class CompactTrieDictionary; // For fast conversion
public:
@@ -138,14 +149,29 @@
* @param median A UChar around which to balance the trie. Ideally, it should
* begin at least one word that is near the median of the set in the dictionary
* @param status A status code recording the success of the call.
+ * @param containsValue True if the dictionary stores values associated with each word.
*/
- MutableTrieDictionary( UChar median, UErrorCode &status );
+ MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );
/**
* <p>Virtual destructor.</p>
*/
virtual ~MutableTrieDictionary();
+ /**
+ * Indicate whether the MutableTrieDictionary stores values associated with each word
+ */
+ void setValued(UBool valued){
+ fValued = valued;
+ }
+
+ /**
+ * <p>Returns true if the dictionary contains values associated with each word.</p>
+ */
+ virtual UBool getValued() const {
+ return fValued;
+ }
+
/**
* <p>Find dictionary words that match the text.</p>
*
@@ -155,13 +181,15 @@
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
+ * @param values An array that is filled with the values associated with the matched words.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
int32_t maxLength,
int32_t *lengths,
int &count,
- int limit ) const;
+ int limit,
+ uint16_t *values = NULL) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
@@ -173,15 +201,17 @@
virtual StringEnumeration *openWords( UErrorCode &status ) const;
/**
- * <p>Add one word to the dictionary.</p>
+ * <p>Add one word to the dictionary with an optional associated value.</p>
*
* @param word A UChar buffer containing the word.
* @param length The length of the word.
- * @param status The resultant status
+ * @param status The resultant status.
+ * @param value The nonzero value associated with this word.
*/
virtual void addWord( const UChar *word,
int32_t length,
- UErrorCode &status);
+ UErrorCode &status,
+ uint16_t value = 0);
#if 0
/**
@@ -203,8 +233,9 @@
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
- * @param parent The parent of the current node
- * @param pMatched The returned parent node matched the input
+ * @param parent The parent of the current node.
+     * @param pMatched The returned parent node matched the input.
+ * @param values An array that is filled with the values associated with the matched words.
* @return The number of characters in text that were matched.
*/
virtual int32_t search( UText *text,
@@ -213,40 +244,46 @@
int &count,
int limit,
TernaryNode *&parent,
- UBool &pMatched ) const;
+ UBool &pMatched,
+ uint16_t *values = NULL) const;
private:
/**
* <p>Private constructor. The root node it not allocated.</p>
*
* @param status A status code recording the success of the call.
+ * @param containsValues True if the dictionary will store a value associated
+ * with each word added.
*/
- MutableTrieDictionary( UErrorCode &status );
+    MutableTrieDictionary( UErrorCode &status, UBool containsValues = FALSE );
};
/*******************************************************************
* CompactTrieDictionary
*/
+//forward declarations
+struct CompactTrieHeader;
+struct CompactTrieInfo;
+
/**
* <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
* to save space.</p>
*/
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
private:
- /**
- * The root node of the trie
- */
+ /**
+ * The header of the CompactTrieDictionary which contains all info
+ */
- const CompactTrieHeader *fData;
-
- /**
- * A UBool indicating whether or not we own the fData.
- */
+ CompactTrieInfo *fInfo;
+ /**
+ * A UBool indicating whether or not we own the fData.
+ */
UBool fOwnData;
- UDataMemory *fUData;
+ UDataMemory *fUData;
public:
/**
* <p>Construct a dictionary from a UDataMemory.</p>
@@ -277,6 +314,11 @@
*/
virtual ~CompactTrieDictionary();
+ /**
+ * <p>Returns true if the dictionary contains values associated with each word.</p>
+ */
+ virtual UBool getValued() const;
+
/**
* <p>Find dictionary words that match the text.</p>
*
@@ -286,13 +328,15 @@
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
+ * @param values An array that is filled with the values associated with the matched words.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( UText *text,
- int32_t rangeEnd,
+ int32_t maxLength,
int32_t *lengths,
int &count,
- int limit ) const;
+ int limit,
+ uint16_t *values = NULL) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
@@ -311,7 +355,7 @@
virtual uint32_t dataSize() const;
/**
- * <p>Return a void * pointer to the compact data, platform-endian.</p>
+ * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p>
*
* @return The data for the compact dictionary, suitable for passing to the
* constructor.
@@ -342,5 +386,5 @@
U_NAMESPACE_END
- /* TRIEDICT_H */
+/* TRIEDICT_H */
#endif
--- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700
+++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800
@@ -509,8 +520,9 @@
#################################################### CTD
# CTD FILES
-$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
+# .ctd file now generated regardless of whether dictionary file exists
+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
+ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt
#################################################### CFU
# CFU FILES
--- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700
+++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800
@@ -17,5 +17,8 @@
}
dictionaries{
Thai:process(dependency){"thaidict.ctd"}
+ Hani:process(dependency){"cjdict.ctd"}
+ Hira:process(dependency){"cjdict.ctd"}
+ Kata:process(dependency){"cjdict.ctd"}
}
}
--- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800
+++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800
@@ -25,6 +25,9 @@
</icu:boundaries>
<icu:dictionaries>
<icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
+ <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>
+ <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>
+ <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>
</icu:dictionaries>
</icu:breakIteratorData>
</special>
--- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700
+++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800
@@ -2188,21 +2188,21 @@
{
- UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
+ UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);
const UChar *got = NULL, *exp=NULL;
int32_t gotLen = 0, expLen=0;
- ja = ures_getByKey(ja, "boundaries", ja, &status);
- exp = tres_getString(ja, -1, "word", &expLen, &status);
+ th = ures_getByKey(th, "boundaries", th, &status);
+ exp = tres_getString(th, -1, "grapheme", &expLen, &status);
tb = ures_getByKey(aliasB, "boundaries", tb, &status);
- got = tres_getString(tb, -1, "word", &gotLen, &status);
+ got = tres_getString(tb, -1, "grapheme", &gotLen, &status);
if(U_FAILURE(status)) {
log_err("%s trying to read str boundaries\n", u_errorName(status));
} else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
log_err("Referencing alias didn't get the right data\n");
}
- ures_close(ja);
+ ures_close(th);
status = U_ZERO_ERROR;
}
/* simple alias */
--- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700
+++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800
@@ -156,9 +156,13 @@
if(*a!=*b){
errln("Failed: boilerplate method operator!= does not return correct results");
}
- BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
- if(a && c){
- if(*c==*a){
+    // Japanese word break iterator is identical to root with
+ // a dictionary-based break iterator, but Thai character break iterator
+ // is still different from Root.
+ BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
+ BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status);
+ if(c && d){
+ if(*c==*d){
errln("Failed: boilerplate method opertator== does not return correct results");
}
}else{
@@ -167,6 +171,7 @@
delete a;
delete b;
delete c;
+ delete d;
}
void RBBIAPITest::TestgetRules()
@@ -635,21 +640,21 @@
//
void RBBIAPITest::TestRuleStatus() {
UChar str[30];
- u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
- // 012345678901234567 8 9 0 1 2 3 4 5 6
- // Ideographic Katakana Hiragana
+ //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
+ // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
+ u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
+ // 012345678901234567 8 9 0
+ // Katakana
str, 30);
UnicodeString testString1(str);
- int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
+ int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
- UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
- UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA};
+ UBRK_WORD_IDEO, UBRK_WORD_NONE};
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
- UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
- UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
+ UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
UErrorCode status=U_ZERO_ERROR;
@@ -888,9 +893,11 @@
URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
{
+#if 0 // With a dictionary based word breaking, ja_word is identical to root.
if (ja_word && *ja_word == *root_word) {
errln("japan not different from root");
}
+#endif
}
{
--- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700
+++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800
@@ -35,6 +35,8 @@
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
+#include "unicode/numfmt.h"
+#include "unicode/uscript.h"
#define TEST_ASSERT(x) {if (!(x)) { \
errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
@@ -138,11 +140,13 @@
if (exec) TestThaiBreaks(); break;
case 23: name = "TestTailoredBreaks";
if (exec) TestTailoredBreaks(); break;
+ case 24: name = "TestTrieDictWithValue";
+ if(exec) TestTrieDictWithValue(); break;
#else
- case 21: case 22: case 23: name = "skip";
+ case 21: case 22: case 23: case 24: name = "skip";
break;
#endif
-    case 24: name = "TestDictRules";
+    case 25: name = "TestDictRules";
         if (exec) TestDictRules(); break;
-    case 25: name = "TestBug5532";
+    case 26: name = "TestBug5532";
         if (exec) TestBug5532(); break;
@@ -607,6 +611,8 @@
void RBBITest::TestJapaneseWordBreak() {
+// TODO: Rewrite this test for a dictionary-based word breaking.
+#if 0
UErrorCode status = U_ZERO_ERROR;
BITestData japaneseWordSelection(status);
@@ -628,6 +634,7 @@
generalIteratorTest(*e, japaneseWordSelection);
delete e;
+#endif
}
void RBBITest::TestTrieDict() {
@@ -849,6 +856,372 @@
delete compact2;
}
+/*TODO: delete later*/
+inline void writeEnumerationToFile(StringEnumeration *enumer, const char *filename){
+ UErrorCode status = U_ZERO_ERROR;
+ FILE *outfile = fopen(filename,"w");
+ UConverter *cvt = ucnv_open("UTF-8", &status);
+ if (U_FAILURE(status))
+ return;
+ if(outfile != NULL){
+ status = U_ZERO_ERROR;
+ const UnicodeString *word = enumer->snext(status);
+ while (word != NULL && U_SUCCESS(status)) {
+ char u8word[500];
+ status = U_ZERO_ERROR;
+ ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
+ &status);
+ fprintf(outfile,"%s\n", u8word);
+ status = U_ZERO_ERROR;
+ word = enumer->snext(status);
+ }
+ fclose(outfile);
+ }
+ ucnv_close(cvt);
+}
+
+// A very simple helper class to streamline the buffer handling in
+// TestTrieDictWithValue
+template<class T, size_t N>
+class AutoBuffer {
+ public:
+ AutoBuffer(size_t size) : buffer(stackBuffer) {
+ if (size > N)
+ buffer = new T[size];
+ }
+ ~AutoBuffer() {
+ if (buffer != stackBuffer)
+ delete [] buffer;
+ }
+ T* elems() {
+ return buffer;
+ }
+ const T& operator[] (size_t i) const {
+ return buffer[i];
+ }
+ T& operator[] (size_t i) {
+ return buffer[i];
+ }
+ private:
+ T stackBuffer[N];
+ T* buffer;
+ AutoBuffer();
+};
+
+//----------------------------------------------------------------------------
+//
+// TestTrieDictWithValue Test trie dictionaries with logprob values and
+// more than 2^16 nodes after compaction.
+//
+//----------------------------------------------------------------------------
+void RBBITest::TestTrieDictWithValue() {
+ UErrorCode status = U_ZERO_ERROR;
+
+ //
+ // Open and read the test data file.
+ //
+ const char *testDataDirectory = IntlTest::getSourceTestData(status);
+ const char *filename = "cjdict-truncated.txt";
+ char testFileName[1000];
+ if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
+ errln("Can't open test data. Path too long.");
+ return;
+ }
+ strcpy(testFileName, testDataDirectory);
+ strcat(testFileName, filename);
+
+ // Items needing deleting at the end
+ MutableTrieDictionary *mutableDict = NULL;
+ CompactTrieDictionary *compactDict = NULL;
+ UnicodeSet *breaks = NULL;
+ UChar *testFile = NULL;
+ StringEnumeration *enumer1 = NULL;
+ StringEnumeration *enumer2 = NULL;
+ MutableTrieDictionary *mutable2 = NULL;
+ StringEnumeration *cloneEnum = NULL;
+ CompactTrieDictionary *compact2 = NULL;
+ NumberFormat *nf = NULL;
+ UText *originalText = NULL, *cloneText = NULL;
+
+ const UnicodeString *originalWord = NULL;
+ const UnicodeString *cloneWord = NULL;
+ UChar *current;
+ UChar *word;
+ UChar uc;
+ int32_t wordLen;
+ int32_t wordCount;
+ int32_t testCount;
+ int32_t valueLen;
+ int counter = 0;
+
+ int len;
+ testFile = ReadAndConvertFile(testFileName, len, NULL, status);
+ if (U_FAILURE(status)) {
+ goto cleanup; /* something went wrong, error already output */
+ }
+
+ mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
+ if (U_FAILURE(status)) {
+ errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ breaks = new UnicodeSet;
+ breaks->add(0x000A); // Line Feed
+ breaks->add(0x000D); // Carriage Return
+ breaks->add(0x2028); // Line Separator
+ breaks->add(0x2029); // Paragraph Separator
+ breaks->add(0x0009); // Tab character
+
+ // Now add each non-comment line of the file as a word.
+ current = testFile;
+ word = current;
+ uc = *current++;
+ wordLen = 0;
+ wordCount = 0;
+ nf = NumberFormat::createInstance(status);
+
+ while (uc) {
+ UnicodeString ucharValue;
+ valueLen = 0;
+
+ if (uc == 0x0023) { // #comment line, skip
+ while (uc && !breaks->contains(uc)) {
+ uc = *current++;
+ }
+ }
+ else{
+ while (uc && !breaks->contains(uc)) {
+ ++wordLen;
+ uc = *current++;
+ }
+ if(uc == 0x0009){ //separator is a tab char, read in num after tab
+ uc = *current++;
+ while (uc && !breaks->contains(uc)) {
+ ucharValue.append(uc);
+ uc = *current++;
+ }
+ }
+ }
+ if (wordLen > 0) {
+ Formattable value((int32_t)0);
+ nf->parse(ucharValue.getTerminatedBuffer(), value, status);
+
+ if(U_FAILURE(status)){
+ errln("parsing of value failed when reading in dictionary\n");
+ goto cleanup;
+ }
+ mutableDict->addWord(word, wordLen, status, value.getLong());
+ if (U_FAILURE(status)) {
+ errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
+ goto cleanup;
+ }
+ wordCount += 1;
+ }
+
+ // Find beginning of next line
+ while (uc && breaks->contains(uc)) {
+ uc = *current++;
+ }
+ word = current-1;
+ wordLen = 0;
+ }
+
+ if (wordCount < 50) {
+ errln("Word count (%d) unreasonably small\n", wordCount);
+ goto cleanup;
+ }
+
+ enumer1 = mutableDict->openWords(status);
+ if (U_FAILURE(status)) {
+ errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ testCount = 0;
+ if (wordCount != (testCount = enumer1->count(status))) {
+ errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
+ testCount, wordCount, u_errorName(status));
+ goto cleanup;
+ }
+
+ // Now compact it
+ compactDict = new CompactTrieDictionary(*mutableDict, status);
+ if (U_FAILURE(status)) {
+ errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ enumer2 = compactDict->openWords(status);
+ if (U_FAILURE(status)) {
+ errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+
+ //delete later
+// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
+// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
+
+ enumer1->reset(status);
+ enumer2->reset(status);
+
+ originalWord = enumer1->snext(status);
+ cloneWord = enumer2->snext(status);
+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
+ if (*originalWord != *cloneWord) {
+ errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
+ counter, originalWord->length(), cloneWord->length());
+ goto cleanup;
+ }
+
+ // check if attached values of the same word in both dictionaries tally
+#if 0
+ int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
+ uint16_t values1[originalWord->length()], values2[cloneWord->length()];
+#endif
+ AutoBuffer<int32_t, 20> lengths1(originalWord->length());
+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
+ AutoBuffer<uint16_t, 20> values1(originalWord->length());
+ AutoBuffer<uint16_t, 20> values2(cloneWord->length());
+
+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
+
+ int count1, count2;
+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
+ compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
+
+ if(values1[count1-1] != values2[count2-1]){
+ errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
+ counter, values1[count1-1], values2[count2-1]);
+ goto cleanup;
+ }
+
+ counter++;
+ originalWord = enumer1->snext(status);
+ cloneWord = enumer2->snext(status);
+ }
+ if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
+ errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
+ }
+
+ delete enumer1;
+ enumer1 = NULL;
+ delete enumer2;
+ enumer2 = NULL;
+
+ // Now un-compact it
+ mutable2 = compactDict->cloneMutable(status);
+ if (U_FAILURE(status)) {
+ errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ cloneEnum = mutable2->openWords(status);
+ if (U_FAILURE(status)) {
+ errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ if (wordCount != (testCount = cloneEnum->count(status))) {
+ errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
+ testCount, wordCount, u_errorName(status));
+ goto cleanup;
+ }
+
+ // Compact original dictionary to clone. Note that we can only compare the same kind of
+ // dictionary as the order of the enumerators is not guaranteed to be the same between
+ // different kinds
+ enumer1 = mutableDict->openWords(status);
+ if (U_FAILURE(status)) {
+ errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ counter = 0;
+ originalWord = enumer1->snext(status);
+ cloneWord = cloneEnum->snext(status);
+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
+ if (*originalWord != *cloneWord) {
+ errln("Original and cloned MutableTrieDictionary word mismatch\n");
+ goto cleanup;
+ }
+
+ // check if attached values of the same word in both dictionaries tally
+ AutoBuffer<int32_t, 20> lengths1(originalWord->length());
+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
+ AutoBuffer<uint16_t, 20> values1(originalWord->length());
+ AutoBuffer<uint16_t, 20> values2(cloneWord->length());
+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
+
+ int count1, count2;
+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
+ mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
+
+ if(values1[count1-1] != values2[count2-1]){
+ errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
+ counter, values1[count1-1], values2[count2-1]);
+ goto cleanup;
+ }
+
+ counter++;
+
+ originalWord = enumer1->snext(status);
+ cloneWord = cloneEnum->snext(status);
+ }
+
+ if (U_FAILURE(status)) {
+ errln("Enumeration failed: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ if (originalWord != cloneWord) {
+ errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
+ goto cleanup;
+ }
+
+ // Test the data copying constructor for CompactTrieDict, and the data access APIs.
+ compact2 = new CompactTrieDictionary(compactDict->data(), status);
+ if (U_FAILURE(status)) {
+ errln("CompactTrieDictionary(const void *,...) failed\n");
+ goto cleanup;
+ }
+
+ if (compact2->dataSize() == 0) {
+ errln("CompactTrieDictionary->dataSize() == 0\n");
+ goto cleanup;
+ }
+
+ // Now count the words via the second dictionary
+ delete enumer1;
+ enumer1 = compact2->openWords(status);
+ if (U_FAILURE(status)) {
+ errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
+ goto cleanup;
+ }
+
+ if (wordCount != (testCount = enumer1->count(status))) {
+ errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
+ testCount, wordCount, u_errorName(status));
+ goto cleanup;
+ }
+
+ cleanup:
+ delete compactDict;
+ delete mutableDict;
+ delete breaks;
+ delete[] testFile;
+ delete enumer1;
+ delete mutable2;
+ delete cloneEnum;
+ delete compact2;
+ utext_close(originalText);
+ utext_close(cloneText);
+
+
+}
//----------------------------------------------------------------------------
//
@@ -1870,8 +2243,15 @@
// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
"\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
+#if 0
static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 };
static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
+#endif
+// There's no separate Japanese word break iterator. Root is the same as Japanese.
+// Our dictionary-based iterator has to be tweaked to better handle U+3005,
+// U+3007, U+300B and some other cases.
+static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
+static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
// UBreakIteratorType UBRK_SENTENCE, Locale "el"
// Add break after Greek question mark (cldrbug #2069).
@@ -2672,6 +3052,8 @@
UnicodeSet *fNewlineSet;
UnicodeSet *fKatakanaSet;
UnicodeSet *fALetterSet;
+ // TODO(jungshik): Do we still need this change?
+ // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fMidNumLetSet;
UnicodeSet *fMidLetterSet;
UnicodeSet *fMidNumSet;
@@ -2680,6 +3062,7 @@
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
+ UnicodeSet *fDictionaryCjkSet;
RegexMatcher *fMatcher;
@@ -2696,12 +3079,24 @@
fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
- fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
+ fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
+ // Exclude Hangul syllables from ALetterSet during testing.
+ // Leave CJK dictionary characters out from the monkey tests!
+#if 0
+ fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
+ "[\\p{Line_Break = Complex_Context}"
+ "-\\p{Grapheme_Cluster_Break = Extend}"
+ "-\\p{Grapheme_Cluster_Break = Control}"
+ "]]",
+ status);
+#endif
+ fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
+ fALetterSet->removeAll(*fDictionaryCjkSet);
fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
- fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
+ fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status);
fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
@@ -2725,13 +3120,14 @@
fOtherSet->removeAll(*fFormatSet);
fOtherSet->removeAll(*fExtendSet);
// Inhibit dictionary characters from being tested at all.
+ fOtherSet->removeAll(*fDictionaryCjkSet);
fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
fSets->addElement(fCRSet, status);
fSets->addElement(fLFSet, status);
fSets->addElement(fNewlineSet, status);
fSets->addElement(fALetterSet, status);
- fSets->addElement(fKatakanaSet, status);
+ //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
fSets->addElement(fMidLetterSet, status);
fSets->addElement(fMidNumLetSet, status);
fSets->addElement(fMidNumSet, status);
@@ -3978,6 +4374,7 @@
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
+ printStringBreaks(ustr, expected, expectedcount);
test->errln("happy break test previous() failed: expected %d but got %d",
forward[count], i);
break;
@@ -4011,23 +4408,25 @@
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
+ // Replaced any C+J characters in a row with a random sequence of characters
+ // of the same length to make our C+J segmentation not get in the way.
static const char *strlist[] =
{
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
- "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
+ "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
- "\\u90ca\\u3588\\u009c\\u0953\\u194b",
+ "\\uac00\\u3588\\u009c\\u0953\\u194b",
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
- "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
+ "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
- "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
+ "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
@@ -4039,7 +4438,7 @@
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
- "\\u58f4\\U000e0049\\u20e7\\u2027",
+ "\\u18f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
@@ -4049,7 +4448,7 @@
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
- "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
+ "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
};
@@ -4104,12 +4503,12 @@
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
- "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
+ "\\U000e0065\\u302c\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
- "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
+ "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",
--- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700
+++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800
@@ -70,6 +70,7 @@
void TestBug5775();
void TestThaiBreaks();
void TestTailoredBreaks();
+ void TestTrieDictWithValue();
void TestDictRules();
void TestBug5532();
--- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700
+++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800
@@ -161,7 +161,23 @@
<data>abc<200>\U0001D800def<200>\U0001D3FF •</data>
# Hiragana & Katakana stay together, but separates from each other and Latin.
-<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
+# *** what to do about theoretical combos of chars? i.e. hiragana + accent
+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
+
+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
+<data>•芽キャベツ<400>芽キャベツ<400></data>
+
+# more Japanese tests
+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
+# and the Katakana block are not treated correctly. Enable this later.
+#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
+<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
+
+# Testing of word boundary for dictionary word containing both kanji and kana
+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
+
+# Testing of Chinese segmentation (taken from a Chinese news article)
+<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>
# Words with interior formatting characters
<data>def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
@@ -169,6 +185,8 @@
# to test for bug #4097779
<data>aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
+<data>•ISN'T<200> •19<100>日<400></data>
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages
@@ -178,9 +196,15 @@
# precomposed syllables...
<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
+# Disable them now because we don't include a Korean dictionary.
+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
+#<data>•\ud604\uc7ac<200>\ub294<200> \uac80\ucc30<200>\uc774<200> \ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> \uc870\uc0ac<200>\ud560<200> \uac00\ub2a5\uc131<200>\uc740<200> \uc5c6\ub2e4<200>\u002e•</data>
+
+<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
+
+<data>•\u06c9<200>\uc799<200>\ufffa•</data>
-<data>•\u06c9\uc799\ufffa<200></data>
#
# Try some words from other scripts.
@@ -491,8 +515,7 @@
<data>\uc0c1\ud56d \ud55c\uc778 \uc5f0\ud569 \uc7a5\ub85c\uad50\ud68c•</data>
# conjoining jamo...
-# TODO: rules update needed
-#<data>•\u1109\u1161\u11bc\u1112\u1161\u11bc \u1112\u1161\u11ab\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
+<data>•\u1109\u1161\u11bc\u1112\u1161\u11bc \u1112\u1161\u11ab\u110b\u1175\u11ab \u110b\u1167\u11ab\u1112\u1161\u11b8 \u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c•</data>
# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
<data>\u4e01\uff0e\u4e02\uff01\u4e03\uff1f•</data>
--- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800
+++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800
@@ -28,7 +28,7 @@
LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
// aliasing using position
- boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle
+ boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle
// aliasing arrays
zoneTests {
--- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700
+++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2009, International Business Machines
+* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -34,12 +34,15 @@
#include "unicode/udata.h"
#include "unicode/putil.h"
+//#include "unicode/ustdio.h"
+
#include "uoptions.h"
#include "unewdata.h"
#include "ucmndata.h"
#include "rbbidata.h"
#include "triedict.h"
#include "cmemory.h"
+#include "uassert.h"
#include <stdio.h>
#include <stdlib.h>
@@ -199,147 +202,191 @@
long wordFileSize;
FILE *file;
char *wordBufferC;
-
+ MutableTrieDictionary *mtd = NULL;
+
file = fopen(wordFileName, "rb");
- if( file == 0 ) {
- fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
- exit(-1);
- }
- fseek(file, 0, SEEK_END);
- wordFileSize = ftell(file);
- fseek(file, 0, SEEK_SET);
- wordBufferC = new char[wordFileSize+10];
-
- result = (long)fread(wordBufferC, 1, wordFileSize, file);
- if (result != wordFileSize) {
- fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
- exit (-1);
- }
- wordBufferC[wordFileSize]=0;
- fclose(file);
-
- //
- // Look for a Unicode Signature (BOM) on the word file
- //
- int32_t signatureLength;
- const char * wordSourceC = wordBufferC;
- const char* encoding = ucnv_detectUnicodeSignature(
- wordSourceC, wordFileSize, &signatureLength, &status);
- if (U_FAILURE(status)) {
- exit(status);
- }
- if(encoding!=NULL ){
- wordSourceC += signatureLength;
- wordFileSize -= signatureLength;
- }
-
- //
- // Open a converter to take the rule file to UTF-16
- //
- UConverter* conv;
- conv = ucnv_open(encoding, &status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- }
-
- //
- // Convert the words to UChar.
- // Preflight first to determine required buffer size.
- //
- uint32_t destCap = ucnv_toUChars(conv,
- NULL, // dest,
- 0, // destCapacity,
- wordSourceC,
- wordFileSize,
- &status);
- if (status != U_BUFFER_OVERFLOW_ERROR) {
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- };
-
- status = U_ZERO_ERROR;
- UChar *wordSourceU = new UChar[destCap+1];
- ucnv_toUChars(conv,
- wordSourceU, // dest,
- destCap+1,
- wordSourceC,
- wordFileSize,
- &status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- };
- ucnv_close(conv);
-
- // Get rid of the original file buffer
- delete[] wordBufferC;
-
- // Create a MutableTrieDictionary, and loop through all the lines, inserting
- // words.
-
- // First, pick a median character.
- UChar *current = wordSourceU + (destCap/2);
- UChar uc = *current++;
- UnicodeSet breaks;
- breaks.add(0x000A); // Line Feed
- breaks.add(0x000D); // Carriage Return
- breaks.add(0x2028); // Line Separator
- breaks.add(0x2029); // Paragraph Separator
-
- do {
- // Look for line break
- while (uc && !breaks.contains(uc)) {
- uc = *current++;
- }
- // Now skip to first non-line-break
- while (uc && breaks.contains(uc)) {
- uc = *current++;
+ if( file == 0 ) { //cannot find file
+ //create 1-line dummy file: ie 1 char, 1 value
+ UNewDataMemory *pData;
+ char msg[1024];
+
+ /* write message with just the name */
+ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
+ fprintf(stderr, "%s\n", msg);
+
+ UChar c = 0x0020;
+ mtd = new MutableTrieDictionary(c, status, TRUE);
+ mtd->addWord(&c, 1, status, 1);
+
+ } else { //read words in from input file
+ fseek(file, 0, SEEK_END);
+ wordFileSize = ftell(file);
+ fseek(file, 0, SEEK_SET);
+ wordBufferC = new char[wordFileSize+10];
+
+ result = (long)fread(wordBufferC, 1, wordFileSize, file);
+ if (result != wordFileSize) {
+ fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
+ exit (-1);
}
- }
- while (uc && (breaks.contains(uc) || u_isspace(uc)));
-
- MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
+ wordBufferC[wordFileSize]=0;
+ fclose(file);
- if (U_FAILURE(status)) {
- fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- }
+ //
+ // Look for a Unicode Signature (BOM) on the word file
+ //
+ int32_t signatureLength;
+ const char * wordSourceC = wordBufferC;
+ const char* encoding = ucnv_detectUnicodeSignature(
+ wordSourceC, wordFileSize, &signatureLength, &status);
+ if (U_FAILURE(status)) {
+ exit(status);
+ }
+ if(encoding!=NULL ){
+ wordSourceC += signatureLength;
+ wordFileSize -= signatureLength;
+ }
- // Now add the words. Words are non-space characters at the beginning of
- // lines, and must be at least one UChar.
- current = wordSourceU;
- UChar *candidate = current;
- uc = *current++;
- int32_t length = 0;
-
- while (uc) {
- while (uc && !u_isspace(uc)) {
- ++length;
- uc = *current++;
+ //
+ // Open a converter to take the rule file to UTF-16
+ //
+ UConverter* conv;
+ conv = ucnv_open(encoding, &status);
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
}
- if (length > 0) {
- mtd->addWord(candidate, length, status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
- u_errorName(status));
- exit(status);
+
+ //
+ // Convert the words to UChar.
+ // Preflight first to determine required buffer size.
+ //
+ uint32_t destCap = ucnv_toUChars(conv,
+ NULL, // dest,
+ 0, // destCapacity,
+ wordSourceC,
+ wordFileSize,
+ &status);
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
+ };
+
+ status = U_ZERO_ERROR;
+ UChar *wordSourceU = new UChar[destCap+1];
+ ucnv_toUChars(conv,
+ wordSourceU, // dest,
+ destCap+1,
+ wordSourceC,
+ wordFileSize,
+ &status);
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
+ };
+ ucnv_close(conv);
+
+ // Get rid of the original file buffer
+ delete[] wordBufferC;
+
+ // Create a MutableTrieDictionary, and loop through all the lines, inserting
+ // words.
+
+ // First, pick a median character.
+ UChar *current = wordSourceU + (destCap/2);
+ UChar uc = *current++;
+ UnicodeSet breaks;
+ breaks.add(0x000A); // Line Feed
+ breaks.add(0x000D); // Carriage Return
+ breaks.add(0x2028); // Line Separator
+ breaks.add(0x2029); // Paragraph Separator
+
+ do {
+ // Look for line break
+ while (uc && !breaks.contains(uc)) {
+ uc = *current++;
+ }
+ // Now skip to first non-line-break
+ while (uc && breaks.contains(uc)) {
+ uc = *current++;
}
}
- // Find beginning of next line
- while (uc && !breaks.contains(uc)) {
- uc = *current++;
+ while (uc && (breaks.contains(uc) || u_isspace(uc)));
+
+ mtd = new MutableTrieDictionary(uc, status);
+
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
}
- while (uc && breaks.contains(uc)) {
- uc = *current++;
+
+ // Now add the words. Words are non-space characters at the beginning of
+ // lines, and must be at least one UChar. If a word has an associated value,
+ // the value should follow the word on the same line after a tab character.
+ current = wordSourceU;
+ UChar *candidate = current;
+ uc = *current++;
+ int32_t length = 0;
+ int count = 0;
+
+ while (uc) {
+ while (uc && !u_isspace(uc)) {
+ ++length;
+ uc = *current++;
+ }
+
+ UnicodeString valueString;
+ UChar candidateValue;
+ if(uc == 0x0009){ //separator is a tab char, read in number after space
+ while (uc && u_isspace(uc)) {
+ uc = *current++;
+ }
+ while (uc && !u_isspace(uc)) {
+ valueString.append(uc);
+ uc = *current++;
+ }
+ }
+
+ if (length > 0) {
+ count++;
+ if(valueString.length() > 0){
+ mtd->setValued(TRUE);
+
+ uint32_t value = 0;
+                    char* s = new char[valueString.length() + 1];
+                    valueString.extract(0,valueString.length(), s, valueString.length() + 1);
+                    int n = sscanf(s, "%u", &value);
+                    U_ASSERT(n == 1);
+                    U_ASSERT(value <= 0xFFFF);
+ mtd->addWord(candidate, length, status, (uint16_t)value);
+ delete[] s;
+ } else {
+ mtd->addWord(candidate, length, status);
+ }
+
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
+ u_errorName(status), count);
+ exit(status);
+ }
+ }
+
+ // Find beginning of next line
+ while (uc && !breaks.contains(uc)) {
+ uc = *current++;
+ }
+ // Find next non-line-breaking character
+ while (uc && breaks.contains(uc)) {
+ uc = *current++;
+ }
+ candidate = current-1;
+ length = 0;
}
- candidate = current-1;
- length = 0;
+
+ // Get rid of the Unicode text buffer
+ delete[] wordSourceU;
}
- // Get rid of the Unicode text buffer
- delete[] wordSourceU;
-
// Now, create a CompactTrieDictionary from the mutable dictionary
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
if (U_FAILURE(status)) {
@@ -393,4 +440,3 @@
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}
-
--- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800
+++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800
@@ -23,13 +23,13 @@
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
-## Target information
+## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
ifneq ($(top_builddir),$(top_srcdir))
CPPFLAGS += -I$(top_builddir)/common
endif
-CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
+CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = genctd.o