src/third_party/icu/source/common/rbbisetb.cpp - cobalt - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 //
 //  rbbisetb.cpp
 //
 /*
 ***************************************************************************
 *   Copyright (C) 2002-2008 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
 //
 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
 //                   (part of the rule building process.)
 //
 //      Starting with the rules parse tree from the scanner,
 //
 //                   -  Enumerate the set of UnicodeSets that are referenced
 //                      by the RBBI rules.
 //                   -  compute a set of non-overlapping character ranges
 //                      with all characters within a range belonging to the same
 //                      set of input unicode sets.
 //                   -  Derive a set of non-overlapping UnicodeSet (like things)
 //                      that will correspond to columns in the state table for
 //                      the RBBI execution engine.  All characters within one
 //                      of these sets belong to the same set of the original
 //                      UnicodeSets from the user's rules.
 //                   -  construct the trie table that maps input characters
 //                      to the index of the matching non-overlapping set of set from
 //                      the previous step.
 //

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 #if defined(STARBOARD)
 #include "starboard/client_porting/poem/assert_poem.h"
 #include "starboard/client_porting/poem/string_poem.h"
 #endif  // defined(STARBOARD)
 #include "unicode/uniset.h"
 #include "uvector.h"
 #include "uassert.h"
 #include "cmemory.h"
 #include "cstring.h"

 #include "rbbisetb.h"
 #include "rbbinode.h"

 U_NAMESPACE_BEGIN

 const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
 //------------------------------------------------------------------------
 //
 //   Constructor
 //
 //------------------------------------------------------------------------
 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
 {
     // rb supplies the uset nodes (fUSetNodes), the debug settings (fDebugEnv)
     // and the error status pointer that this builder reads and writes.
     fRB             = rb;
     fStatus         = rb->fStatus;
     fRangeList      = nullptr;
     fMutableTrie    = nullptr;
     fTrie           = nullptr;
     fTrieSize       = 0;
     fGroupCount     = 0;
     // buildRanges() assigns the real value, but it can return early on error.
     // Give the member a defined value here so that getDictCategoriesStart()
     // never reads an indeterminate value.
     fDictCategoriesStart = 0;
     fSawBOF         = false;
 }


 //------------------------------------------------------------------------
 //
 //   Destructor
 //
 //------------------------------------------------------------------------
 RBBISetBuilder::~RBBISetBuilder()
 {
     // Free every RangeDescriptor on the singly linked list headed by fRangeList.
     // The successor is fetched before its predecessor is deleted.
     RangeDescriptor *current = fRangeList;
     while (current != nullptr) {
         RangeDescriptor *following = current->fNext;
         delete current;
         current = following;
     }

     // Close the immutable trie first, then the mutable trie.
     ucptrie_close(fTrie);
     umutablecptrie_close(fMutableTrie);
 }


 //------------------------------------------------------------------------
 //
 //   buildRanges    Build the list of non-overlapping character ranges
 //                  from the Unicode Sets.
 //
 //------------------------------------------------------------------------
 void RBBISetBuilder::buildRanges() {
     // Overview of the phases below:
     //   1. Split the full code point range [0, 0x10ffff] into non-overlapping
     //      RangeDescriptors so that each descriptor lies wholly inside or wholly
     //      outside every range of every input UnicodeSet, and record which uset
     //      nodes contain each descriptor.
     //   2. Number the descriptors so that descriptors with equal set membership
     //      share one category number (fNum). Dictionary groups are renumbered
     //      to follow the non-dictionary groups.
     //   3. Add the reserved categories 1 and 2 to the uset nodes whose sets
     //      contain the strings "eof" and "bof".
     // Failures are reported through *fStatus, and the function returns early.
     RBBINode        *usetNode;
     RangeDescriptor *rlRange;

     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}

     //
     //  Initialize the process by creating a single range encompassing all characters
     //  that is in no sets.
     //
     fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
     if (fRangeList == NULL) {
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     fRangeList->fStartChar    = 0;
     fRangeList->fEndChar      = 0x10ffff;

     // The RangeDescriptor constructor may have reported a failure through *fStatus.
     if (U_FAILURE(*fStatus)) {
         return;
     }

     //
     //  Find the set of non-overlapping ranges of characters
     //
     int  ni;
     for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
         usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
         // A NULL element is treated as the end of the list of uset nodes.
         if (usetNode==NULL) {
             break;
         }

         UnicodeSet      *inputSet             = usetNode->fInputSet;
         int32_t          inputSetRangeCount   = inputSet->getRangeCount();
         int              inputSetRangeIndex   = 0;
                          rlRange              = fRangeList;

         // Walk the ranges of the input set and the range list in parallel.
         // rlRange only ever moves forward within one input set.
         for (;;) {
             if (inputSetRangeIndex >= inputSetRangeCount) {
                 break;
             }
             UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
             UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);

             // skip over ranges from the range list that are completely
             //   below the current range from the input unicode set.
             // NOTE(review): fNext is not checked for null here; presumably safe
             //   because the last range in the list always ends at 0x10ffff - confirm.
             while (rlRange->fEndChar < inputSetRangeBegin) {
                 rlRange = rlRange->fNext;
             }

             // If the start of the range from the range list is before
             //   the start of the range from the unicode set, split the range list range
             //   in two, with one part being before (wholly outside of) the unicode set
             //   and the other containing the rest.
             //   Then continue the loop; the post-split current range will then be skipped
             //     over
             if (rlRange->fStartChar < inputSetRangeBegin) {
                 rlRange->split(inputSetRangeBegin, *fStatus);
                 if (U_FAILURE(*fStatus)) {
                     return;
                 }
                 continue;
             }

             // Same thing at the end of the ranges...
             // If the end of the range from the range list doesn't coincide with
             //   the end of the range from the unicode set, split the range list
             //   range in two.  The first part of the split range will be
             //   wholly inside the Unicode set.
             if (rlRange->fEndChar > inputSetRangeEnd) {
                 rlRange->split(inputSetRangeEnd+1, *fStatus);
                 if (U_FAILURE(*fStatus)) {
                     return;
                 }
             }

             // The current rlRange is now entirely within the UnicodeSet range.
             // Add this unicode set to the list of sets for this rlRange
             //   (only once per uset node).
             if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                 if (U_FAILURE(*fStatus)) {
                     return;
                 }
             }

             // Advance over ranges that we are finished with.
             // The input set range is finished only when the range list range ends
             //   exactly where it does; otherwise more range list ranges remain
             //   inside the same input set range.
             if (inputSetRangeEnd == rlRange->fEndChar) {
                 inputSetRangeIndex++;
             }
             rlRange = rlRange->fNext;
         }
     }

     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}

     //
     //  Group the above ranges, with each group consisting of one or more
     //    ranges that are in exactly the same set of original UnicodeSets.
     //    The groups are numbered, and these group numbers are the set of
     //    input symbols recognized by the run-time state machine.
     //
     //    Numbering: # 0  (state table column 0) is unused.
     //               # 1  is reserved - table column 1 is for end-of-input
     //               # 2  is reserved - table column 2 is for beginning-of-input
     //               # 3  is the first range list.
     //
     RangeDescriptor *rlSearchRange;
     int32_t dictGroupCount = 0;

     for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
         // Look among the earlier ranges for one with identical set membership
         //   and, if found, reuse its group number and dictionary flag.
         for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
             if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                 rlRange->fNum = rlSearchRange->fNum;
                 rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
                 break;
             }
         }
         // fNum still 0 means no earlier range matched, so this range starts a new group.
         //   (assumes fNum starts out as 0 - TODO confirm in RangeDescriptor's declaration)
         if (rlRange->fNum == 0) {
             rlRange->fFirstInGroup = true;
             if (rlRange->isDictionaryRange()) {
                 // Provisional number, counted from 1; shifted to its final value below.
                 //   addValToSets() is deferred until the final number is known.
                 rlRange->fNum = ++dictGroupCount;
                 rlRange->fIncludesDict = true;
             } else {
                 fGroupCount++;
                 rlRange->fNum = fGroupCount+2;
                 addValToSets(rlRange->fIncludesSets, rlRange->fNum);
             }
         }
     }

     // Move the character category numbers for any dictionary ranges up, so that they
     // immediately follow the non-dictionary ranges.
     // At this point fGroupCount counts only the non-dictionary groups.

     fDictCategoriesStart = fGroupCount + 3;
     for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
         if (rlRange->fIncludesDict) {
             // Provisional number 1 maps to fDictCategoriesStart.
             rlRange->fNum += fDictCategoriesStart - 1;
             if (rlRange->fFirstInGroup) {
                 addValToSets(rlRange->fIncludesSets, rlRange->fNum);
             }
         }
     }
     fGroupCount += dictGroupCount;


     // Handle input sets that contain the special strings {eof} and {bof}.
     //   Column 1 of the state table is reserved for EOF on input.
     //   Column 2 is reserved for before-the-start-input.
     //            (This column can be optimized away later if there are no rule
     //             references to {bof}.)
     //   Add this column value (1 or 2) to the equivalent expression
     //     subtree for each UnicodeSet that contains the string {eof} or {bof}
     //   Because {bof} and {eof} are not characters in the normal sense,
     //   they don't affect the computation of the ranges or TRIE.

     UnicodeString eofString(u"eof");
     UnicodeString bofString(u"bof");
     for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
         usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
         if (usetNode==NULL) {
             break;
         }
         UnicodeSet      *inputSet = usetNode->fInputSet;
         if (inputSet->contains(eofString)) {
             addValToSet(usetNode, 1);
         }
         if (inputSet->contains(bofString)) {
             addValToSet(usetNode, 2);
             // Remembered so that sawBOF() can report it.
             fSawBOF = TRUE;
         }
     }


     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
 }


 //
 // Build the Trie table for mapping UChar32 values to the corresponding
 // range group number.
 //
 void RBBISetBuilder::buildTrie() {
     fMutableTrie = umutablecptrie_open(
                         0,       //  Initial value for all code points.
                         0,       //  Error value for out-of-range input.
                         fStatus);

     for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
         umutablecptrie_setRange(fMutableTrie,
                                 range->fStartChar,     // Range start
                                 range->fEndChar,       // Range end (inclusive)
                                 range->fNum,           // value for range
                                 fStatus);
     }
 }


 // Merge character category categories.second into categories.first, and
 // renumber every higher category down by one so the numbering stays contiguous.
 // The two categories must both be below fDictCategoriesStart or both at/above it.
 void RBBISetBuilder::mergeCategories(IntPair categories) {
     U_ASSERT(categories.first >= 1);
     U_ASSERT(categories.second > categories.first);
     U_ASSERT((categories.first <  fDictCategoriesStart && categories.second <  fDictCategoriesStart) ||
              (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));

     RangeDescriptor *rd = fRangeList;
     while (rd != nullptr) {
         const int32_t current = rd->fNum;
         if (current > categories.second) {
             // Close the gap left by the removed category.
             rd->fNum--;
         } else if (current == categories.second) {
             rd->fNum = categories.first;
         }
         rd = rd->fNext;
     }

     fGroupCount -= 1;
     // Removing a category below the dictionary block shifts that block down.
     if (categories.second <= fDictCategoriesStart) {
         fDictCategoriesStart -= 1;
     }
 }


 //-----------------------------------------------------------------------------------
 //
 //  getTrieSize()    Return the size that will be required to serialize the Trie.
 //
 //-----------------------------------------------------------------------------------
 int32_t RBBISetBuilder::getTrieSize()  {
     if (U_FAILURE(*fStatus)) {
         return 0;
     }
     // The immutable trie is built once; later calls return the cached size.
     if (fTrie != nullptr) {
         return fTrieSize;
     }

     // Use 8-bit trie values when the category count allows it, 16-bit otherwise.
     const bool fitsIn8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
     fTrie = umutablecptrie_buildImmutable(fMutableTrie,
                                           UCPTRIE_TYPE_FAST,
                                           fitsIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
                                           fStatus);

     // Called with no buffer and zero capacity to obtain the serialized size.
     // The buffer-overflow status that results is expected and is cleared.
     fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
     if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
         *fStatus = U_ZERO_ERROR;
     }
     return fTrieSize;
 }


 //-----------------------------------------------------------------------------------
 //
 //  serializeTrie()   Put the serialized trie at the specified address.
 //                    Trust the caller to have given us enough memory.
 //                    getTrieSize() MUST be called first.
 //
 //-----------------------------------------------------------------------------------
 void RBBISetBuilder::serializeTrie(uint8_t *where) {
     // fTrie and fTrieSize are filled in by getTrieSize(). The caller's buffer
     // is passed as having a capacity of fTrieSize bytes.
     ucptrie_toBinary(fTrie, where, fTrieSize, fStatus);
 }

 //------------------------------------------------------------------------
 //
 //  addValToSets     Add a runtime-mapped input value to each uset from a
 //                   list of uset nodes. (val corresponds to a state table column.)
 //                   For each of the original Unicode sets - which correspond
 //                   directly to uset nodes - a logically equivalent expression
 //                   is constructed in terms of the remapped runtime input
 //                   symbol set.  This function adds one runtime input symbol to
 //                   a list of sets.
 //
 //                   The "logically equivalent expression" is the tree for an
 //                   or-ing together of all of the symbols that go into the set.
 //
 //------------------------------------------------------------------------
 void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
     // Apply addValToSet() to each uset node held in the vector, in order.
     int32_t index = 0;
     while (index < sets->size()) {
         addValToSet(static_cast<RBBINode *>(sets->elementAt(index)), val);
         ++index;
     }
 }

 // Add one runtime input symbol (val) to the expression subtree of a uset node.
 // The first symbol becomes the node's left child directly. Each later symbol
 // is attached through a new opOr node whose left child is the previous subtree
 // and whose right child is the new leaf.
 // On allocation failure *fStatus is set to U_MEMORY_ALLOCATION_ERROR and the
 // uset node is left unchanged.
 void  RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
     RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
     if (leafNode == nullptr) {
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     leafNode->fVal = (unsigned short)val;
     if (usetNode->fLeftChild == nullptr) {
         usetNode->fLeftChild = leafNode;
         leafNode->fParent    = usetNode;
     } else {
         // There are already input symbols present for this set.
         // Set up an OR node, with the previous stuff as the left child
         //   and the new value as the right child.
         RBBINode *orNode = new RBBINode(RBBINode::opOr);
         if (orNode == nullptr) {
             // The leaf has not been linked into the tree yet, so nothing else
             // refers to it; free it here instead of leaking it.
             delete leafNode;
             *fStatus = U_MEMORY_ALLOCATION_ERROR;
             return;
         }
         orNode->fLeftChild  = usetNode->fLeftChild;
         orNode->fRightChild = leafNode;
         orNode->fLeftChild->fParent  = orNode;
         orNode->fRightChild->fParent = orNode;
         usetNode->fLeftChild = orNode;
         orNode->fParent = usetNode;
     }
 }


 //------------------------------------------------------------------------
 //
 //   getNumCharCategories
 //
 //------------------------------------------------------------------------
 int32_t  RBBISetBuilder::getNumCharCategories() const {
     // Categories 0, 1 and 2 are reserved (unused, end-of-input,
     // beginning-of-input); the range groups counted by fGroupCount follow them.
     const int32_t reservedCategories = 3;
     return reservedCategories + fGroupCount;
 }


 //------------------------------------------------------------------------
 //
 //   getDictCategoriesStart
 //
 //------------------------------------------------------------------------
 int32_t  RBBISetBuilder::getDictCategoriesStart() const {
     // Assigned by buildRanges() as the first category number used for
     // dictionary ranges, and decremented by mergeCategories() when the merged
     // category number is not above it.
     return fDictCategoriesStart;
 }


 //------------------------------------------------------------------------
 //
 //   sawBOF
 //
 //------------------------------------------------------------------------
 UBool  RBBISetBuilder::sawBOF() const {
     // Set by buildRanges() when some input set contains the string "bof".
     return fSawBOF;
 }


 //------------------------------------------------------------------------
 //
 //   getFirstChar      Given a runtime RBBI character category, find
 //                     the first UChar32 that is in the set of chars
 //                     in the category.
 //------------------------------------------------------------------------
 UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
     // Return the start of the first range in list order that carries this
     // category number, or -1 when no range does.
     for (const RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
         if (rd->fNum == category) {
             return rd->fStartChar;
         }
     }
     return static_cast<UChar32>(-1);
 }


 //------------------------------------------------------------------------
 //
 //   printRanges        A debugging function.
 //                      dump out all of the range definitions.
 //
 //------------------------------------------------------------------------
 #ifdef RBBI_DEBUG
 void RBBISetBuilder::printRanges() {
     RangeDescriptor       *rlRange;
     int                    i;

     RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
     for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
         RBBIDebugPrintf("%4x-%4x  ", rlRange->fStartChar, rlRange->fEndChar);

         for (i=0; i<rlRange->fIncludesSets->size(); i++) {
             RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
             UnicodeString   setName {u"anon"};
             RBBINode       *setRef = usetNode->fParent;
             if (setRef != nullptr) {
                 RBBINode *varRef = setRef->fParent;
                 if (varRef != nullptr  &&  varRef->fType == RBBINode::varRef) {
                     setName = varRef->fText;
                 }
             }
             RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
         }
         RBBIDebugPrintf("\n");
     }
 }
 #endif


 //------------------------------------------------------------------------
 //
 //   printRangeGroups     A debugging function.
 //                        dump out all of the range groups.
 //
 //------------------------------------------------------------------------
 #ifdef RBBI_DEBUG
 void RBBISetBuilder::printRangeGroups() {
     int                    i;

     RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
     for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
         if (rlRange->fFirstInGroup) {
             int groupNum = rlRange->fNum;
             RBBIDebugPrintf("%2i  ", groupNum);

             if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}

             for (i=0; i<rlRange->fIncludesSets->size(); i++) {
                 RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
                 UnicodeString   setName = UNICODE_STRING("anon", 4);
                 RBBINode       *setRef = usetNode->fParent;
                 if (setRef != NULL) {
                     RBBINode *varRef = setRef->fParent;
                     if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                         setName = varRef->fText;
                     }
                 }
                 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
             }

             i = 0;
             for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
                 if (tRange->fNum == rlRange->fNum) {
                     if (i++ % 5 == 0) {
                         RBBIDebugPrintf("\n    ");
                     }
                     RBBIDebugPrintf("  %05x-%05x", tRange->fStartChar, tRange->fEndChar);
                 }
             }
             RBBIDebugPrintf("\n");
         }
     }
     RBBIDebugPrintf("\n");
 }
 #endif


 //------------------------------------------------------------------------
 //
 //   printSets          A debugging function.
 //                      dump out all of the set definitions.
 //
 //------------------------------------------------------------------------
 #ifdef RBBI_DEBUG
 void RBBISetBuilder::printSets() {
     int                   i;

     RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
     for (i=0; ; i++) {
         RBBINode        *usetNode;
         RBBINode        *setRef;
         RBBINode        *varRef;
         UnicodeString    setName;

         usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
         if (usetNode == NULL) {
             break;
         }

         RBBIDebugPrintf("%3d    ", i);
         setName = UNICODE_STRING("anonymous", 9);
         setRef = usetNode->fParent;
         if (setRef != NULL) {
             varRef = setRef->fParent;
             if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
                 setName = varRef->fText;
             }
         }
         RBBI_DEBUG_printUnicodeString(setName);
         RBBIDebugPrintf("   ");
         RBBI_DEBUG_printUnicodeString(usetNode->fText);
         RBBIDebugPrintf("\n");
         if (usetNode->fLeftChild != NULL) {
             RBBINode::printTree(usetNode->fLeftChild, TRUE);
         }
     }
     RBBIDebugPrintf("\n");
 }
 #endif


 //-------------------------------------------------------------------------------------
 //
 //  RangeDescriptor copy constructor
 //
 //-------------------------------------------------------------------------------------

 RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
         fStartChar{other.fStartChar}, fEndChar{other.fEndChar}, fNum{other.fNum},
         fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
     // The scalar fields are copied above. The set list gets its own vector
     // holding the same element pointers. fNext is not copied here.
     if (U_FAILURE(status)) {
         return;
     }

     fIncludesSets = new UVector(status);
     if (fIncludesSets == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     if (U_FAILURE(status)) {
         return;
     }

     const UVector *sourceSets = other.fIncludesSets;
     for (int32_t ix = 0; ix < sourceSets->size(); ++ix) {
         fIncludesSets->addElement(sourceSets->elementAt(ix), status);
     }
 }


 //-------------------------------------------------------------------------------------
 //
 //  RangeDescriptor default constructor
 //
 //-------------------------------------------------------------------------------------
 RangeDescriptor::RangeDescriptor(UErrorCode &status) {
     // Allocate the (initially empty) set list only when no error is pending.
     if (U_SUCCESS(status)) {
         fIncludesSets = new UVector(status);
         if (fIncludesSets == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
         }
     }
 }


 //-------------------------------------------------------------------------------------
 //
 //  RangeDescriptor Destructor
 //
 //-------------------------------------------------------------------------------------
 RangeDescriptor::~RangeDescriptor() {
     // Frees the vector object allocated by the constructors.
     // NOTE(review): the vector is created without a deleter in this file, so
     // presumably the RBBINode elements it points at are not freed here - confirm.
     delete  fIncludesSets;
     fIncludesSets = nullptr;
 }

 //-------------------------------------------------------------------------------------
 //
 //  RangeDescriptor::split()
 //
 //-------------------------------------------------------------------------------------
 // Split this range at 'where': this descriptor keeps [fStartChar, where-1] and
 // a new descriptor covering [where, old fEndChar] is linked in directly after it.
 // On failure the list is left unchanged and status carries the error.
 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
     U_ASSERT(where>fStartChar && where<=fEndChar);

     RangeDescriptor *upper = new RangeDescriptor(*this, status);
     if (upper == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     if (U_FAILURE(status)) {
         delete upper;
         return;
     }

     // The copy already carries this range's fields, so only the fields that
     // differ after the split are updated.
     upper->fStartChar = where;
     upper->fNext      = fNext;
     fEndChar          = where - 1;
     fNext             = upper;
 }


 //-------------------------------------------------------------------------------------
 //
 //   RangeDescriptor::isDictionaryRange
 //
 //            Test whether this range includes characters from
 //            the original Unicode Set named "dictionary".
 //
 //            This function looks through the Unicode Sets that
 //            the range includes, checking for one named "dictionary"
 //
 //            TODO:  a faster way would be to find the set node for
 //                   "dictionary" just once, rather than looking it
 //                   up by name every time.
 //
 //-------------------------------------------------------------------------------------
 bool RangeDescriptor::isDictionaryRange() {
     static const char16_t *dictionary = u"dictionary";
     for (int32_t i=0; i<fIncludesSets->size(); i++) {
         RBBINode *usetNode  = (RBBINode *)fIncludesSets->elementAt(i);
         RBBINode *setRef = usetNode->fParent;
         if (setRef != nullptr) {
             RBBINode *varRef = setRef->fParent;
             if (varRef && varRef->fType == RBBINode::varRef) {
                 const UnicodeString *setName = &varRef->fText;
                 if (setName->compare(dictionary, -1) == 0) {
                     return true;
                 }
             }
         }
     }
     return false;
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	//
	// rbbisetb.cpp
	//
	/*
	***************************************************************************
	* Copyright (C) 2002-2008 International Business Machines Corporation *
	* and others. All rights reserved. *
	***************************************************************************
	*/
	//
	// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
	// (part of the rule building process.)
	//
	// Starting with the rules parse tree from the scanner,
	//
	// - Enumerate the set of UnicodeSets that are referenced
	// by the RBBI rules.
	// - compute a set of non-overlapping character ranges
	// with all characters within a range belonging to the same
	// set of input unicode sets.
	// - Derive a set of non-overlapping UnicodeSet (like things)
	// that will correspond to columns in the state table for
	// the RBBI execution engine. All characters within one
	// of these sets belong to the same set of the original
	// UnicodeSets from the user's rules.
	// - construct the trie table that maps input characters
	// to the index of the matching non-overlapping set of set from
	// the previous step.
	//

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_BREAK_ITERATION

	#if defined(STARBOARD)
	#include "starboard/client_porting/poem/assert_poem.h"
	#include "starboard/client_porting/poem/string_poem.h"
	#endif // defined(STARBOARD)
	#include "unicode/uniset.h"
	#include "uvector.h"
	#include "uassert.h"
	#include "cmemory.h"
	#include "cstring.h"

	#include "rbbisetb.h"
	#include "rbbinode.h"

	U_NAMESPACE_BEGIN

	const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
	//------------------------------------------------------------------------
	//
	// Constructor
	//
	//------------------------------------------------------------------------
	RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
	{
	    // rb supplies the uset nodes (fUSetNodes), the debug settings (fDebugEnv)
	    // and the error status pointer that this builder reads and writes.
	    fRB = rb;
	    fStatus = rb->fStatus;
	    fRangeList = nullptr;
	    fMutableTrie = nullptr;
	    fTrie = nullptr;
	    fTrieSize = 0;
	    fGroupCount = 0;
	    // buildRanges() assigns the real value, but it can return early on error.
	    // Give the member a defined value here so that getDictCategoriesStart()
	    // never reads an indeterminate value.
	    fDictCategoriesStart = 0;
	    fSawBOF = false;
	}


	//------------------------------------------------------------------------
	//
	// Destructor
	//
	//------------------------------------------------------------------------
	RBBISetBuilder::~RBBISetBuilder()
	{
	    // Free every RangeDescriptor on the singly linked list headed by fRangeList.
	    // The successor is fetched before its predecessor is deleted.
	    RangeDescriptor *current = fRangeList;
	    while (current != nullptr) {
	        RangeDescriptor *following = current->fNext;
	        delete current;
	        current = following;
	    }

	    // Close the immutable trie first, then the mutable trie.
	    ucptrie_close(fTrie);
	    umutablecptrie_close(fMutableTrie);
	}




	//------------------------------------------------------------------------
	//
	// build Build the list of non-overlapping character ranges
	// from the Unicode Sets.
	//
	//------------------------------------------------------------------------
	void RBBISetBuilder::buildRanges() {
	RBBINode *usetNode;
	RangeDescriptor *rlRange;

	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}

	//
	// Initialize the process by creating a single range encompassing all characters
	// that is in no sets.
	//
	fRangeList = new RangeDescriptor(*fStatus); // will check for status here
	if (fRangeList == NULL) {
	*fStatus = U_MEMORY_ALLOCATION_ERROR;
	return;
	}
	fRangeList->fStartChar = 0;
	fRangeList->fEndChar = 0x10ffff;

	if (U_FAILURE(*fStatus)) {
	return;
	}

	//
	// Find the set of non-overlapping ranges of characters
	//
	int ni;
	for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
	usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
	if (usetNode==NULL) {
	break;
	}

	UnicodeSet *inputSet = usetNode->fInputSet;
	int32_t inputSetRangeCount = inputSet->getRangeCount();
	int inputSetRangeIndex = 0;
	rlRange = fRangeList;

	for (;;) {
	if (inputSetRangeIndex >= inputSetRangeCount) {
	break;
	}
	UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
	UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);

	// skip over ranges from the range list that are completely
	// below the current range from the input unicode set.
	while (rlRange->fEndChar < inputSetRangeBegin) {
	rlRange = rlRange->fNext;
	}

	// If the start of the range from the range list is before with
	// the start of the range from the unicode set, split the range list range
	// in two, with one part being before (wholly outside of) the unicode set
	// and the other containing the rest.
	// Then continue the loop; the post-split current range will then be skipped
	// over
	if (rlRange->fStartChar < inputSetRangeBegin) {
	rlRange->split(inputSetRangeBegin, *fStatus);
	if (U_FAILURE(*fStatus)) {
	return;
	}
	continue;
	}

	// Same thing at the end of the ranges...
	// If the end of the range from the range list doesn't coincide with
	// the end of the range from the unicode set, split the range list
	// range in two. The first part of the split range will be
	// wholly inside the Unicode set.
	if (rlRange->fEndChar > inputSetRangeEnd) {
	rlRange->split(inputSetRangeEnd+1, *fStatus);
	if (U_FAILURE(*fStatus)) {
	return;
	}
	}

	// The current rlRange is now entirely within the UnicodeSet range.
	// Add this unicode set to the list of sets for this rlRange
	if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
	rlRange->fIncludesSets->addElement(usetNode, *fStatus);
	if (U_FAILURE(*fStatus)) {
	return;
	}
	}

	// Advance over ranges that we are finished with.
	if (inputSetRangeEnd == rlRange->fEndChar) {
	inputSetRangeIndex++;
	}
	rlRange = rlRange->fNext;
	}
	}

	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}

	//
	// Group the above ranges, with each group consisting of one or more
	// ranges that are in exactly the same set of original UnicodeSets.
	// The groups are numbered, and these group numbers are the set of
	// input symbols recognized by the run-time state machine.
	//
	// Numbering: # 0 (state table column 0) is unused.
	// # 1 is reserved - table column 1 is for end-of-input
	// # 2 is reserved - table column 2 is for beginning-of-input
	// # 3 is the first range list.
	//
	RangeDescriptor *rlSearchRange;
	int32_t dictGroupCount = 0;

	for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
	for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
	if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
	rlRange->fNum = rlSearchRange->fNum;
	rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
	break;
	}
	}
	if (rlRange->fNum == 0) {
	rlRange->fFirstInGroup = true;
	if (rlRange->isDictionaryRange()) {
	rlRange->fNum = ++dictGroupCount;
	rlRange->fIncludesDict = true;
	} else {
	fGroupCount++;
	rlRange->fNum = fGroupCount+2;
	addValToSets(rlRange->fIncludesSets, rlRange->fNum);
	}
	}
	}

	// Move the character category numbers for any dictionary ranges up, so that they
	// immediately follow the non-dictionary ranges.

	fDictCategoriesStart = fGroupCount + 3;
	for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
	if (rlRange->fIncludesDict) {
	rlRange->fNum += fDictCategoriesStart - 1;
	if (rlRange->fFirstInGroup) {
	addValToSets(rlRange->fIncludesSets, rlRange->fNum);
	}
	}
	}
	fGroupCount += dictGroupCount;


	// Handle input sets that contain the special string {eof}.
	// Column 1 of the state table is reserved for EOF on input.
	// Column 2 is reserved for before-the-start-input.
	// (This column can be optimized away later if there are no rule
	// references to {bof}.)
	// Add this column value (1 or 2) to the equivalent expression
	// subtree for each UnicodeSet that contains the string {eof}
	// Because {bof} and {eof} are not characters in the normal sense,
	// they don't affect the computation of the ranges or TRIE.

	UnicodeString eofString(u"eof");
	UnicodeString bofString(u"bof");
	for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
	usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
	if (usetNode==NULL) {
	break;
	}
	UnicodeSet *inputSet = usetNode->fInputSet;
	if (inputSet->contains(eofString)) {
	addValToSet(usetNode, 1);
	}
	if (inputSet->contains(bofString)) {
	addValToSet(usetNode, 2);
	fSawBOF = TRUE;
	}
	}


	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
	if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
	}


	//
	// Build the Trie table for mapping UChar32 values to the corresponding
	// range group number.
	//
	void RBBISetBuilder::buildTrie() {
	fMutableTrie = umutablecptrie_open(
	0, // Initial value for all code points.
	0, // Error value for out-of-range input.
	fStatus);

	for (RangeDescriptor range = fRangeList; range!=nullptr && U_SUCCESS(fStatus); range=range->fNext) {
	umutablecptrie_setRange(fMutableTrie,
	range->fStartChar, // Range start
	range->fEndChar, // Range end (inclusive)
	range->fNum, // value for range
	fStatus);
	}
	}


	// Merge two character categories into one.
	//   categories.first   the category that is kept.
	//   categories.second  the category that is removed; must be greater than
	//                      categories.first, and on the same side of
	//                      fDictCategoriesStart as categories.first.
	// Every range numbered categories.second is renumbered to categories.first,
	// and every range with a higher number is shifted down by one so that the
	// numbering stays contiguous.
	void RBBISetBuilder::mergeCategories(IntPair categories) {
	    U_ASSERT(categories.first >= 1);
	    U_ASSERT(categories.second > categories.first);
	    U_ASSERT((categories.first <  fDictCategoriesStart && categories.second <  fDictCategoriesStart) ||
	             (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));

	    for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
	        int32_t rangeNum = rd->fNum;
	        if (rangeNum == categories.second) {
	            rd->fNum = categories.first;
	        } else if (rangeNum > categories.second) {
	            rd->fNum--;
	        }
	    }
	    // One category fewer overall.
	    --fGroupCount;
	    // Removing a category at or below the start of the dictionary categories
	    // moves that start down by one as well.
	    if (categories.second <= fDictCategoriesStart) {
	        --fDictCategoriesStart;
	    }
	}


	//-----------------------------------------------------------------------------------
	//
	// getTrieSize() Return the size that will be required to serialize the Trie.
	//
	//-----------------------------------------------------------------------------------
	int32_t RBBISetBuilder::getTrieSize() {
	    if (U_FAILURE(*fStatus)) {
	        return 0;
	    }
	    if (fTrie != nullptr) {
	        // The immutable trie already exists; reuse the size stored with it.
	        return fTrieSize;
	    }

	    // Use 8-bit trie values when the category count is small enough,
	    // 16-bit values otherwise.
	    const bool eightBitValues = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
	    fTrie = umutablecptrie_buildImmutable(fMutableTrie,
	                                          UCPTRIE_TYPE_FAST,
	                                          eightBitValues ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16,
	                                          fStatus);

	    // Call with no output buffer and zero capacity to obtain the size.
	    // A buffer-overflow status from this call is cleared rather than
	    // being treated as a failure.
	    fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus);
	    if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
	        *fStatus = U_ZERO_ERROR;
	    }
	    return fTrieSize;
	}


	//-----------------------------------------------------------------------------------
	//
	// serializeTrie() Put the serialized trie at the specified address.
	// Trust the caller to have given us enough memory.
	// getTrieSize() MUST be called first.
	//
	//-----------------------------------------------------------------------------------
	void RBBISetBuilder::serializeTrie(uint8_t *where) {
	    // Write fTrie into the caller's buffer. The capacity passed along is
	    // fTrieSize, the value computed by getTrieSize(); errors are reported
	    // through fStatus.
	    ucptrie_toBinary(fTrie, where, fTrieSize, fStatus);
	}

	//------------------------------------------------------------------------
	//
	// addValToSets Add a runtime-mapped input value to each uset from a
	// list of uset nodes. (val corresponds to a state table column.)
	// For each of the original Unicode sets - which correspond
	// directly to uset nodes - a logically equivalent expression
	// is constructed in terms of the remapped runtime input
	// symbol set. This function adds one runtime input symbol to
	// a list of sets.
	//
	// The "logically equivalent expression" is the tree for an
	// or-ing together of all of the symbols that go into the set.
	//
	//------------------------------------------------------------------------
	void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
	    // Each element of 'sets' is a pointer to a uset node; add 'val' to the
	    // expression tree hanging off every one of them.
	    for (int32_t ix = 0; ix < sets->size(); ix++) {
	        RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
	        addValToSet(usetNode, val);
	    }
	}

	// Add one runtime input symbol value to the expression tree of a single
	// uset node.
	//   usetNode  the set node whose left child holds the expression.
	//   val       the value to add; stored in a new leafChar node.
	// On allocation failure *fStatus is set to U_MEMORY_ALLOCATION_ERROR and
	// the tree under usetNode is left unchanged.
	void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
	    RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
	    if (leafNode == nullptr) {
	        *fStatus = U_MEMORY_ALLOCATION_ERROR;
	        return;
	    }
	    leafNode->fVal = (unsigned short)val;
	    if (usetNode->fLeftChild == nullptr) {
	        // First symbol for this set: the leaf becomes the whole expression.
	        usetNode->fLeftChild = leafNode;
	        leafNode->fParent    = usetNode;
	    } else {
	        // There are already input symbols present for this set.
	        // Set up an OR node, with the previous stuff as the left child
	        //   and the new value as the right child.
	        RBBINode *orNode = new RBBINode(RBBINode::opOr);
	        if (orNode == nullptr) {
	            // The leaf has not been linked into the tree yet, so nothing
	            // else owns it; release it here to avoid leaking it.
	            delete leafNode;
	            *fStatus = U_MEMORY_ALLOCATION_ERROR;
	            return;
	        }
	        orNode->fLeftChild  = usetNode->fLeftChild;
	        orNode->fRightChild = leafNode;
	        orNode->fLeftChild->fParent  = orNode;
	        orNode->fRightChild->fParent = orNode;
	        usetNode->fLeftChild = orNode;
	        orNode->fParent = usetNode;
	    }
	}


	//------------------------------------------------------------------------
	//
	// getNumCharCategories
	//
	//------------------------------------------------------------------------
	int32_t RBBISetBuilder::getNumCharCategories() const {
	    // The total is the group count plus a fixed offset of three.
	    const int32_t categoryOffset = 3;
	    return categoryOffset + fGroupCount;
	}


	//------------------------------------------------------------------------
	//
	// getDictCategoriesStart
	//
	//------------------------------------------------------------------------
	int32_t RBBISetBuilder::getDictCategoriesStart() const {
	    // Plain accessor for the stored dictionary-categories start value.
	    return this->fDictCategoriesStart;
	}


	//------------------------------------------------------------------------
	//
	// sawBOF
	//
	//------------------------------------------------------------------------
	UBool RBBISetBuilder::sawBOF() const {
	    // Plain accessor for the stored fSawBOF flag.
	    return this->fSawBOF;
	}


	//------------------------------------------------------------------------
	//
	// getFirstChar Given a runtime RBBI character category, find
	// the first UChar32 that is in the set of chars
	// in the category.
	//------------------------------------------------------------------------
	UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
	    // Return the start character of the first range on the list whose
	    // category number matches.
	    for (const RangeDescriptor *candidate = fRangeList; candidate != nullptr; candidate = candidate->fNext) {
	        if (candidate->fNum == category) {
	            return candidate->fStartChar;
	        }
	    }
	    // No range carries this category number.
	    return (UChar32)-1;
	}


	//------------------------------------------------------------------------
	//
	// printRanges A debugging function.
	// dump out all of the range definitions.
	//
	//------------------------------------------------------------------------
	#ifdef RBBI_DEBUG
	void RBBISetBuilder::printRanges() {
	    RangeDescriptor       *rlRange;
	    int                    i;

	    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
	    for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
	        RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);

	        // List the name of every set that this range belongs to.
	        for (i=0; i<rlRange->fIncludesSets->size(); i++) {
	            RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
	            // Default name, used unless the node's grandparent is a varRef
	            // node, in which case that node's text is printed instead.
	            UnicodeString   setName {u"anon"};
	            RBBINode       *setRef = usetNode->fParent;
	            if (setRef != nullptr) {
	                RBBINode *varRef = setRef->fParent;
	                if (varRef != nullptr  &&  varRef->fType == RBBINode::varRef) {
	                    setName = varRef->fText;
	                }
	            }
	            RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
	        }
	        RBBIDebugPrintf("\n");
	    }
	}
	#endif


	//------------------------------------------------------------------------
	//
	// printRangeGroups A debugging function.
	// dump out all of the range groups.
	//
	//------------------------------------------------------------------------
	#ifdef RBBI_DEBUG
	void RBBISetBuilder::printRangeGroups() {
	    int                    i;

	    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
	    for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
	        // Only the first range of each group produces output; the other
	        // members of the group are listed by the inner scan below.
	        if (rlRange->fFirstInGroup) {
	            int groupNum = rlRange->fNum;
	            RBBIDebugPrintf("%2i ", groupNum);

	            if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}

	            // Names of the sets that the group's ranges belong to.
	            for (i=0; i<rlRange->fIncludesSets->size(); i++) {
	                RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
	                UnicodeString   setName = UNICODE_STRING("anon", 4);
	                RBBINode       *setRef = usetNode->fParent;
	                if (setRef != NULL) {
	                    RBBINode *varRef = setRef->fParent;
	                    if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
	                        setName = varRef->fText;
	                    }
	                }
	                RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
	            }

	            // Print every range from here onward that has the same group
	            // number, starting a new output line before every fifth one.
	            i = 0;
	            for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
	                if (tRange->fNum == rlRange->fNum) {
	                    if (i++ % 5 == 0) {
	                        RBBIDebugPrintf("\n ");
	                    }
	                    RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
	                }
	            }
	            RBBIDebugPrintf("\n");
	        }
	    }
	    RBBIDebugPrintf("\n");
	}
	#endif


	//------------------------------------------------------------------------
	//
	// printSets A debugging function.
	// dump out all of the set definitions.
	//
	//------------------------------------------------------------------------
	#ifdef RBBI_DEBUG
	void RBBISetBuilder::printSets() {
	    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");

	    // Walk fUSetNodes by index; the walk ends at the first null element.
	    int index = 0;
	    while (true) {
	        RBBINode *node = (RBBINode *)fRB->fUSetNodes->elementAt(index);
	        if (node == NULL) {
	            break;
	        }

	        RBBIDebugPrintf("%3d ", index);

	        // Print the text of the grandparent node when it is a varRef;
	        // otherwise the set is reported as anonymous.
	        UnicodeString label = UNICODE_STRING("anonymous", 9);
	        RBBINode *parentRef = node->fParent;
	        RBBINode *grandParentRef = (parentRef != NULL) ? parentRef->fParent : NULL;
	        if (grandParentRef != NULL && grandParentRef->fType == RBBINode::varRef) {
	            label = grandParentRef->fText;
	        }

	        RBBI_DEBUG_printUnicodeString(label);
	        RBBIDebugPrintf(" ");
	        RBBI_DEBUG_printUnicodeString(node->fText);
	        RBBIDebugPrintf("\n");

	        // Dump the expression tree attached to this set, if there is one.
	        if (node->fLeftChild != NULL) {
	            RBBINode::printTree(node->fLeftChild, TRUE);
	        }
	        ++index;
	    }
	    RBBIDebugPrintf("\n");
	}
	#endif



	//-------------------------------------------------------------------------------------
	//
	// RangeDescriptor copy constructor
	//
	//-------------------------------------------------------------------------------------

	RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
	        fStartChar{other.fStartChar},
	        fEndChar{other.fEndChar},
	        fNum{other.fNum},
	        fIncludesDict{other.fIncludesDict},
	        fFirstInGroup{other.fFirstInGroup} {

	    // Nothing is allocated when the incoming status already reports a failure.
	    if (U_FAILURE(status)) {
	        return;
	    }

	    // This object gets its own vector; the element pointers themselves
	    // are copied over from 'other' one by one.
	    fIncludesSets = new UVector(status);
	    if (fIncludesSets == nullptr) {
	        status = U_MEMORY_ALLOCATION_ERROR;
	        return;
	    }
	    if (U_FAILURE(status)) {
	        return;
	    }

	    for (int32_t ix = 0; ix < other.fIncludesSets->size(); ++ix) {
	        fIncludesSets->addElement(other.fIncludesSets->elementAt(ix), status);
	    }
	}


	//-------------------------------------------------------------------------------------
	//
	// RangeDescriptor default constructor
	//
	//-------------------------------------------------------------------------------------
	RangeDescriptor::RangeDescriptor(UErrorCode &status) {
	    // Allocate the set vector only when no error has been reported yet.
	    if (U_SUCCESS(status)) {
	        fIncludesSets = new UVector(status);
	        if (fIncludesSets == nullptr) {
	            status = U_MEMORY_ALLOCATION_ERROR;
	        }
	    }
	}


	//-------------------------------------------------------------------------------------
	//
	// RangeDescriptor Destructor
	//
	//-------------------------------------------------------------------------------------
	RangeDescriptor::~RangeDescriptor() {
	    // Release the vector allocated by the constructors, then clear the pointer.
	    delete this->fIncludesSets;
	    this->fIncludesSets = nullptr;
	}

	//-------------------------------------------------------------------------------------
	//
	// RangeDescriptor::split()
	//
	//-------------------------------------------------------------------------------------
	// Split this range in two at 'where'. Afterwards this descriptor covers
	// [fStartChar, where-1] and a newly allocated descriptor, linked in
	// directly after this one, covers [where, old fEndChar].
	//   where   first character of the new second range; must be greater
	//           than fStartChar and no greater than fEndChar.
	//   status  set to U_MEMORY_ALLOCATION_ERROR if the new descriptor cannot
	//           be allocated. On any failure this range is left unchanged.
	void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
	    U_ASSERT(where>fStartChar && where<=fEndChar);
	    RangeDescriptor *nr = new RangeDescriptor(*this, status);
	    if (nr == nullptr) {
	        status = U_MEMORY_ALLOCATION_ERROR;
	        return;
	    }
	    if (U_FAILURE(status)) {
	        // The copy could not be completed; discard it.
	        delete nr;
	        return;
	    }
	    //  RangeDescriptor copy constructor copies all fields.
	    //  Only need to update those that are different after the split.
	    nr->fStartChar = where;
	    this->fEndChar = where-1;
	    nr->fNext      = this->fNext;
	    this->fNext    = nr;
	}


	//-------------------------------------------------------------------------------------
	//
	// RangeDescriptor::isDictionaryRange
	//
	// Test whether this range includes characters from
	// the original Unicode Set named "dictionary".
	//
	// This function looks through the Unicode Sets that
	// the range includes, checking for one named "dictionary"
	//
	// TODO: a faster way would be to find the set node for
	// "dictionary" just once, rather than looking it
	// up by name every time.
	//
	//-------------------------------------------------------------------------------------
	bool RangeDescriptor::isDictionaryRange() {
	static const char16_t *dictionary = u"dictionary";
	for (int32_t i=0; i<fIncludesSets->size(); i++) {
	RBBINode usetNode = (RBBINode )fIncludesSets->elementAt(i);
	RBBINode *setRef = usetNode->fParent;
	if (setRef != nullptr) {
	RBBINode *varRef = setRef->fParent;
	if (varRef && varRef->fType == RBBINode::varRef) {
	const UnicodeString *setName = &varRef->fText;
	if (setName->compare(dictionary, -1) == 0) {
	return true;
	}
	}
	}
	}
	return false;
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */