// third_party/icu/source/i18n/rbt_pars.h - cobalt - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 **********************************************************************
 * Copyright (C) 1999-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */
 #ifndef RBT_PARS_H
 #define RBT_PARS_H

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION
 #ifdef __cplusplus

 #include "unicode/uobject.h"
 #include "unicode/parseerr.h"
 #include "unicode/unorm.h"
 #include "rbt.h"
 #include "hash.h"
 #include "uvector.h"

 U_NAMESPACE_BEGIN

 // Forward declarations of types that this header uses only through
 // pointers, references, or a friend declaration.
 class TransliterationRuleData;
 class UnicodeFunctor;
 class ParseData;
 class RuleHalf;
 class ParsePosition;
 class StringMatcher;

 /**
  * Parses transliterator rule strings.  After parse() returns, the results
  * are read from the public data members dataVector, idBlockVector and
  * compoundFilter.  The private members hold state used while parsing.
  * Instances cannot be copied: the copy constructor and assignment operator
  * are declared private and, per the note on them, have no implementation.
  */
 class TransliteratorParser : public UMemory {

  public:

     /**
      * PUBLIC data member.
      * A Vector of TransliterationRuleData objects, one for each discrete group
      * of rules in the rule set.
      */
     UVector dataVector;

     /**
      * PUBLIC data member.
      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
      */
     UVector idBlockVector;

     /**
      * PUBLIC data member containing the parsed compound filter, if any.
      * See orphanCompoundFilter() for the accessor whose caller owns the result.
      */
     UnicodeSet* compoundFilter;

  private:

     /**
      * The current data object for which we are parsing rules.
      */
     TransliterationRuleData* curData;

     /**
      * Direction of the parse.
      * NOTE(review): presumably the direction passed to parse() -- confirm
      * in the implementation.
      */
     UTransDirection direction;

     /**
      * Parse error information.
      */
     UParseError parseError;

     /**
      * Temporary symbol table used during parsing.
      */
     ParseData* parseData;

     /**
      * Temporary vector of matcher variables.  When parsing is complete, this
      * is copied into the array data.variables.  As with data.variables,
      * element 0 corresponds to character data.variablesBase.
      */
     UVector variablesVector;

     /**
      * Temporary table of variable names.  When parsing is complete, this is
      * copied into data.variableNames.
      */
     Hashtable variableNames;

     /**
      * String of standins for segments.  Used during the parsing of a single
      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
      * to StringMatcher object segmentObjects.elementAt(0), etc.
      */
     UnicodeString segmentStandins;

     /**
      * Vector of StringMatcher objects for segments.  Used during the
      * parsing of a single rule.  Element i of this vector is the object
      * for the standin at segmentStandins.charAt(i); element 0 belongs
      * to "$1".
      */
     UVector segmentObjects;

     /**
      * The next available stand-in for variables.  This starts at some point in
      * the private use area (discovered dynamically) and increments up toward
      * <code>variableLimit</code>.  At any point during parsing, available
      * variables are <code>variableNext..variableLimit-1</code>.
      */
     UChar variableNext;

     /**
      * The exclusive upper limit of the stand-in range for variables.  This
      * is discovered dynamically.  At any point during parsing, available
      * variables are <code>variableNext..variableLimit-1</code>.
      */
     UChar variableLimit;

     /**
      * When we encounter an undefined variable, we do not immediately signal
      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
      * Instead, we save the name of the undefined variable, and substitute
      * in the placeholder char variableLimit - 1, and decrement
      * variableLimit.
      */
     UnicodeString undefinedVariableName;

     /**
      * The stand-in character for the 'dot' set, represented by '.' in
      * patterns.  This is allocated the first time it is needed, and
      * reused thereafter.
      */
     UChar dotStandIn;

 public:

     /**
      * Constructor.
      * @param statusReturn  error code passed by non-const reference.
      *                      NOTE(review): presumably receives construction
      *                      failures -- confirm in the implementation.
      */
     TransliteratorParser(UErrorCode &statusReturn);

     /**
      * Destructor.
      */
     ~TransliteratorParser();

     /**
      * Parse the given rules, in the given direction, and cause this object
      * to implement them.  Any previous rules are discarded.  Typically this
      * method is called exactly once after construction.
      *
      * After this call returns, query the public data members for results.
      * The caller owns the 'data' and 'compoundFilter' data members after
      * this call returns.
      *
      * NOTE(review): no member is named 'data'; this presumably refers to
      * the contents of dataVector -- confirm against the callers.
      * NOTE(review): this comment used to describe the rules both as
      * separated by newline characters ('\n') and as separated by ';'.  The
      * other comments in this class (parsePragma, and the "$a = [a-z];"
      * example) use ';' as the end-of-rule character.
      *
      * @param rules      rules, separated by ';'
      * @param direction  either FORWARD or REVERSE.
      * @param pe         Struct to receive information on position
      *                   of error if an error is encountered
      * @param ec         Output param set to success/failure code.
      */
     void parse(const UnicodeString& rules,
                UTransDirection direction,
                UParseError& pe,
                UErrorCode& ec);

     /**
      * Return the compound filter parsed by parse().  Caller owns result.
      * @return the compound filter parsed by parse().
      */
     UnicodeSet* orphanCompoundFilter();

 private:

     /**
      * Parse the given rules in the given direction.
      *
      * NOTE(review): the comment previously attached here ("Return a
      * representation of this transliterator as source rules", with 'rules'
      * as an output param) did not match the signature: 'rules' is a const
      * reference and the method returns void.  Presumably this is the
      * worker behind parse() -- confirm in the implementation.
      *
      * @param rules      the rules to parse.
      * @param direction  either FORWARD or REVERSE.
      * @param status     Output param set to success/failure code.
      */
     void parseRules(const UnicodeString& rules,
                     UTransDirection direction,
                     UErrorCode& status);

     /**
      * MAIN PARSER.  Parse the next rule in the given rule string, starting
      * at pos.  Return the index after the last character parsed.  Do not
      * parse characters at or after limit.
      *
      * Important:  The character at pos must be a non-whitespace character
      * that is not the comment character.
      *
      * This method handles quoting, escaping, and whitespace removal.  It
      * parses the end-of-rule character.  It recognizes context and cursor
      * indicators.  Once it does a lexical breakdown of the rule at pos, it
      * creates a rule object and adds it to our rule list.
      * @param rule       the rule string being parsed.
      * @param pos        the starting position.
      * @param limit      index just past the last character of the rule.
      * @param status     Output param set to success/failure code.
      * @return           the index after the last character parsed.
      */
     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

     /**
      * Set the variable range to [start, end] (inclusive).
      * @param start    the start value of the range.
      * @param end      the end value of the range.
      * @param status   Output param set to success/failure code.
      */
     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

     /**
      * Assert that the given character is NOT within the variable range.
      * If it is, return false.  This is necessary to ensure that the
      * variable range does not overlap characters used in a rule.
      * @param ch     the given character.
      * @return       True, if the given character is NOT within the variable range.
      */
     UBool checkVariableRange(UChar32 ch) const;

     /**
      * Set the maximum backup to 'backup', in response to a pragma
      * statement.
      * @param backup    the new value to be set.
      */
     void pragmaMaximumBackup(int32_t backup);

     /**
      * Begin normalizing all rules using the given mode, in response
      * to a pragma statement.
      * @param mode    the given mode.
      */
     void pragmaNormalizeRules(UNormalizationMode mode);

     /**
      * Return true if the given rule looks like a pragma.
      * @param rule  the rule string to examine.
      * @param pos   offset to the first non-whitespace character
      *              of the rule.
      * @param limit index just past the last character of the rule.
      * @return true if the given rule looks like a pragma.
      */
     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

     /**
      * Parse a pragma.  This method assumes resemblesPragma() has
      * already returned true.
      * @param rule   the rule string containing the pragma.
      * @param pos    offset to the first non-whitespace character
      *               of the rule.
      * @param limit  index just past the last character of the rule.
      * @param status Output param set to success/failure code.
      * @return the position index after the final ';' of the pragma,
      * or -1 on failure.
      */
     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

     /**
      * Called by main parser upon syntax error.  Search the rule string
      * for the probable end of the rule.  Of course, if the error is that
      * the end of rule marker is missing, then the rule end will not be found.
      * In any case the rule start will be correctly reported.
      * @param parseErrorCode error code.
      * @param msg error description (the second parameter, which is left
      *            unnamed in this declaration).
      * @param start position of first character of current rule.
      * @param status Output param set to success/failure code.
      * @return start position of first character of current rule.
      */
     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
                         UErrorCode& status);

     /**
      * Parse a UnicodeSet out, store it, and return the stand-in character
      * used to represent it.
      *
      * @param rule    the rule for UnicodeSet.
      * @param pos     the position in pattern at which to start parsing.
      * @param status  Output param set to success/failure code.
      * @return        the stand-in character used to represent it.
      */
     UChar parseSet(const UnicodeString& rule,
                    ParsePosition& pos,
                    UErrorCode& status);

     /**
      * Generate and return a stand-in for a new UnicodeFunctor.  Store
      * the matcher (adopt it).
      * @param adopted the UnicodeFunctor to be adopted.
      * @param status  Output param set to success/failure code.
      * @return        a stand-in for a new UnicodeFunctor.
      */
     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

     /**
      * Return the standin for segment seg (1-based).
      * @param seg    the given segment.
      * @param status Output param set to success/failure code.
      * @return       the standIn character for the given segment.
      */
     UChar getSegmentStandin(int32_t seg, UErrorCode& status);

     /**
      * Set the object for segment seg (1-based).
      * @param seg      the given segment.
      * @param adopted  the StringMatcher to be adopted.
      * @param status   Output param set to success/failure code.
      */
     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

     /**
      * Return the stand-in for the dot set.  It is allocated the first
      * time and reused thereafter.
      * @param status Output param set to success/failure code.
      * @return    the stand-in for the dot set.
      */
     UChar getDotStandIn(UErrorCode& status);

     /**
      * Append the value of the given variable name to the given
      * UnicodeString.
      * @param name    the variable name to be appended.
      * @param buf     the given UnicodeString to append to.
      * @param status  Output param set to success/failure code.
      */
     void appendVariableDef(const UnicodeString& name,
                            UnicodeString& buf,
                            UErrorCode& status);

     /**
      * Glue method to get around access restrictions in C++.
      * (The declaration below is commented out.)
      */
     /*static Transliterator* createBasicInstance(const UnicodeString& id,
                                                 const UnicodeString* canonID);*/

     // RuleHalf is granted access to the private members of this class.
     friend class RuleHalf;

     // Disallowed methods; no impl.
     /**
      * Copy constructor (private and unimplemented, so copying is disallowed).
      */
     TransliteratorParser(const TransliteratorParser&);

     /**
      * Assignment operator (private and unimplemented, so assignment is
      * disallowed).
      */
     TransliteratorParser& operator=(const TransliteratorParser&);
 };

 U_NAMESPACE_END

 #endif /* #ifdef __cplusplus */

 /**
  * Strip/convert the following from the transliterator rules:
  * comments
  * newlines
  * white space at the beginning and end of a line
  * unescape \u notation
  *
  * The target must be equal in size to the source.
  * @param source     the transliterator rules to process.
  * @param sourceLen  length of source.
  *                   NOTE(review): presumably counted in UChars -- confirm.
  * @param target     buffer that receives the processed rules.
  * @param status     pointer to the error code.
  * @return NOTE(review): presumably the length of the text written to
  *         target -- confirm in the implementation.
  * @internal
  */
 U_CAPI int32_t
 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

 #endif
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/*
	**********************************************************************
	* Copyright (C) 1999-2011, International Business Machines Corporation
	* and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 11/17/99 aliu Creation.
	**********************************************************************
	*/
	#ifndef RBT_PARS_H
	#define RBT_PARS_H

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION
	#ifdef __cplusplus

	#include "unicode/uobject.h"
	#include "unicode/parseerr.h"
	#include "unicode/unorm.h"
	#include "rbt.h"
	#include "hash.h"
	#include "uvector.h"

	U_NAMESPACE_BEGIN

	// Forward declarations of types that this header uses only through
	// pointers, references, or a friend declaration.
	class TransliterationRuleData;
	class UnicodeFunctor;
	class ParseData;
	class RuleHalf;
	class ParsePosition;
	class StringMatcher;

	/**
	 * Parses transliterator rule strings.  After parse() returns, the results
	 * are read from the public data members dataVector, idBlockVector and
	 * compoundFilter.  The private members hold state used while parsing.
	 * Instances cannot be copied: the copy constructor and assignment operator
	 * are declared private and, per the note on them, have no implementation.
	 */
	class TransliteratorParser : public UMemory {

	public:

	    /**
	     * PUBLIC data member.
	     * A Vector of TransliterationRuleData objects, one for each discrete group
	     * of rules in the rule set.
	     */
	    UVector dataVector;

	    /**
	     * PUBLIC data member.
	     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
	     */
	    UVector idBlockVector;

	    /**
	     * PUBLIC data member containing the parsed compound filter, if any.
	     * See orphanCompoundFilter() for the accessor whose caller owns the result.
	     */
	    UnicodeSet* compoundFilter;

	private:

	    /**
	     * The current data object for which we are parsing rules.
	     */
	    TransliterationRuleData* curData;

	    /**
	     * Direction of the parse.
	     * NOTE(review): presumably the direction passed to parse() -- confirm
	     * in the implementation.
	     */
	    UTransDirection direction;

	    /**
	     * Parse error information.
	     */
	    UParseError parseError;

	    /**
	     * Temporary symbol table used during parsing.
	     */
	    ParseData* parseData;

	    /**
	     * Temporary vector of matcher variables.  When parsing is complete, this
	     * is copied into the array data.variables.  As with data.variables,
	     * element 0 corresponds to character data.variablesBase.
	     */
	    UVector variablesVector;

	    /**
	     * Temporary table of variable names.  When parsing is complete, this is
	     * copied into data.variableNames.
	     */
	    Hashtable variableNames;

	    /**
	     * String of standins for segments.  Used during the parsing of a single
	     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
	     * to StringMatcher object segmentObjects.elementAt(0), etc.
	     */
	    UnicodeString segmentStandins;

	    /**
	     * Vector of StringMatcher objects for segments.  Used during the
	     * parsing of a single rule.  Element i of this vector is the object
	     * for the standin at segmentStandins.charAt(i); element 0 belongs
	     * to "$1".
	     */
	    UVector segmentObjects;

	    /**
	     * The next available stand-in for variables.  This starts at some point in
	     * the private use area (discovered dynamically) and increments up toward
	     * <code>variableLimit</code>.  At any point during parsing, available
	     * variables are <code>variableNext..variableLimit-1</code>.
	     */
	    UChar variableNext;

	    /**
	     * The exclusive upper limit of the stand-in range for variables.  This
	     * is discovered dynamically.  At any point during parsing, available
	     * variables are <code>variableNext..variableLimit-1</code>.
	     */
	    UChar variableLimit;

	    /**
	     * When we encounter an undefined variable, we do not immediately signal
	     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
	     * Instead, we save the name of the undefined variable, and substitute
	     * in the placeholder char variableLimit - 1, and decrement
	     * variableLimit.
	     */
	    UnicodeString undefinedVariableName;

	    /**
	     * The stand-in character for the 'dot' set, represented by '.' in
	     * patterns.  This is allocated the first time it is needed, and
	     * reused thereafter.
	     */
	    UChar dotStandIn;

	public:

	    /**
	     * Constructor.
	     * @param statusReturn  error code passed by non-const reference.
	     *                      NOTE(review): presumably receives construction
	     *                      failures -- confirm in the implementation.
	     */
	    TransliteratorParser(UErrorCode &statusReturn);

	    /**
	     * Destructor.
	     */
	    ~TransliteratorParser();

	    /**
	     * Parse the given rules, in the given direction, and cause this object
	     * to implement them.  Any previous rules are discarded.  Typically this
	     * method is called exactly once after construction.
	     *
	     * After this call returns, query the public data members for results.
	     * The caller owns the 'data' and 'compoundFilter' data members after
	     * this call returns.
	     *
	     * NOTE(review): no member is named 'data'; this presumably refers to
	     * the contents of dataVector -- confirm against the callers.
	     * NOTE(review): this comment used to describe the rules both as
	     * separated by newline characters ('\n') and as separated by ';'.  The
	     * other comments in this class (parsePragma, and the "$a = [a-z];"
	     * example) use ';' as the end-of-rule character.
	     *
	     * @param rules      rules, separated by ';'
	     * @param direction  either FORWARD or REVERSE.
	     * @param pe         Struct to receive information on position
	     *                   of error if an error is encountered
	     * @param ec         Output param set to success/failure code.
	     */
	    void parse(const UnicodeString& rules,
	               UTransDirection direction,
	               UParseError& pe,
	               UErrorCode& ec);

	    /**
	     * Return the compound filter parsed by parse().  Caller owns result.
	     * @return the compound filter parsed by parse().
	     */
	    UnicodeSet* orphanCompoundFilter();

	private:

	    /**
	     * Parse the given rules in the given direction.
	     *
	     * NOTE(review): the comment previously attached here ("Return a
	     * representation of this transliterator as source rules", with 'rules'
	     * as an output param) did not match the signature: 'rules' is a const
	     * reference and the method returns void.  Presumably this is the
	     * worker behind parse() -- confirm in the implementation.
	     *
	     * @param rules      the rules to parse.
	     * @param direction  either FORWARD or REVERSE.
	     * @param status     Output param set to success/failure code.
	     */
	    void parseRules(const UnicodeString& rules,
	                    UTransDirection direction,
	                    UErrorCode& status);

	    /**
	     * MAIN PARSER.  Parse the next rule in the given rule string, starting
	     * at pos.  Return the index after the last character parsed.  Do not
	     * parse characters at or after limit.
	     *
	     * Important:  The character at pos must be a non-whitespace character
	     * that is not the comment character.
	     *
	     * This method handles quoting, escaping, and whitespace removal.  It
	     * parses the end-of-rule character.  It recognizes context and cursor
	     * indicators.  Once it does a lexical breakdown of the rule at pos, it
	     * creates a rule object and adds it to our rule list.
	     * @param rule       the rule string being parsed.
	     * @param pos        the starting position.
	     * @param limit      index just past the last character of the rule.
	     * @param status     Output param set to success/failure code.
	     * @return           the index after the last character parsed.
	     */
	    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

	    /**
	     * Set the variable range to [start, end] (inclusive).
	     * @param start    the start value of the range.
	     * @param end      the end value of the range.
	     * @param status   Output param set to success/failure code.
	     */
	    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

	    /**
	     * Assert that the given character is NOT within the variable range.
	     * If it is, return false.  This is necessary to ensure that the
	     * variable range does not overlap characters used in a rule.
	     * @param ch     the given character.
	     * @return       True, if the given character is NOT within the variable range.
	     */
	    UBool checkVariableRange(UChar32 ch) const;

	    /**
	     * Set the maximum backup to 'backup', in response to a pragma
	     * statement.
	     * @param backup    the new value to be set.
	     */
	    void pragmaMaximumBackup(int32_t backup);

	    /**
	     * Begin normalizing all rules using the given mode, in response
	     * to a pragma statement.
	     * @param mode    the given mode.
	     */
	    void pragmaNormalizeRules(UNormalizationMode mode);

	    /**
	     * Return true if the given rule looks like a pragma.
	     * @param rule  the rule string to examine.
	     * @param pos   offset to the first non-whitespace character
	     *              of the rule.
	     * @param limit index just past the last character of the rule.
	     * @return true if the given rule looks like a pragma.
	     */
	    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

	    /**
	     * Parse a pragma.  This method assumes resemblesPragma() has
	     * already returned true.
	     * @param rule   the rule string containing the pragma.
	     * @param pos    offset to the first non-whitespace character
	     *               of the rule.
	     * @param limit  index just past the last character of the rule.
	     * @param status Output param set to success/failure code.
	     * @return the position index after the final ';' of the pragma,
	     * or -1 on failure.
	     */
	    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

	    /**
	     * Called by main parser upon syntax error.  Search the rule string
	     * for the probable end of the rule.  Of course, if the error is that
	     * the end of rule marker is missing, then the rule end will not be found.
	     * In any case the rule start will be correctly reported.
	     * @param parseErrorCode error code.
	     * @param msg error description (the second parameter, which is left
	     *            unnamed in this declaration).
	     * @param start position of first character of current rule.
	     * @param status Output param set to success/failure code.
	     * @return start position of first character of current rule.
	     */
	    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
	                        UErrorCode& status);

	    /**
	     * Parse a UnicodeSet out, store it, and return the stand-in character
	     * used to represent it.
	     *
	     * @param rule    the rule for UnicodeSet.
	     * @param pos     the position in pattern at which to start parsing.
	     * @param status  Output param set to success/failure code.
	     * @return        the stand-in character used to represent it.
	     */
	    UChar parseSet(const UnicodeString& rule,
	                   ParsePosition& pos,
	                   UErrorCode& status);

	    /**
	     * Generate and return a stand-in for a new UnicodeFunctor.  Store
	     * the matcher (adopt it).
	     * @param adopted the UnicodeFunctor to be adopted.
	     * @param status  Output param set to success/failure code.
	     * @return        a stand-in for a new UnicodeFunctor.
	     */
	    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

	    /**
	     * Return the standin for segment seg (1-based).
	     * @param seg    the given segment.
	     * @param status Output param set to success/failure code.
	     * @return       the standIn character for the given segment.
	     */
	    UChar getSegmentStandin(int32_t seg, UErrorCode& status);

	    /**
	     * Set the object for segment seg (1-based).
	     * @param seg      the given segment.
	     * @param adopted  the StringMatcher to be adopted.
	     * @param status   Output param set to success/failure code.
	     */
	    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

	    /**
	     * Return the stand-in for the dot set.  It is allocated the first
	     * time and reused thereafter.
	     * @param status Output param set to success/failure code.
	     * @return    the stand-in for the dot set.
	     */
	    UChar getDotStandIn(UErrorCode& status);

	    /**
	     * Append the value of the given variable name to the given
	     * UnicodeString.
	     * @param name    the variable name to be appended.
	     * @param buf     the given UnicodeString to append to.
	     * @param status  Output param set to success/failure code.
	     */
	    void appendVariableDef(const UnicodeString& name,
	                           UnicodeString& buf,
	                           UErrorCode& status);

	    /**
	     * Glue method to get around access restrictions in C++.
	     * (The declaration below is commented out.)
	     */
	    /*static Transliterator* createBasicInstance(const UnicodeString& id,
	                                                const UnicodeString* canonID);*/

	    // RuleHalf is granted access to the private members of this class.
	    friend class RuleHalf;

	    // Disallowed methods; no impl.
	    /**
	     * Copy constructor (private and unimplemented, so copying is disallowed).
	     */
	    TransliteratorParser(const TransliteratorParser&);

	    /**
	     * Assignment operator (private and unimplemented, so assignment is
	     * disallowed).
	     */
	    TransliteratorParser& operator=(const TransliteratorParser&);
	};

	U_NAMESPACE_END

	#endif /* #ifdef __cplusplus */

	/**
	 * Strip/convert the following from the transliterator rules:
	 * comments
	 * newlines
	 * white space at the beginning and end of a line
	 * unescape \u notation
	 *
	 * The target must be equal in size to the source.
	 * @param source     the transliterator rules to process.
	 * @param sourceLen  length of source.
	 *                   NOTE(review): presumably counted in UChars -- confirm.
	 * @param target     buffer that receives the processed rules.
	 * @param status     pointer to the error code.
	 * @return NOTE(review): presumably the length of the text written to
	 *         target -- confirm in the implementation.
	 * @internal
	 */
	U_CAPI int32_t
	utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */

	#endif