src/third_party/icu/source/i18n/strrepl.cpp - cobalt - Git at Google

 /*
 **********************************************************************
 *   Copyright (c) 2002-2004, International Business Machines Corporation
 *   and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   01/21/2002  aliu        Creation.
 **********************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION

 #include "strrepl.h"
 #include "rbt_data.h"
 #include "util.h"
 #include "unicode/uniset.h"

 U_NAMESPACE_BEGIN

 // Zero-length UChar string (just the terminator).  replace() passes it
 // as the replacement text when it deletes a range of the Replaceable.
 static const UChar EMPTY[] = { 0 }; // empty string: ""

 // Out-of-line definition of the UnicodeReplacer destructor; the body
 // is intentionally empty.
 UnicodeReplacer::~UnicodeReplacer() {}
 // ICU boilerplate macro, defined outside this file.  Presumably it
 // supplies the class-ID (RTTI) members for StringReplacer -- confirm
 // against unicode/uobject.h.
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)

 /**
  * Creates a StringReplacer that emits the given output text and
  * also repositions the cursor.
  * @param theOutput text that replaces the input text when replace()
  * is called.  May contain stand-in characters that represent nested
  * replacers.
  * @param theCursorPos cursor position, expressed relative to the
  * output text, from which replace() computes the new cursor
  * @param theData transliterator context object that maps stand-in
  * characters to UnicodeReplacer objects
  */
 StringReplacer::StringReplacer(const UnicodeString& theOutput,
                                int32_t theCursorPos,
                                const TransliterationRuleData* theData) {
     // Replacement text and the context that resolves its stand-ins.
     this->output = theOutput;
     this->data = theData;

     // This overload always carries an explicit cursor position.
     this->hasCursor = TRUE;
     this->cursorPos = theCursorPos;

     // Start on the complex path; replace() clears the flag on its
     // first run if it finds no nested replacers in the output.
     this->isComplex = TRUE;
 }

 /**
  * Creates a StringReplacer that emits the given output text and
  * leaves the cursor untouched.
  * @param theOutput text that replaces the input text when replace()
  * is called.  May contain stand-in characters that represent nested
  * replacers.
  * @param theData transliterator context object that maps stand-in
  * characters to UnicodeReplacer objects
  */
 StringReplacer::StringReplacer(const UnicodeString& theOutput,
                                const TransliterationRuleData* theData) {
     // Replacement text and the context that resolves its stand-ins.
     this->output = theOutput;
     this->data = theData;

     // No cursor was supplied: cursorPos is only a placeholder.
     this->hasCursor = FALSE;
     this->cursorPos = 0;

     // Start on the complex path; replace() clears the flag on its
     // first run if it finds no nested replacers in the output.
     this->isComplex = TRUE;
 }

 /**
  * Copy constructor.  Copies the base sub-objects and every field of
  * 'other'; the 'data' pointer is copied as-is, so both objects refer
  * to the same TransliterationRuleData.
  */
 StringReplacer::StringReplacer(const StringReplacer& other) :
     UnicodeFunctor(other),
     UnicodeReplacer(other)
 {
     this->output = other.output;
     this->data = other.data;
     this->hasCursor = other.hasCursor;
     this->cursorPos = other.cursorPos;
     this->isComplex = other.isComplex;
 }

 /**
  * Destructor.  Nothing is released explicitly here; the body is
  * intentionally empty.
  */
 StringReplacer::~StringReplacer() {}

 /**
  * Implement UnicodeFunctor.
  * @return a newly allocated copy of this object, produced by the
  * copy constructor
  */
 UnicodeFunctor* StringReplacer::clone() const {
     StringReplacer* const duplicate = new StringReplacer(*this);
     return duplicate;
 }

 /**
  * Implement UnicodeFunctor.
  * @return this object viewed through its UnicodeReplacer interface.
  * The method is const, so constness is cast away explicitly.
  */
 UnicodeReplacer* StringReplacer::toReplacer() const {
     // Drop const, then rely on the implicit derived-to-base conversion.
     return const_cast<StringReplacer*>(this);
 }

 /**
  * UnicodeReplacer API.
  *
  * Replaces the range [start, limit) of 'text' with the text generated
  * from 'output'.  When 'output' contains stand-in characters, each one
  * is expanded by delegating to the nested replacer obtained from
  * data->lookupReplacer().
  *
  * @param text the text being modified
  * @param start offset of the first code unit to replace
  * @param limit offset just past the last code unit to replace
  * @param cursor receives the new cursor position when this replacer
  * has a cursor (hasCursor); it is also handed to nested replacers
  * @return the length, in code units, of the text that replaced
  * [start, limit)
  */
 int32_t StringReplacer::replace(Replaceable& text,
                                 int32_t start,
                                 int32_t limit,
                                 int32_t& cursor) {
     int32_t outLen;
     int32_t newStart = 0;

     // NOTE: It should be possible to _always_ run the complex
     // processing code; just slower.  If not, then there is a bug
     // in the complex processing code.

     // Simple (no nested replacers) Processing Code :
     if (!isComplex) {
         text.handleReplaceBetween(start, limit, output);
         outLen = output.length();

         // Setup default cursor position (for cursorPos within output)
         newStart = cursorPos;
     }

     // Complex (nested replacers) Processing Code :
     else {
         /* When there are segments to be copied, use the Replaceable.copy()
          * API in order to retain out-of-band data.  Copy everything to the
          * end of the string, then copy them back over the key.  This preserves
          * the integrity of indices into the key and surrounding context while
          * generating the output text.
          */
         UnicodeString buf;
         int32_t oOutput; // offset into 'output'
         // Assume there are no nested replacers until one is found in
         // the loop below.  If none is found the flag stays FALSE and
         // later calls take the simple path above.
         isComplex = FALSE;

         // The temporary buffer starts at tempStart, and extends
         // to destLimit.  The start of the buffer has a single
         // character from before the key.  This provides style
         // data when additional characters are filled into the
         // temporary buffer.  If there is nothing to the left, use
         // the non-character U+FFFF, which Replaceable subclasses
         // should treat specially as a "no-style character."
         // destStart points to the point after the style context
         // character, so it is tempStart+1 or tempStart+2.
         int32_t tempStart = text.length(); // start of temp buffer
         int32_t destStart = tempStart; // copy new text to here
         if (start > 0) {
             // Copy the whole code point preceding the key (1 or 2 units).
             int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
             text.copy(start-len, start, tempStart);
             destStart += len;
         } else {
             UnicodeString str((UChar) 0xFFFF);
             text.handleReplaceBetween(tempStart, tempStart, str);
             destStart++;
         }
         int32_t destLimit = destStart;

         // Walk 'output' one code point at a time, appending the
         // generated text at destLimit.
         for (oOutput=0; oOutput<output.length(); ) {
             if (oOutput == cursorPos) {
                 // Record the position of the cursor
                 newStart = destLimit - destStart; // relative to start
             }
             UChar32 c = output.char32At(oOutput);
             UnicodeReplacer* r = data->lookupReplacer(c);
             if (r == NULL) {
                 // Accumulate straight (non-segment) text.
                 buf.append(c);
             } else {
                 isComplex = TRUE;

                 // Insert any accumulated straight text.
                 if (buf.length() > 0) {
                     text.handleReplaceBetween(destLimit, destLimit, buf);
                     destLimit += buf.length();
                     buf.truncate(0);
                 }

                 // Delegate output generation to replacer object
                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
                 destLimit += len;
             }
             oOutput += UTF_CHAR_LENGTH(c);
         }
         // Insert any accumulated straight text.
         if (buf.length() > 0) {
             text.handleReplaceBetween(destLimit, destLimit, buf);
             destLimit += buf.length();
         }
         // The cursor may sit exactly at the end of 'output', which the
         // loop above never visits.
         if (oOutput == cursorPos) {
             // Record the position of the cursor
             newStart = destLimit - destStart; // relative to start
         }

         outLen = destLimit - destStart;

         // Copy new text to start, and delete it
         text.copy(destStart, destLimit, start);
         // The temporary buffer (including its style-context character)
         // is addressed outLen units further right here, to account for
         // the text just copied in at 'start'.
         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

         // Delete the old text (the key)
         text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
     }

     if (hasCursor) {
         // Adjust the cursor for positions outside the key.  These
         // refer to code points rather than code units.  If cursorPos
         // is within the output string, then use newStart, which has
         // already been set above.
         if (cursorPos < 0) {
             newStart = start;
             int32_t n = cursorPos;
             // Outside the output string, cursorPos counts code points
             while (n < 0 && newStart > 0) {
                 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                 ++n;
             }
             // Any count left over (the start of the text was reached
             // first) is applied as a plain offset.
             newStart += n;
         } else if (cursorPos > output.length()) {
             newStart = start + outLen;
             int32_t n = cursorPos - output.length();
             // Outside the output string, cursorPos counts code points
             while (n > 0 && newStart < text.length()) {
                 newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                 --n;
             }
             // Any count left over (the end of the text was reached
             // first) is applied as a plain offset.
             newStart += n;
         } else {
             // Cursor is within output string.  It has been set up above
             // to be relative to start.
             newStart += start;
         }

         cursor = newStart;
     }

     return outLen;
 }

 /**
  * UnicodeReplacer API.
  *
  * Rebuilds the rule-source form of this replacer in 'rule'.  Literal
  * output characters are appended through ICU_Utility::appendToRule();
  * each stand-in character is expanded to the pattern of its nested
  * replacer, surrounded by spaces.  When this replacer has a cursor,
  * a '|' marks its position and one '@' is emitted for every position
  * by which the cursor lies before or after the output text.
  *
  * @param rule receives the pattern; any previous content is discarded
  * @param escapeUnprintable forwarded to ICU_Utility::appendToRule()
  * and to nested replacers
  * @return a reference to 'rule'
  */
 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                  UBool escapeUnprintable) const {
     rule.truncate(0);
     UnicodeString quoteBuf;

     int32_t cursor = cursorPos;

     // Handle a cursor preceding the output
     if (hasCursor && cursor < 0) {
         // Increment in the loop header so that cursor finishes at
         // exactly 0.  A post-increment inside the loop condition would
         // leave it at 1 and misplace (or drop) the '|' emitted below.
         for (; cursor < 0; ++cursor) {
             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
         }
         // cursor is now 0: fall through and append '|' below, in
         // front of the first output character.
     }

     for (int32_t i=0; i<output.length(); ++i) {
         if (hasCursor && i == cursor) {
             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
         }
         UChar c = output.charAt(i); // Ok to use 16-bits here

         UnicodeReplacer* r = data->lookupReplacer(c);
         if (r == NULL) {
             // Ordinary character: emit it as literal rule text.
             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
         } else {
             // Stand-in: emit the nested replacer's own pattern,
             // padded with a space on each side.
             UnicodeString buf;
             r->toReplacerPattern(buf, escapeUnprintable);
             buf.insert(0, (UChar)0x20);
             buf.append((UChar)0x20);
             ICU_Utility::appendToRule(rule, buf,
                                       TRUE, escapeUnprintable, quoteBuf);
         }
     }

     // Handle a cursor after the output.  Use > rather than >= because
     // if cursor == output.length() it is at the end of the output,
     // which is the default position, so we need not emit it.
     if (hasCursor && cursor > output.length()) {
         cursor -= output.length();
         while (cursor-- > 0) {
             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
         }
         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
     }
     // Flush quoteBuf out to result
     ICU_Utility::appendToRule(rule, -1,
                               TRUE, escapeUnprintable, quoteBuf);

     return rule;
 }

 /**
  * Implement UnicodeReplacer.
  *
  * Adds to 'toUnionTo' every code point of the output text that is
  * not a stand-in.  For each stand-in, the nested replacer is asked
  * to add its own replacement set instead.
  * @param toUnionTo the set that receives the code points
  */
 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
     int32_t pos = 0;
     while (pos < output.length()) {
         const UChar32 cp = output.char32At(pos);
         // Step over the whole code point (one or two code units).
         pos += UTF_CHAR_LENGTH(cp);

         UnicodeReplacer* nested = data->lookupReplacer(cp);
         if (nested != NULL) {
             nested->addReplacementSetTo(toUnionTo);
             continue;
         }
         toUnionTo.add(cp);
     }
 }

 /**
  * UnicodeFunctor API
  */
 void StringReplacer::setData(const TransliterationRuleData* d) {
     data = d;
     int32_t i = 0;
     while (i<output.length()) {
         UChar32 c = output.char32At(i);
         UnicodeFunctor* f = data->lookup(c);
         if (f != NULL) {
             f->setData(data);
         }
         i += UTF_CHAR_LENGTH(c);
     }
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

 //eof
	/*
	**********************************************************************
	* Copyright (c) 2002-2004, International Business Machines Corporation
	* and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 01/21/2002 aliu Creation.
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION

	#include "strrepl.h"
	#include "rbt_data.h"
	#include "util.h"
	#include "unicode/uniset.h"

	U_NAMESPACE_BEGIN

	// Zero-length UChar string (just the terminator).  replace() passes it
	// as the replacement text when it deletes a range of the Replaceable.
	static const UChar EMPTY[] = { 0 }; // empty string: ""

	// Out-of-line definition of the UnicodeReplacer destructor; the body
	// is intentionally empty.
	UnicodeReplacer::~UnicodeReplacer() {}
	// ICU boilerplate macro, defined outside this file.  Presumably it
	// supplies the class-ID (RTTI) members for StringReplacer -- confirm
	// against unicode/uobject.h.
	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)

	/**
	 * Creates a StringReplacer that emits the given output text and
	 * also repositions the cursor.
	 * @param theOutput text that replaces the input text when replace()
	 * is called.  May contain stand-in characters that represent nested
	 * replacers.
	 * @param theCursorPos cursor position, expressed relative to the
	 * output text, from which replace() computes the new cursor
	 * @param theData transliterator context object that maps stand-in
	 * characters to UnicodeReplacer objects
	 */
	StringReplacer::StringReplacer(const UnicodeString& theOutput,
	                               int32_t theCursorPos,
	                               const TransliterationRuleData* theData) {
	    // Replacement text and the context that resolves its stand-ins.
	    this->output = theOutput;
	    this->data = theData;

	    // This overload always carries an explicit cursor position.
	    this->hasCursor = TRUE;
	    this->cursorPos = theCursorPos;

	    // Start on the complex path; replace() clears the flag on its
	    // first run if it finds no nested replacers in the output.
	    this->isComplex = TRUE;
	}

	/**
	 * Creates a StringReplacer that emits the given output text and
	 * leaves the cursor untouched.
	 * @param theOutput text that replaces the input text when replace()
	 * is called.  May contain stand-in characters that represent nested
	 * replacers.
	 * @param theData transliterator context object that maps stand-in
	 * characters to UnicodeReplacer objects
	 */
	StringReplacer::StringReplacer(const UnicodeString& theOutput,
	                               const TransliterationRuleData* theData) {
	    // Replacement text and the context that resolves its stand-ins.
	    this->output = theOutput;
	    this->data = theData;

	    // No cursor was supplied: cursorPos is only a placeholder.
	    this->hasCursor = FALSE;
	    this->cursorPos = 0;

	    // Start on the complex path; replace() clears the flag on its
	    // first run if it finds no nested replacers in the output.
	    this->isComplex = TRUE;
	}

	/**
	 * Copy constructor.  Copies the base sub-objects and every field of
	 * 'other'; the 'data' pointer is copied as-is, so both objects refer
	 * to the same TransliterationRuleData.
	 */
	StringReplacer::StringReplacer(const StringReplacer& other) :
	    UnicodeFunctor(other),
	    UnicodeReplacer(other)
	{
	    this->output = other.output;
	    this->data = other.data;
	    this->hasCursor = other.hasCursor;
	    this->cursorPos = other.cursorPos;
	    this->isComplex = other.isComplex;
	}

	/**
	 * Destructor.  Nothing is released explicitly here; the body is
	 * intentionally empty.
	 */
	StringReplacer::~StringReplacer() {}

	/**
	 * Implement UnicodeFunctor.
	 * @return a newly allocated copy of this object, produced by the
	 * copy constructor
	 */
	UnicodeFunctor* StringReplacer::clone() const {
	    StringReplacer* const duplicate = new StringReplacer(*this);
	    return duplicate;
	}

	/**
	 * Implement UnicodeFunctor.
	 * @return this object viewed through its UnicodeReplacer interface.
	 * The method is const, so constness is cast away explicitly.
	 */
	UnicodeReplacer* StringReplacer::toReplacer() const {
	    // Drop const, then rely on the implicit derived-to-base conversion.
	    return const_cast<StringReplacer*>(this);
	}

	/**
	* UnicodeReplacer API.
	*
	* Replaces the range [start, limit) of 'text' with the text generated
	* from 'output'. When 'output' contains stand-in characters, each one
	* is expanded by delegating to the nested replacer obtained from
	* data->lookupReplacer().
	*
	* @param text the text being modified
	* @param start offset of the first code unit to replace
	* @param limit offset just past the last code unit to replace
	* @param cursor receives the new cursor position when this replacer
	* has a cursor (hasCursor); it is also handed to nested replacers
	* @return the length, in code units, of the text that replaced
	* [start, limit)
	*/
	int32_t StringReplacer::replace(Replaceable& text,
	int32_t start,
	int32_t limit,
	int32_t& cursor) {
	int32_t outLen;
	int32_t newStart = 0;

	// NOTE: It should be possible to _always_ run the complex
	// processing code; just slower. If not, then there is a bug
	// in the complex processing code.

	// Simple (no nested replacers) Processing Code :
	if (!isComplex) {
	text.handleReplaceBetween(start, limit, output);
	outLen = output.length();

	// Setup default cursor position (for cursorPos within output)
	newStart = cursorPos;
	}

	// Complex (nested replacers) Processing Code :
	else {
	/* When there are segments to be copied, use the Replaceable.copy()
	* API in order to retain out-of-band data. Copy everything to the
	* end of the string, then copy them back over the key. This preserves
	* the integrity of indices into the key and surrounding context while
	* generating the output text.
	*/
	UnicodeString buf;
	int32_t oOutput; // offset into 'output'
	// Assume there are no nested replacers until one is found in
	// the loop below. If none is found the flag stays FALSE and
	// later calls take the simple path above.
	isComplex = FALSE;

	// The temporary buffer starts at tempStart, and extends
	// to destLimit. The start of the buffer has a single
	// character from before the key. This provides style
	// data when additional characters are filled into the
	// temporary buffer. If there is nothing to the left, use
	// the non-character U+FFFF, which Replaceable subclasses
	// should treat specially as a "no-style character."
	// destStart points to the point after the style context
	// character, so it is tempStart+1 or tempStart+2.
	int32_t tempStart = text.length(); // start of temp buffer
	int32_t destStart = tempStart; // copy new text to here
	if (start > 0) {
	// Copy the whole code point preceding the key (1 or 2 units).
	int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
	text.copy(start-len, start, tempStart);
	destStart += len;
	} else {
	UnicodeString str((UChar) 0xFFFF);
	text.handleReplaceBetween(tempStart, tempStart, str);
	destStart++;
	}
	int32_t destLimit = destStart;

	// Walk 'output' one code point at a time, appending the
	// generated text at destLimit.
	for (oOutput=0; oOutput<output.length(); ) {
	if (oOutput == cursorPos) {
	// Record the position of the cursor
	newStart = destLimit - destStart; // relative to start
	}
	UChar32 c = output.char32At(oOutput);
	UnicodeReplacer* r = data->lookupReplacer(c);
	if (r == NULL) {
	// Accumulate straight (non-segment) text.
	buf.append(c);
	} else {
	isComplex = TRUE;

	// Insert any accumulated straight text.
	if (buf.length() > 0) {
	text.handleReplaceBetween(destLimit, destLimit, buf);
	destLimit += buf.length();
	buf.truncate(0);
	}

	// Delegate output generation to replacer object
	int32_t len = r->replace(text, destLimit, destLimit, cursor);
	destLimit += len;
	}
	oOutput += UTF_CHAR_LENGTH(c);
	}
	// Insert any accumulated straight text.
	if (buf.length() > 0) {
	text.handleReplaceBetween(destLimit, destLimit, buf);
	destLimit += buf.length();
	}
	// The cursor may sit exactly at the end of 'output', which the
	// loop above never visits.
	if (oOutput == cursorPos) {
	// Record the position of the cursor
	newStart = destLimit - destStart; // relative to start
	}

	outLen = destLimit - destStart;

	// Copy new text to start, and delete it
	text.copy(destStart, destLimit, start);
	// The temporary buffer (including its style-context character)
	// is addressed outLen units further right here, to account for
	// the text just copied in at 'start'.
	text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

	// Delete the old text (the key)
	text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
	}

	if (hasCursor) {
	// Adjust the cursor for positions outside the key. These
	// refer to code points rather than code units. If cursorPos
	// is within the output string, then use newStart, which has
	// already been set above.
	if (cursorPos < 0) {
	newStart = start;
	int32_t n = cursorPos;
	// Outside the output string, cursorPos counts code points
	while (n < 0 && newStart > 0) {
	newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
	++n;
	}
	// Any count left over (the start of the text was reached
	// first) is applied as a plain offset.
	newStart += n;
	} else if (cursorPos > output.length()) {
	newStart = start + outLen;
	int32_t n = cursorPos - output.length();
	// Outside the output string, cursorPos counts code points
	while (n > 0 && newStart < text.length()) {
	newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
	--n;
	}
	// Any count left over (the end of the text was reached
	// first) is applied as a plain offset.
	newStart += n;
	} else {
	// Cursor is within output string. It has been set up above
	// to be relative to start.
	newStart += start;
	}

	cursor = newStart;
	}

	return outLen;
	}

	/**
	 * UnicodeReplacer API.
	 *
	 * Rebuilds the rule-source form of this replacer in 'rule'.  Literal
	 * output characters are appended through ICU_Utility::appendToRule();
	 * each stand-in character is expanded to the pattern of its nested
	 * replacer, surrounded by spaces.  When this replacer has a cursor,
	 * a '|' marks its position and one '@' is emitted for every position
	 * by which the cursor lies before or after the output text.
	 *
	 * @param rule receives the pattern; any previous content is discarded
	 * @param escapeUnprintable forwarded to ICU_Utility::appendToRule()
	 * and to nested replacers
	 * @return a reference to 'rule'
	 */
	UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
	                                                 UBool escapeUnprintable) const {
	    rule.truncate(0);
	    UnicodeString quoteBuf;

	    int32_t cursor = cursorPos;

	    // Handle a cursor preceding the output
	    if (hasCursor && cursor < 0) {
	        // Increment in the loop header so that cursor finishes at
	        // exactly 0.  A post-increment inside the loop condition would
	        // leave it at 1 and misplace (or drop) the '|' emitted below.
	        for (; cursor < 0; ++cursor) {
	            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
	        }
	        // cursor is now 0: fall through and append '|' below, in
	        // front of the first output character.
	    }

	    for (int32_t i=0; i<output.length(); ++i) {
	        if (hasCursor && i == cursor) {
	            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
	        }
	        UChar c = output.charAt(i); // Ok to use 16-bits here

	        UnicodeReplacer* r = data->lookupReplacer(c);
	        if (r == NULL) {
	            // Ordinary character: emit it as literal rule text.
	            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
	        } else {
	            // Stand-in: emit the nested replacer's own pattern,
	            // padded with a space on each side.
	            UnicodeString buf;
	            r->toReplacerPattern(buf, escapeUnprintable);
	            buf.insert(0, (UChar)0x20);
	            buf.append((UChar)0x20);
	            ICU_Utility::appendToRule(rule, buf,
	                                      TRUE, escapeUnprintable, quoteBuf);
	        }
	    }

	    // Handle a cursor after the output.  Use > rather than >= because
	    // if cursor == output.length() it is at the end of the output,
	    // which is the default position, so we need not emit it.
	    if (hasCursor && cursor > output.length()) {
	        cursor -= output.length();
	        while (cursor-- > 0) {
	            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
	        }
	        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
	    }
	    // Flush quoteBuf out to result
	    ICU_Utility::appendToRule(rule, -1,
	                              TRUE, escapeUnprintable, quoteBuf);

	    return rule;
	}

	/**
	 * Implement UnicodeReplacer.
	 *
	 * Adds to 'toUnionTo' every code point of the output text that is
	 * not a stand-in.  For each stand-in, the nested replacer is asked
	 * to add its own replacement set instead.
	 * @param toUnionTo the set that receives the code points
	 */
	void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
	    int32_t pos = 0;
	    while (pos < output.length()) {
	        const UChar32 cp = output.char32At(pos);
	        // Step over the whole code point (one or two code units).
	        pos += UTF_CHAR_LENGTH(cp);

	        UnicodeReplacer* nested = data->lookupReplacer(cp);
	        if (nested != NULL) {
	            nested->addReplacementSetTo(toUnionTo);
	            continue;
	        }
	        toUnionTo.add(cp);
	    }
	}

	/**
	* UnicodeFunctor API
	*/
	void StringReplacer::setData(const TransliterationRuleData* d) {
	data = d;
	int32_t i = 0;
	while (i<output.length()) {
	UChar32 c = output.char32At(i);
	UnicodeFunctor* f = data->lookup(c);
	if (f != NULL) {
	f->setData(data);
	}
	i += UTF_CHAR_LENGTH(c);
	}
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */

	//eof