src/third_party/icu/source/i18n/brktrans.cpp - cobalt - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 2008-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   05/11/2008  Andy Heninger  Port from Java
 **********************************************************************
 */

 #include "unicode/utypes.h"

 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

 #include "unicode/unifilt.h"
 #include "unicode/uchar.h"
 #include "unicode/uniset.h"
 #include "unicode/brkiter.h"
 #include "brktrans.h"
 #include "unicode/uchar.h"
 #include "cmemory.h"
 #include "uprops.h"
 #include "uinvchar.h"
 #include "util.h"
 #include "uvectr32.h"

 U_NAMESPACE_BEGIN

 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

 static const UChar SPACE       = 32;  // ' '


 /**
  * Constructs a break transliterator, passing the ID string
  * "Any-BreakInternal" and the supplied filter to the Transliterator
  * base class.  The insertion string starts out as a single space,
  * the break iterator is left unset, and the boundary stack is
  * allocated immediately.
  *
  * @param adoptedFilter  filter handed through to the base class
  *                       constructor.
  */
 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
     Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
     fInsertion(SPACE) {
         // The error code reported by the vector constructor is not inspected.
         UErrorCode ec = U_ZERO_ERROR;
         boundaries = new UVector32(ec);
         bi = NULL;
     }


 /**
  * Destructor.  Deletes the break iterator and the boundary stack and
  * clears both pointers.
  */
 BreakTransliterator::~BreakTransliterator() {
     delete bi;
     delete boundaries;
     boundaries = NULL;
     bi = NULL;
 }

 /**
  * Copy constructor.  Copies the insertion string, clones the source's
  * break iterator when it has one, and allocates a fresh, empty
  * boundary stack (the source's stack contents are not copied).
  */
 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
     Transliterator(o) {
         if (o.bi == NULL) {
             bi = NULL;
         } else {
             bi = o.bi->clone();
         }
         fInsertion = o.fInsertion;
         // The error code reported by the vector constructor is not inspected.
         UErrorCode ec = U_ZERO_ERROR;
         boundaries = new UVector32(ec);
     }


 /**
  * Transliterator API.  Returns a newly allocated copy of this object
  * produced by the copy constructor.
  */
 Transliterator* BreakTransliterator::clone(void) const {
     BreakTransliterator *copy = new BreakTransliterator(*this);
     return copy;
 }

 /**
  * Implements {@link Transliterator#handleTransliterate}.
  *
  * Walks the word-break boundaries that fall before offsets.limit and
  * inserts the insertion string at each boundary whose neighbouring
  * code points (the one before and the one at the boundary) both have
  * a letter or mark general category.
  *
  * @param text           the text to examine; insertions are made in place.
  * @param offsets        start and limit are read to bound the scan;
  *                       contextLimit, limit and start are updated to
  *                       account for the inserted text.
  * @param isIncremental  selects how offsets.start is set on return (see
  *                       the end of the function).
  */
 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                     UBool isIncremental ) const {

         UErrorCode status = U_ZERO_ERROR;

         // The break iterator is created lazily and cached on the object,
         // which requires casting away constness here.
         BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
         nonConstThis->getBreakIterator(); // Lazy-create it if necessary

         if (bi == NULL || boundaries == NULL) {
             // The break iterator could not be created, or the boundary
             // stack was never allocated.  Leave the text untouched rather
             // than dereference a null pointer; for a non-incremental call
             // report the run as consumed.
             if (!isIncremental) {
                 offsets.start = offsets.limit;
             }
             return;
         }

         boundaries->removeAllElements();
         UnicodeString sText = replaceableAsString(text);
         bi->setText(sText);
         bi->preceding(offsets.start);

         // To make things much easier, we will stack the boundaries, and then insert at the end.
         // generally, we won't need too many, since we will be filtered.

         const uint32_t letterOrMark = U_GC_L_MASK | U_GC_M_MASK;

         int32_t boundary;
         for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
             if (boundary == 0) continue;

             // HACK: Check to see that the preceding item was a letter or mark
             UChar32 cp = sText.char32At(boundary-1);
             int type = u_charType(cp);
             if ((U_MASK(type) & letterOrMark) == 0) continue;

             // ... and that the following item is one as well.
             cp = sText.char32At(boundary);
             type = u_charType(cp);
             if ((U_MASK(type) & letterOrMark) == 0) continue;

             boundaries->addElement(boundary, status);
         }

         int32_t delta = 0;
         int32_t lastBoundary = 0;

         if (boundaries->size() != 0) { // if we found something, adjust
             delta = boundaries->size() * fInsertion.length();
             lastBoundary = boundaries->lastElementi();

             // we do this from the end backwards, so that we don't have to keep updating.

             while (boundaries->size() > 0) {
                 boundary = boundaries->popi();
                 text.handleReplaceBetween(boundary, boundary, fInsertion);
             }
         }

         // Now fix up the return values
         offsets.contextLimit += delta;
         offsets.limit += delta;
         // NOTE(review): when isIncremental is true and no boundary was
         // recorded, lastBoundary + delta evaluates to 0 -- confirm that
         // this is the intended start position.
         offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;

         // TODO:  do something with U_FAILURE(status);
         //        (need to look at transliterators overall, not just here.)
 }

 //
 //  getInsertion()      Returns a reference to the string that is
 //                      inserted at each selected boundary.
 //
 const UnicodeString &BreakTransliterator::getInsertion() const {
     return this->fInsertion;
 }

 //
 //  setInsertion()
 //
 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
     this->fInsertion = insertion;
 }

 //
 //  getBreakIterator     Returns the cached break iterator, creating a
 //                       word break iterator on first use.  Copied from
 //                       Java, probably better to just create it in the
 //                       constructor.  The error code from creation is
 //                       not inspected.
 //
 BreakIterator *BreakTransliterator::getBreakIterator() {
     if (bi != NULL) {
         return bi;
     }
     // Note:  Thai breaking behavior is universal, it is not
     //        tied to the Thai locale.
     UErrorCode ec = U_ZERO_ERROR;
     bi = BreakIterator::createWordInstance(Locale::getEnglish(), ec);
     return bi;
 }

 //
 //   replaceableAsString   Hack to let break iterators work
 //                         on the replaceable text from transliterators.
 //                         When the Replaceable is actually a UnicodeString
 //                         it is copied directly; otherwise its full
 //                         contents are extracted into a new string.
 //
 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
     UnicodeString result;
     UnicodeString *asString = dynamic_cast<UnicodeString *>(&r);
     if (asString == NULL) {
         r.extractBetween(0, r.length(), result);
     } else {
         result = *asString;
     }
     return result;
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
	/*
	**********************************************************************
	* Copyright (C) 2008-2010, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 05/11/2008 Andy Heninger Port from Java
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION

	#include "unicode/unifilt.h"
	#include "unicode/uchar.h"
	#include "unicode/uniset.h"
	#include "unicode/brkiter.h"
	#include "brktrans.h"
	#include "unicode/uchar.h"
	#include "cmemory.h"
	#include "uprops.h"
	#include "uinvchar.h"
	#include "util.h"
	#include "uvectr32.h"

	U_NAMESPACE_BEGIN

	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

	static const UChar SPACE = 32; // ' '


	/**
	 * Constructs a break transliterator, passing the ID string
	 * "Any-BreakInternal" and the supplied filter to the Transliterator
	 * base class.  The insertion string starts out as a single space,
	 * the break iterator is left unset, and the boundary stack is
	 * allocated immediately.
	 *
	 * @param adoptedFilter  filter handed through to the base class
	 *                       constructor.
	 */
	BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
	    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
	    fInsertion(SPACE) {
	        // The error code reported by the vector constructor is not inspected.
	        UErrorCode ec = U_ZERO_ERROR;
	        boundaries = new UVector32(ec);
	        bi = NULL;
	    }


	/**
	 * Destructor.  Deletes the break iterator and the boundary stack and
	 * clears both pointers.
	 */
	BreakTransliterator::~BreakTransliterator() {
	    delete bi;
	    delete boundaries;
	    boundaries = NULL;
	    bi = NULL;
	}

	/**
	 * Copy constructor.  Copies the insertion string, clones the source's
	 * break iterator when it has one, and allocates a fresh, empty
	 * boundary stack (the source's stack contents are not copied).
	 */
	BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
	    Transliterator(o) {
	        if (o.bi == NULL) {
	            bi = NULL;
	        } else {
	            bi = o.bi->clone();
	        }
	        fInsertion = o.fInsertion;
	        // The error code reported by the vector constructor is not inspected.
	        UErrorCode ec = U_ZERO_ERROR;
	        boundaries = new UVector32(ec);
	    }


	/**
	 * Transliterator API.  Returns a newly allocated copy of this object
	 * produced by the copy constructor.
	 */
	Transliterator* BreakTransliterator::clone(void) const {
	    BreakTransliterator *copy = new BreakTransliterator(*this);
	    return copy;
	}

	/**
	 * Implements {@link Transliterator#handleTransliterate}.
	 *
	 * Walks the word-break boundaries that fall before offsets.limit and
	 * inserts the insertion string at each boundary whose neighbouring
	 * code points (the one before and the one at the boundary) both have
	 * a letter or mark general category.
	 *
	 * @param text           the text to examine; insertions are made in place.
	 * @param offsets        start and limit are read to bound the scan;
	 *                       contextLimit, limit and start are updated to
	 *                       account for the inserted text.
	 * @param isIncremental  selects how offsets.start is set on return (see
	 *                       the end of the function).
	 */
	void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	                                              UBool isIncremental ) const {

	    UErrorCode status = U_ZERO_ERROR;

	    // The break iterator is created lazily and cached on the object,
	    // which requires casting away constness here.
	    BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
	    nonConstThis->getBreakIterator(); // Lazy-create it if necessary

	    if (bi == NULL || boundaries == NULL) {
	        // The break iterator could not be created, or the boundary
	        // stack was never allocated.  Leave the text untouched rather
	        // than dereference a null pointer; for a non-incremental call
	        // report the run as consumed.
	        if (!isIncremental) {
	            offsets.start = offsets.limit;
	        }
	        return;
	    }

	    boundaries->removeAllElements();
	    UnicodeString sText = replaceableAsString(text);
	    bi->setText(sText);
	    bi->preceding(offsets.start);

	    // To make things much easier, we will stack the boundaries, and then insert at the end.
	    // generally, we won't need too many, since we will be filtered.

	    const uint32_t letterOrMark = U_GC_L_MASK | U_GC_M_MASK;

	    int32_t boundary;
	    for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
	        if (boundary == 0) continue;

	        // HACK: Check to see that the preceding item was a letter or mark
	        UChar32 cp = sText.char32At(boundary-1);
	        int type = u_charType(cp);
	        if ((U_MASK(type) & letterOrMark) == 0) continue;

	        // ... and that the following item is one as well.
	        cp = sText.char32At(boundary);
	        type = u_charType(cp);
	        if ((U_MASK(type) & letterOrMark) == 0) continue;

	        boundaries->addElement(boundary, status);
	    }

	    int32_t delta = 0;
	    int32_t lastBoundary = 0;

	    if (boundaries->size() != 0) { // if we found something, adjust
	        delta = boundaries->size() * fInsertion.length();
	        lastBoundary = boundaries->lastElementi();

	        // we do this from the end backwards, so that we don't have to keep updating.

	        while (boundaries->size() > 0) {
	            boundary = boundaries->popi();
	            text.handleReplaceBetween(boundary, boundary, fInsertion);
	        }
	    }

	    // Now fix up the return values
	    offsets.contextLimit += delta;
	    offsets.limit += delta;
	    // NOTE(review): when isIncremental is true and no boundary was
	    // recorded, lastBoundary + delta evaluates to 0 -- confirm that
	    // this is the intended start position.
	    offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;

	    // TODO: do something with U_FAILURE(status);
	    // (need to look at transliterators overall, not just here.)
	}

	//
	// getInsertion()      Returns a reference to the string that is
	//                     inserted at each selected boundary.
	//
	const UnicodeString &BreakTransliterator::getInsertion() const {
	    return this->fInsertion;
	}

	//
	// setInsertion()
	//
	void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
	this->fInsertion = insertion;
	}

	//
	// getBreakIterator     Returns the cached break iterator, creating a
	//                      word break iterator on first use.  Copied from
	//                      Java, probably better to just create it in the
	//                      constructor.  The error code from creation is
	//                      not inspected.
	//
	BreakIterator *BreakTransliterator::getBreakIterator() {
	    if (bi != NULL) {
	        return bi;
	    }
	    // Note: Thai breaking behavior is universal, it is not
	    // tied to the Thai locale.
	    UErrorCode ec = U_ZERO_ERROR;
	    bi = BreakIterator::createWordInstance(Locale::getEnglish(), ec);
	    return bi;
	}

	//
	// replaceableAsString   Hack to let break iterators work
	//                       on the replaceable text from transliterators.
	//                       When the Replaceable is actually a UnicodeString
	//                       it is copied directly; otherwise its full
	//                       contents are extracted into a new string.
	//
	UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
	    UnicodeString s;
	    // Pointer form of dynamic_cast: yields NULL (instead of throwing)
	    // when r is not a UnicodeString.
	    UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
	    if (rs != NULL) {
	        s = *rs;
	    } else {
	        r.extractBetween(0, r.length(), s);
	    }
	    return s;
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */