src/third_party/icu/source/i18n/anytrans.cpp - cobalt - Git at Google

 /*
 *****************************************************************
 * Copyright (c) 2002-2008, International Business Machines Corporation
 * and others.  All Rights Reserved.
 *****************************************************************
 * Date        Name        Description
 * 06/06/2002  aliu        Creation.
 *****************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION

 #include "unicode/uobject.h"
 #include "unicode/uscript.h"
 #include "nultrans.h"
 #include "anytrans.h"
 #include "uvector.h"
 #include "tridpars.h"
 #include "hash.h"
 #include "putilimp.h"
 #include "uinvchar.h"

 //------------------------------------------------------------
 // Constants

 static const UChar TARGET_SEP = 45; // '-'
 static const UChar VARIANT_SEP = 47; // '/'
 static const UChar ANY[] = {65,110,121,0}; // "Any"
 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"

 //------------------------------------------------------------

 U_CDECL_BEGIN
 /**
  * Deleter function for Transliterator*.
  */
 static void U_CALLCONV
 _deleteTransliterator(void *obj) {
     delete (U_NAMESPACE_QUALIFIER Transliterator*) obj;
 }
 U_CDECL_END

 //------------------------------------------------------------

 U_NAMESPACE_BEGIN

 //------------------------------------------------------------
 // ScriptRunIterator

 /**
  * Returns a series of ranges corresponding to scripts. They will be
  * of the form:
  *
  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
  * |            |          - first run (start, limit)
  *          |           |  - second run (start, limit)
  *
  * That is, the runs will overlap. The reason for this is so that a
  * transliterator can consider common characters both before and after
  * the scripts.
  */
 class ScriptRunIterator : public UMemory {
 private:
     const Replaceable& text;
     int32_t textStart;
     int32_t textLimit;

 public:
     /**
      * The code of the current run, valid after next() returns.  May
      * be USCRIPT_INVALID_CODE if and only if the entire text is
      * COMMON/INHERITED.
      */
     UScriptCode scriptCode;

     /**
      * The start of the run, inclusive, valid after next() returns.
      */
     int32_t start;

     /**
      * The end of the run, exclusive, valid after next() returns.
      */
     int32_t limit;

     /**
      * Constructs a run iterator over the given text from start
      * (inclusive) to limit (exclusive).
      */
     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

     /**
      * Returns TRUE if there are any more runs.  TRUE is always
      * returned at least once.  Upon return, the caller should
      * examine scriptCode, start, and limit.
      */
     UBool next();

     /**
      * Adjusts internal indices for a change in the limit index of the
      * given delta.  A positive delta means the limit has increased.
      */
     void adjustLimit(int32_t delta);

 private:
     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
 };

 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                      int32_t myStart, int32_t myLimit) :
     text(theText)
 {
     textStart = myStart;
     textLimit = myLimit;
     limit = myStart;
 }

 UBool ScriptRunIterator::next() {
     UChar32 ch;
     UScriptCode s;
     UErrorCode ec = U_ZERO_ERROR;

     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
     start = limit;

     // Are we done?
     if (start == textLimit) {
         return FALSE;
     }

     // Move start back to include adjacent COMMON or INHERITED
     // characters
     while (start > textStart) {
         ch = text.char32At(start - 1); // look back
         s = uscript_getScript(ch, &ec);
         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
             --start;
         } else {
             break;
         }
     }

     // Move limit ahead to include COMMON, INHERITED, and characters
     // of the current script.
     while (limit < textLimit) {
         ch = text.char32At(limit); // look ahead
         s = uscript_getScript(ch, &ec);
         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
             if (scriptCode == USCRIPT_INVALID_CODE) {
                 scriptCode = s;
             } else if (s != scriptCode) {
                 break;
             }
         }
         ++limit;
     }

     // Return TRUE even if the entire text is COMMON / INHERITED, in
     // which case scriptCode will be USCRIPT_INVALID_CODE.
     return TRUE;
 }

 void ScriptRunIterator::adjustLimit(int32_t delta) {
     limit += delta;
     textLimit += delta;
 }

 //------------------------------------------------------------
 // AnyTransliterator

 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)

 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                      const UnicodeString& theTarget,
                                      const UnicodeString& theVariant,
                                      UScriptCode theTargetScript,
                                      UErrorCode& ec) :
     Transliterator(id, NULL),
     targetScript(theTargetScript)
 {
     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
     if (U_FAILURE(ec)) {
         return;
     }
     uhash_setValueDeleter(cache, _deleteTransliterator);

     target = theTarget;
     if (theVariant.length() > 0) {
         target.append(VARIANT_SEP).append(theVariant);
     }
 }

 AnyTransliterator::~AnyTransliterator() {
     uhash_close(cache);
 }

 /**
  * Copy constructor.
  */
 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
     Transliterator(o),
     target(o.target),
     targetScript(o.targetScript)
 {
     // Don't copy the cache contents
     UErrorCode ec = U_ZERO_ERROR;
     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
     if (U_FAILURE(ec)) {
         return;
     }
     uhash_setValueDeleter(cache, _deleteTransliterator);
 }

 /**
  * Transliterator API.
  */
 Transliterator* AnyTransliterator::clone() const {
     return new AnyTransliterator(*this);
 }

 /**
  * Implements {@link Transliterator#handleTransliterate}.
  */
 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                             UBool isIncremental) const {
     int32_t allStart = pos.start;
     int32_t allLimit = pos.limit;

     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);

     while (it.next()) {
         // Ignore runs in the ante context
         if (it.limit <= allStart) continue;

         // Try to instantiate transliterator from it.scriptCode to
         // our target or target/variant
         Transliterator* t = getTransliterator(it.scriptCode);

         if (t == NULL) {
             // We have no transliterator.  Do nothing, but keep
             // pos.start up to date.
             pos.start = it.limit;
             continue;
         }

         // If the run end is before the transliteration limit, do
         // a non-incremental transliteration.  Otherwise do an
         // incremental one.
         UBool incremental = isIncremental && (it.limit >= allLimit);

         pos.start = uprv_max(allStart, it.start);
         pos.limit = uprv_min(allLimit, it.limit);
         int32_t limit = pos.limit;
         t->filteredTransliterate(text, pos, incremental);
         int32_t delta = pos.limit - limit;
         allLimit += delta;
         it.adjustLimit(delta);

         // We're done if we enter the post context
         if (it.limit >= allLimit) break;
     }

     // Restore limit.  pos.start is fine where the last transliterator
     // left it, or at the end of the last run.
     pos.limit = allLimit;
 }

 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
         return NULL;
     }

     Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
     if (t == NULL) {
         UErrorCode ec = U_ZERO_ERROR;
         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
         UnicodeString id(sourceName);
         id.append(TARGET_SEP).append(target);

         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
         if (U_FAILURE(ec) || t == NULL) {
             delete t;

             // Try to pivot around Latin, our most common script
             id = sourceName;
             id.append(LATIN_PIVOT).append(target);
             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
             if (U_FAILURE(ec) || t == NULL) {
                 delete t;
                 t = NULL;
             }
         }

         if (t != NULL) {
             uhash_iput(cache, (int32_t) source, t, &ec);
         }
     }

     return t;
 }

 /**
  * Return the script code for a given name, or -1 if not found.
  */
 static UScriptCode scriptNameToCode(const UnicodeString& name) {
     char buf[128];
     UScriptCode code;
     UErrorCode ec = U_ZERO_ERROR;
     int32_t nameLen = name.length();
     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);

     if (isInvariant) {
         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
         buf[127] = 0;   // Make sure that we NULL terminate the string.
     }
     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
     {
         code = USCRIPT_INVALID_CODE;
     }
     return code;
 }

 /**
  * Registers standard transliterators with the system.  Called by
  * Transliterator during initialization.  Scan all current targets and
  * register those that are scripts T as Any-T/V.
  */
 void AnyTransliterator::registerIDs() {

     UErrorCode ec = U_ZERO_ERROR;
     Hashtable seen(TRUE, ec);

     int32_t sourceCount = Transliterator::_countAvailableSources();
     for (int32_t s=0; s<sourceCount; ++s) {
         UnicodeString source;
         Transliterator::_getAvailableSource(s, source);

         // Ignore the "Any" source
         if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;

         int32_t targetCount = Transliterator::_countAvailableTargets(source);
         for (int32_t t=0; t<targetCount; ++t) {
             UnicodeString target;
             Transliterator::_getAvailableTarget(t, source, target);

             // Only process each target once
             if (seen.geti(target) != 0) continue;
             ec = U_ZERO_ERROR;
             seen.puti(target, 1, ec);

             // Get the script code for the target.  If not a script, ignore.
             UScriptCode targetScript = scriptNameToCode(target);
             if (targetScript == USCRIPT_INVALID_CODE) continue;

             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
             // assert(variantCount >= 1);
             for (int32_t v=0; v<variantCount; ++v) {
                 UnicodeString variant;
                 Transliterator::_getAvailableVariant(v, source, target, variant);

                 UnicodeString id;
                 TransliteratorIDParser::STVtoID(ANY, target, variant, id);
                 ec = U_ZERO_ERROR;
                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
                                                              targetScript, ec);
                 if (U_FAILURE(ec)) {
                     delete t;
                 } else {
                     Transliterator::_registerInstance(t);
                     Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
                 }
             }
         }
     }
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

 //eof
	/*
	*****************************************************************
	* Copyright (c) 2002-2008, International Business Machines Corporation
	* and others. All Rights Reserved.
	*****************************************************************
	* Date Name Description
	* 06/06/2002 aliu Creation.
	*****************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION

	#include "unicode/uobject.h"
	#include "unicode/uscript.h"
	#include "nultrans.h"
	#include "anytrans.h"
	#include "uvector.h"
	#include "tridpars.h"
	#include "hash.h"
	#include "putilimp.h"
	#include "uinvchar.h"

	//------------------------------------------------------------
	// Constants

	static const UChar TARGET_SEP = 45; // '-'
	static const UChar VARIANT_SEP = 47; // '/'
	static const UChar ANY[] = {65,110,121,0}; // "Any"
	static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
	static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"

	//------------------------------------------------------------

	U_CDECL_BEGIN
	/**
	* Deleter function for Transliterator*.
	*/
	static void U_CALLCONV
	_deleteTransliterator(void *obj) {
	delete (U_NAMESPACE_QUALIFIER Transliterator*) obj;
	}
	U_CDECL_END

	//------------------------------------------------------------

	U_NAMESPACE_BEGIN

	//------------------------------------------------------------
	// ScriptRunIterator

	/**
	* Returns a series of ranges corresponding to scripts. They will be
	* of the form:
	*
	* ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
	* \| \| - first run (start, limit)
	* \| \| - second run (start, limit)
	*
	* That is, the runs will overlap. The reason for this is so that a
	* transliterator can consider common characters both before and after
	* the scripts.
	*/
	class ScriptRunIterator : public UMemory {
	private:
	const Replaceable& text;
	int32_t textStart;
	int32_t textLimit;

	public:
	/**
	* The code of the current run, valid after next() returns. May
	* be USCRIPT_INVALID_CODE if and only if the entire text is
	* COMMON/INHERITED.
	*/
	UScriptCode scriptCode;

	/**
	* The start of the run, inclusive, valid after next() returns.
	*/
	int32_t start;

	/**
	* The end of the run, exclusive, valid after next() returns.
	*/
	int32_t limit;

	/**
	* Constructs a run iterator over the given text from start
	* (inclusive) to limit (exclusive).
	*/
	ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

	/**
	* Returns TRUE if there are any more runs. TRUE is always
	* returned at least once. Upon return, the caller should
	* examine scriptCode, start, and limit.
	*/
	UBool next();

	/**
	* Adjusts internal indices for a change in the limit index of the
	* given delta. A positive delta means the limit has increased.
	*/
	void adjustLimit(int32_t delta);

	private:
	ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
	ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
	};

	ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
	int32_t myStart, int32_t myLimit) :
	text(theText)
	{
	textStart = myStart;
	textLimit = myLimit;
	limit = myStart;
	}

	UBool ScriptRunIterator::next() {
	UChar32 ch;
	UScriptCode s;
	UErrorCode ec = U_ZERO_ERROR;

	scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
	start = limit;

	// Are we done?
	if (start == textLimit) {
	return FALSE;
	}

	// Move start back to include adjacent COMMON or INHERITED
	// characters
	while (start > textStart) {
	ch = text.char32At(start - 1); // look back
	s = uscript_getScript(ch, &ec);
	if (s == USCRIPT_COMMON \|\| s == USCRIPT_INHERITED) {
	--start;
	} else {
	break;
	}
	}

	// Move limit ahead to include COMMON, INHERITED, and characters
	// of the current script.
	while (limit < textLimit) {
	ch = text.char32At(limit); // look ahead
	s = uscript_getScript(ch, &ec);
	if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
	if (scriptCode == USCRIPT_INVALID_CODE) {
	scriptCode = s;
	} else if (s != scriptCode) {
	break;
	}
	}
	++limit;
	}

	// Return TRUE even if the entire text is COMMON / INHERITED, in
	// which case scriptCode will be USCRIPT_INVALID_CODE.
	return TRUE;
	}

	void ScriptRunIterator::adjustLimit(int32_t delta) {
	limit += delta;
	textLimit += delta;
	}

	//------------------------------------------------------------
	// AnyTransliterator

	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)

	AnyTransliterator::AnyTransliterator(const UnicodeString& id,
	const UnicodeString& theTarget,
	const UnicodeString& theVariant,
	UScriptCode theTargetScript,
	UErrorCode& ec) :
	Transliterator(id, NULL),
	targetScript(theTargetScript)
	{
	cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
	if (U_FAILURE(ec)) {
	return;
	}
	uhash_setValueDeleter(cache, _deleteTransliterator);

	target = theTarget;
	if (theVariant.length() > 0) {
	target.append(VARIANT_SEP).append(theVariant);
	}
	}

	AnyTransliterator::~AnyTransliterator() {
	uhash_close(cache);
	}

	/**
	* Copy constructor.
	*/
	AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
	Transliterator(o),
	target(o.target),
	targetScript(o.targetScript)
	{
	// Don't copy the cache contents
	UErrorCode ec = U_ZERO_ERROR;
	cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
	if (U_FAILURE(ec)) {
	return;
	}
	uhash_setValueDeleter(cache, _deleteTransliterator);
	}

	/**
	* Transliterator API.
	*/
	Transliterator* AnyTransliterator::clone() const {
	return new AnyTransliterator(*this);
	}

	/**
	* Implements {@link Transliterator#handleTransliterate}.
	*/
	void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
	UBool isIncremental) const {
	int32_t allStart = pos.start;
	int32_t allLimit = pos.limit;

	ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);

	while (it.next()) {
	// Ignore runs in the ante context
	if (it.limit <= allStart) continue;

	// Try to instantiate transliterator from it.scriptCode to
	// our target or target/variant
	Transliterator* t = getTransliterator(it.scriptCode);

	if (t == NULL) {
	// We have no transliterator. Do nothing, but keep
	// pos.start up to date.
	pos.start = it.limit;
	continue;
	}

	// If the run end is before the transliteration limit, do
	// a non-incremental transliteration. Otherwise do an
	// incremental one.
	UBool incremental = isIncremental && (it.limit >= allLimit);

	pos.start = uprv_max(allStart, it.start);
	pos.limit = uprv_min(allLimit, it.limit);
	int32_t limit = pos.limit;
	t->filteredTransliterate(text, pos, incremental);
	int32_t delta = pos.limit - limit;
	allLimit += delta;
	it.adjustLimit(delta);

	// We're done if we enter the post context
	if (it.limit >= allLimit) break;
	}

	// Restore limit. pos.start is fine where the last transliterator
	// left it, or at the end of the last run.
	pos.limit = allLimit;
	}

	Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

	if (source == targetScript \|\| source == USCRIPT_INVALID_CODE) {
	return NULL;
	}

	Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
	if (t == NULL) {
	UErrorCode ec = U_ZERO_ERROR;
	UnicodeString sourceName(uscript_getName(source), -1, US_INV);
	UnicodeString id(sourceName);
	id.append(TARGET_SEP).append(target);

	t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
	if (U_FAILURE(ec) \|\| t == NULL) {
	delete t;

	// Try to pivot around Latin, our most common script
	id = sourceName;
	id.append(LATIN_PIVOT).append(target);
	t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
	if (U_FAILURE(ec) \|\| t == NULL) {
	delete t;
	t = NULL;
	}
	}

	if (t != NULL) {
	uhash_iput(cache, (int32_t) source, t, &ec);
	}
	}

	return t;
	}

	/**
	* Return the script code for a given name, or -1 if not found.
	*/
	static UScriptCode scriptNameToCode(const UnicodeString& name) {
	char buf[128];
	UScriptCode code;
	UErrorCode ec = U_ZERO_ERROR;
	int32_t nameLen = name.length();
	UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);

	if (isInvariant) {
	name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
	buf[127] = 0; // Make sure that we NULL terminate the string.
	}
	if (!isInvariant \|\| uscript_getCode(buf, &code, 1, &ec) != 1 \|\| U_FAILURE(ec))
	{
	code = USCRIPT_INVALID_CODE;
	}
	return code;
	}

	/**
	* Registers standard transliterators with the system. Called by
	* Transliterator during initialization. Scan all current targets and
	* register those that are scripts T as Any-T/V.
	*/
	void AnyTransliterator::registerIDs() {

	UErrorCode ec = U_ZERO_ERROR;
	Hashtable seen(TRUE, ec);

	int32_t sourceCount = Transliterator::_countAvailableSources();
	for (int32_t s=0; s<sourceCount; ++s) {
	UnicodeString source;
	Transliterator::_getAvailableSource(s, source);

	// Ignore the "Any" source
	if (source.caseCompare(ANY, 0 /U_FOLD_CASE_DEFAULT/) == 0) continue;

	int32_t targetCount = Transliterator::_countAvailableTargets(source);
	for (int32_t t=0; t<targetCount; ++t) {
	UnicodeString target;
	Transliterator::_getAvailableTarget(t, source, target);

	// Only process each target once
	if (seen.geti(target) != 0) continue;
	ec = U_ZERO_ERROR;
	seen.puti(target, 1, ec);

	// Get the script code for the target. If not a script, ignore.
	UScriptCode targetScript = scriptNameToCode(target);
	if (targetScript == USCRIPT_INVALID_CODE) continue;

	int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
	// assert(variantCount >= 1);
	for (int32_t v=0; v<variantCount; ++v) {
	UnicodeString variant;
	Transliterator::_getAvailableVariant(v, source, target, variant);

	UnicodeString id;
	TransliteratorIDParser::STVtoID(ANY, target, variant, id);
	ec = U_ZERO_ERROR;
	AnyTransliterator* t = new AnyTransliterator(id, target, variant,
	targetScript, ec);
	if (U_FAILURE(ec)) {
	delete t;
	} else {
	Transliterator::_registerInstance(t);
	Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
	}
	}
	}
	}
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */

	//eof