| /* |
| ****************************************************************************** |
| * Copyright (c) 1996-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ****************************************************************************** |
| * File unorm.cpp |
| * |
| * Created by: Vladimir Weinstein 12052000 |
| * |
| * Modification history : |
| * |
| * Date Name Description |
| * 02/01/01 synwee Added normalization quickcheck enum and method. |
| * 02/12/01 synwee Commented out quickcheck util api has been approved |
| * Added private method for doing FCD checks |
| * 02/23/01 synwee Modified quickcheck and checkFCD to run through |
| * string for codepoints < 0x300 for the normalization |
| * mode NFC. |
| * 05/25/01+ Markus Scherer total rewrite, implement all normalization here |
| * instead of just wrappers around normlzr.cpp, |
| * load unorm.dat, support Unicode 3.1 with |
| * supplementary code points, etc. |
| * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| #include "unicode/udata.h" |
| #include "unicode/ustring.h" |
| #include "unicode/uiter.h" |
| #include "unicode/unorm.h" |
| #include "unicode/unorm2.h" |
| #include "normalizer2impl.h" |
| #include "unormimp.h" |
| #include "uprops.h" |
| #include "ustr_imp.h" |
| |
| U_NAMESPACE_USE |
| |
| /* quick check functions ---------------------------------------------------- */ |
| |
| U_CAPI UNormalizationCheckResult U_EXPORT2 |
| unorm_quickCheck(const UChar *src, |
| int32_t srcLength, |
| UNormalizationMode mode, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
| } |
| |
| U_CAPI UNormalizationCheckResult U_EXPORT2 |
| unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| if(options&UNORM_UNICODE_3_2) { |
| FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
| return unorm2_quickCheck( |
| reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
| src, srcLength, pErrorCode); |
| } else { |
| return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
| } |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| unorm_isNormalized(const UChar *src, int32_t srcLength, |
| UNormalizationMode mode, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
| } |
| |
| U_CAPI UBool U_EXPORT2 |
| unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| if(options&UNORM_UNICODE_3_2) { |
| FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
| return unorm2_isNormalized( |
| reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
| src, srcLength, pErrorCode); |
| } else { |
| return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); |
| } |
| } |
| |
| /* normalize() API ---------------------------------------------------------- */ |
| |
| /** Public API for normalizing. */ |
| U_CAPI int32_t U_EXPORT2 |
| unorm_normalize(const UChar *src, int32_t srcLength, |
| UNormalizationMode mode, int32_t options, |
| UChar *dest, int32_t destCapacity, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| if(options&UNORM_UNICODE_3_2) { |
| FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); |
| return unorm2_normalize( |
| reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), |
| src, srcLength, dest, destCapacity, pErrorCode); |
| } else { |
| return unorm2_normalize((const UNormalizer2 *)n2, |
| src, srcLength, dest, destCapacity, pErrorCode); |
| } |
| } |
| |
| |
| /* iteration functions ------------------------------------------------------ */ |
| |
| static int32_t |
| _iterate(UCharIterator *src, UBool forward, |
| UChar *dest, int32_t destCapacity, |
| const Normalizer2 *n2, |
| UBool doNormalize, UBool *pNeededToNormalize, |
| UErrorCode *pErrorCode) { |
| if(U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| if(pNeededToNormalize!=NULL) { |
| *pNeededToNormalize=FALSE; |
| } |
| if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { |
| return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
| } |
| |
| UnicodeString buffer; |
| UChar32 c; |
| if(forward) { |
| /* get one character and ignore its properties */ |
| buffer.append(uiter_next32(src)); |
| /* get all following characters until we see a boundary */ |
| while((c=uiter_next32(src))>=0) { |
| if(n2->hasBoundaryBefore(c)) { |
| /* back out the latest movement to stop at the boundary */ |
| src->move(src, -U16_LENGTH(c), UITER_CURRENT); |
| break; |
| } else { |
| buffer.append(c); |
| } |
| } |
| } else { |
| while((c=uiter_previous32(src))>=0) { |
| /* always write this character to the front of the buffer */ |
| buffer.insert(0, c); |
| /* stop if this just-copied character is a boundary */ |
| if(n2->hasBoundaryBefore(c)) { |
| break; |
| } |
| } |
| } |
| |
| UnicodeString destString(dest, 0, destCapacity); |
| if(buffer.length()>0 && doNormalize) { |
| n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); |
| if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { |
| *pNeededToNormalize= destString!=buffer; |
| } |
| return destString.length(); |
| } else { |
| /* just copy the source characters */ |
| return buffer.extract(dest, destCapacity, *pErrorCode); |
| } |
| } |
| |
| static int32_t |
| unorm_iterate(UCharIterator *src, UBool forward, |
| UChar *dest, int32_t destCapacity, |
| UNormalizationMode mode, int32_t options, |
| UBool doNormalize, UBool *pNeededToNormalize, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| if(options&UNORM_UNICODE_3_2) { |
| const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| FilteredNormalizer2 fn2(*n2, *uni32); |
| return _iterate(src, forward, dest, destCapacity, |
| &fn2, doNormalize, pNeededToNormalize, pErrorCode); |
| } |
| return _iterate(src, forward, dest, destCapacity, |
| n2, doNormalize, pNeededToNormalize, pErrorCode); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| unorm_previous(UCharIterator *src, |
| UChar *dest, int32_t destCapacity, |
| UNormalizationMode mode, int32_t options, |
| UBool doNormalize, UBool *pNeededToNormalize, |
| UErrorCode *pErrorCode) { |
| return unorm_iterate(src, FALSE, |
| dest, destCapacity, |
| mode, options, |
| doNormalize, pNeededToNormalize, |
| pErrorCode); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| unorm_next(UCharIterator *src, |
| UChar *dest, int32_t destCapacity, |
| UNormalizationMode mode, int32_t options, |
| UBool doNormalize, UBool *pNeededToNormalize, |
| UErrorCode *pErrorCode) { |
| return unorm_iterate(src, TRUE, |
| dest, destCapacity, |
| mode, options, |
| doNormalize, pNeededToNormalize, |
| pErrorCode); |
| } |
| |
| /* Concatenation of normalized strings -------------------------------------- */ |
| |
| static int32_t |
| _concatenate(const UChar *left, int32_t leftLength, |
| const UChar *right, int32_t rightLength, |
| UChar *dest, int32_t destCapacity, |
| const Normalizer2 *n2, |
| UErrorCode *pErrorCode) { |
| if(U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| if(destCapacity<0 || (dest==NULL && destCapacity>0) || |
| left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| /* check for overlapping right and destination */ |
| if( dest!=NULL && |
| ((right>=dest && right<(dest+destCapacity)) || |
| (rightLength>0 && dest>=right && dest<(right+rightLength))) |
| ) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| /* allow left==dest */ |
| UnicodeString destString; |
| if(left==dest) { |
| destString.setTo(dest, leftLength, destCapacity); |
| } else { |
| destString.setTo(dest, 0, destCapacity); |
| destString.append(left, leftLength); |
| } |
| return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). |
| extract(dest, destCapacity, *pErrorCode); |
| } |
| |
| U_CAPI int32_t U_EXPORT2 |
| unorm_concatenate(const UChar *left, int32_t leftLength, |
| const UChar *right, int32_t rightLength, |
| UChar *dest, int32_t destCapacity, |
| UNormalizationMode mode, int32_t options, |
| UErrorCode *pErrorCode) { |
| const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
| if(options&UNORM_UNICODE_3_2) { |
| const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); |
| if(U_FAILURE(*pErrorCode)) { |
| return 0; |
| } |
| FilteredNormalizer2 fn2(*n2, *uni32); |
| return _concatenate(left, leftLength, right, rightLength, |
| dest, destCapacity, &fn2, pErrorCode); |
| } |
| return _concatenate(left, leftLength, right, rightLength, |
| dest, destCapacity, n2, pErrorCode); |
| } |
| |
| #endif /* #if !UCONFIG_NO_NORMALIZATION */ |