// blob: 4f0a2b6dceac139b63b7c05e9b4ef1ba516f9ef7 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationsets.h
*
* created on: 2013feb09
* created by: Markus W. Scherer
*/
#ifndef __COLLATIONSETS_H__
#define __COLLATIONSETS_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/uniset.h"
#include "collation.h"
U_NAMESPACE_BEGIN
// Forward declaration: this header only refers to CollationData through pointers.
struct CollationData;
/**
* Finds the set of characters and strings that sort differently in the tailoring
* from the base data.
*
* Every mapping in the tailoring needs to be compared to the base,
* because some mappings are copied for optimization, and
* all contractions for a character are copied if any contractions for that character
* are added, modified or removed.
*
* It might be simpler to re-parse the rule string, but:
* - That would require duplicating some of the from-rules builder code.
* - That would make the runtime code depend on the builder.
* - That would only work if we have the rule string, and we allow users to
* omit the rule string from data files.
*/
class TailoredSet : public UMemory {
public:
    /**
     * Remembers the caller's set pointer. The remaining pointer members start
     * out as NULL and the internal error code starts as U_ZERO_ERROR.
     *
     * @param t set pointer stored as-is in the `tailored` member
     */
    TailoredSet(UnicodeSet *t)
            : data(NULL),
              baseData(NULL),
              tailored(t),
              suffix(NULL),
              errorCode(U_ZERO_ERROR) {}

    /**
     * Entry point; the body lives outside this header.
     *
     * @param d         collation data to process
     * @param errorCode caller's error code, passed by reference
     */
    void forData(const CollationData *d, UErrorCode &errorCode);

    /**
     * @internal Public only so that a callback can reach it.
     * @return U_SUCCESS(errorCode) in C++ (the Java counterpart returns void)
     */
    UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);

private:
    // Helpers; all of them are defined outside this header.
    void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
    void comparePrefixes(UChar32 c, const UChar *p, const UChar *q);
    void compareContractions(UChar32 c, const UChar *p, const UChar *q);

    void addPrefixes(const CollationData *d, UChar32 c, const UChar *p);
    void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
    void addContractions(UChar32 c, const UChar *p);
    void addSuffix(UChar32 c, const UnicodeString &sfx);
    void add(UChar32 c);

    /**
     * Copies pfx and reverses the copy, because prefixes are stored reversed
     * in the data structure.
     */
    void setPrefix(const UnicodeString &pfx) {
        (unreversedPrefix = pfx).reverse();
    }

    /** Removes the contents of the stored prefix. */
    void resetPrefix() {
        unreversedPrefix.remove();
    }

    // NULL after construction; presumably assigned in forData() -- confirm in the .cpp.
    const CollationData *data;
    const CollationData *baseData;

    // The pointer handed to the constructor.
    UnicodeSet *tailored;

    // Written by setPrefix() and cleared by resetPrefix().
    UnicodeString unreversedPrefix;

    // NULL after construction.
    const UnicodeString *suffix;

    // U_ZERO_ERROR after construction.
    UErrorCode errorCode;
};
class ContractionsAndExpansions : public UMemory {
public:
    /** Abstract receiver for single CEs and for CE sequences (expansions). */
    class CESink : public UMemory {
    public:
        virtual ~CESink();

        /** Receives one 64-bit CE. */
        virtual void handleCE(int64_t ce) = 0;

        /** Receives an array of `length` 64-bit CEs. */
        virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
    };

    /**
     * Stores the arguments in the members of matching purpose. `data` and
     * `suffix` start as NULL, `checkTailored` as 0 and `errorCode` as
     * U_ZERO_ERROR; the `ces` array is not initialized here.
     *
     * @param con      stored in `contractions`
     * @param exp      stored in `expansions`
     * @param s        stored in `sink`
     * @param prefixes stored in `addPrefixes`
     */
    ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
            : data(NULL),
              contractions(con),
              expansions(exp),
              sink(s),
              addPrefixes(prefixes),
              checkTailored(0),
              suffix(NULL),
              errorCode(U_ZERO_ERROR) {}

    // Entry points; both are defined outside this header.
    void forData(const CollationData *d, UErrorCode &errorCode);
    void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);

    // Everything from here on is @internal and public only for access by a callback.

    void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
    void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
    void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);

    void addExpansions(UChar32 start, UChar32 end);
    void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);

    /**
     * Copies pfx and reverses the copy, because prefixes are stored reversed
     * in the data structure.
     */
    void setPrefix(const UnicodeString &pfx) {
        (unreversedPrefix = pfx).reverse();
    }

    /** Removes the contents of the stored prefix. */
    void resetPrefix() {
        unreversedPrefix.remove();
    }

    // NULL after construction.
    const CollationData *data;

    // Pointers received by the constructor, kept as-is.
    UnicodeSet *contractions;
    UnicodeSet *expansions;
    CESink *sink;

    // Flag received by the constructor.
    UBool addPrefixes;

    // 0 after construction; -1: collected tailored, +1: exclude tailored.
    int8_t checkTailored;

    UnicodeSet tailored;
    UnicodeSet ranges;

    // Written by setPrefix() and cleared by resetPrefix().
    UnicodeString unreversedPrefix;

    // NULL after construction.
    const UnicodeString *suffix;

    // Buffer of Collation::MAX_EXPANSION_LENGTH 64-bit values; left uninitialized by the constructor.
    int64_t ces[Collation::MAX_EXPANSION_LENGTH];

    // U_ZERO_ERROR after construction.
    UErrorCode errorCode;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION
#endif // __COLLATIONSETS_H__