// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatawriter.cpp
*
* created on: 2013aug06
* created by: Markus W. Scherer
*/
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #if defined(STARBOARD) |
| #include "starboard/client_porting/poem/assert_poem.h" |
| #include "starboard/client_porting/poem/string_poem.h" |
| #endif // defined(STARBOARD) |
| #include "unicode/tblcoll.h" |
| #include "unicode/udata.h" |
| #include "unicode/uniset.h" |
| #include "cmemory.h" |
| #include "collationdata.h" |
| #include "collationdatabuilder.h" |
| #include "collationdatareader.h" |
| #include "collationdatawriter.h" |
| #include "collationfastlatin.h" |
| #include "collationsettings.h" |
| #include "collationtailoring.h" |
| #include "uassert.h" |
| #include "ucmndata.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| uint8_t * |
| RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { |
| if(U_FAILURE(errorCode)) { return NULL; } |
| LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); |
| if(buffer.isNull()) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| length = cloneBinary(buffer.getAlias(), 20000, errorCode); |
| if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
| if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { |
| errorCode = U_MEMORY_ALLOCATION_ERROR; |
| return NULL; |
| } |
| errorCode = U_ZERO_ERROR; |
| length = cloneBinary(buffer.getAlias(), length, errorCode); |
| } |
| if(U_FAILURE(errorCode)) { return NULL; } |
| return buffer.orphan(); |
| } |
| |
| int32_t |
| RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { |
| int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; |
| return CollationDataWriter::writeTailoring( |
| *tailoring, *settings, indexes, dest, capacity, |
| errorCode); |
| } |
| |
| static const UDataInfo dataInfo = { |
| sizeof(UDataInfo), |
| 0, |
| |
| U_IS_BIG_ENDIAN, |
| U_CHARSET_FAMILY, |
| U_SIZEOF_UCHAR, |
| 0, |
| |
| { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" |
| { 5, 0, 0, 0 }, // formatVersion |
| { 6, 3, 0, 0 } // dataVersion |
| }; |
| |
| int32_t |
| CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, |
| const void *rootElements, int32_t rootElementsLength, |
| int32_t indexes[], uint8_t *dest, int32_t capacity, |
| UErrorCode &errorCode) { |
| return write(TRUE, NULL, |
| data, settings, |
| rootElements, rootElementsLength, |
| indexes, dest, capacity, errorCode); |
| } |
| |
| int32_t |
| CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, |
| int32_t indexes[], uint8_t *dest, int32_t capacity, |
| UErrorCode &errorCode) { |
| return write(FALSE, t.version, |
| *t.data, settings, |
| NULL, 0, |
| indexes, dest, capacity, errorCode); |
| } |
| |
| int32_t |
| CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, |
| const CollationData &data, const CollationSettings &settings, |
| const void *rootElements, int32_t rootElementsLength, |
| int32_t indexes[], uint8_t *dest, int32_t capacity, |
| UErrorCode &errorCode) { |
| if(U_FAILURE(errorCode)) { return 0; } |
| if(capacity < 0 || (capacity > 0 && dest == NULL)) { |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| |
| // Figure out which data items to write before settling on |
| // the indexes length and writing offsets. |
| // For any data item, we need to write the start and limit offsets, |
| // so the indexes length must be at least index-of-start-offset + 2. |
| int32_t indexesLength; |
| UBool hasMappings; |
| UnicodeSet unsafeBackwardSet; |
| const CollationData *baseData = data.base; |
| |
| int32_t fastLatinVersion; |
| if(data.fastLatinTable != NULL) { |
| fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16; |
| } else { |
| fastLatinVersion = 0; |
| } |
| int32_t fastLatinTableLength = 0; |
| |
| if(isBase) { |
| // For the root collator, we write an even number of indexes |
| // so that we start with an 8-aligned offset. |
| indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; |
| U_ASSERT(settings.reorderCodesLength == 0); |
| hasMappings = TRUE; |
| unsafeBackwardSet = *data.unsafeBackwardSet; |
| fastLatinTableLength = data.fastLatinTableLength; |
| } else if(baseData == NULL) { |
| hasMappings = FALSE; |
| if(settings.reorderCodesLength == 0) { |
| // only options |
| indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here |
| } else { |
| // only options, reorder codes, and the reorder table |
| indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; |
| } |
| } else { |
| hasMappings = TRUE; |
| // Tailored mappings, and what else? |
| // Check in ascending order of optional tailoring data items. |
| indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; |
| if(data.contextsLength != 0) { |
| indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; |
| } |
| unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); |
| if(!unsafeBackwardSet.isEmpty()) { |
| indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; |
| } |
| if(data.fastLatinTable != baseData->fastLatinTable) { |
| fastLatinTableLength = data.fastLatinTableLength; |
| indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; |
| } |
| } |
| |
| UVector32 codesAndRanges(errorCode); |
| const int32_t *reorderCodes = settings.reorderCodes; |
| int32_t reorderCodesLength = settings.reorderCodesLength; |
| if(settings.hasReordering() && |
| CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { |
| // Rebuild the full list of reorder ranges. |
| // The list in the settings is truncated for efficiency. |
| data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); |
| // Write the codes, then the ranges. |
| for(int32_t i = 0; i < reorderCodesLength; ++i) { |
| codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); |
| } |
| if(U_FAILURE(errorCode)) { return 0; } |
| reorderCodes = codesAndRanges.getBuffer(); |
| reorderCodesLength = codesAndRanges.size(); |
| } |
| |
| int32_t headerSize; |
| if(isBase) { |
| headerSize = 0; // udata_create() writes the header |
| } else { |
| DataHeader header; |
| header.dataHeader.magic1 = 0xda; |
| header.dataHeader.magic2 = 0x27; |
| uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); |
| uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); |
| headerSize = (int32_t)sizeof(header); |
| U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes |
| if(hasMappings && data.cesLength != 0) { |
| // Sum of the sizes of the data items which are |
| // not automatically multiples of 8 bytes and which are placed before the CEs. |
| int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; |
| if((sum & 7) != 0) { |
| // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. |
| // We add to the header size here. |
| // Alternatively, we could increment the indexesLength |
| // or add a few bytes to the reorderTable. |
| headerSize += 4; |
| } |
| } |
| header.dataHeader.headerSize = (uint16_t)headerSize; |
| if(headerSize <= capacity) { |
| uprv_memcpy(dest, &header, sizeof(header)); |
| // Write 00 bytes so that the padding is not mistaken for a copyright string. |
| uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); |
| dest += headerSize; |
| capacity -= headerSize; |
| } else { |
| dest = NULL; |
| capacity = 0; |
| } |
| } |
| |
| indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; |
| U_ASSERT((settings.options & ~0xffff) == 0); |
| indexes[CollationDataReader::IX_OPTIONS] = |
| data.numericPrimary | fastLatinVersion | settings.options; |
| indexes[CollationDataReader::IX_RESERVED2] = 0; |
| indexes[CollationDataReader::IX_RESERVED3] = 0; |
| |
| // Byte offsets of data items all start from the start of the indexes. |
| // We add the headerSize at the very end. |
| int32_t totalSize = indexesLength * 4; |
| |
| if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { |
| indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s); |
| } else { |
| indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; |
| } |
| |
| indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; |
| totalSize += reorderCodesLength * 4; |
| |
| indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; |
| if(settings.reorderTable != NULL) { |
| totalSize += 256; |
| } |
| |
| indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; |
| if(hasMappings) { |
| UErrorCode errorCode2 = U_ZERO_ERROR; |
| int32_t length; |
| if(totalSize < capacity) { |
| length = utrie2_serialize(data.trie, dest + totalSize, |
| capacity - totalSize, &errorCode2); |
| } else { |
| length = utrie2_serialize(data.trie, NULL, 0, &errorCode2); |
| } |
| if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| errorCode = errorCode2; |
| return 0; |
| } |
| // The trie size should be a multiple of 8 bytes due to the way |
| // compactIndex2(UNewTrie2 *trie) currently works. |
| U_ASSERT((length & 7) == 0); |
| totalSize += length; |
| } |
| |
| indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; |
| indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; |
| if(hasMappings && data.cesLength != 0) { |
| U_ASSERT(((headerSize + totalSize) & 7) == 0); |
| totalSize += data.cesLength * 8; |
| } |
| |
| indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; |
| indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; |
| if(hasMappings) { |
| totalSize += data.ce32sLength * 4; |
| } |
| |
| indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; |
| totalSize += rootElementsLength * 4; |
| |
| indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; |
| if(hasMappings) { |
| totalSize += data.contextsLength * 2; |
| } |
| |
| indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; |
| if(hasMappings && !unsafeBackwardSet.isEmpty()) { |
| UErrorCode errorCode2 = U_ZERO_ERROR; |
| int32_t length; |
| if(totalSize < capacity) { |
| uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); |
| length = unsafeBackwardSet.serialize( |
| p, (capacity - totalSize) / 2, errorCode2); |
| } else { |
| length = unsafeBackwardSet.serialize(NULL, 0, errorCode2); |
| } |
| if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { |
| errorCode = errorCode2; |
| return 0; |
| } |
| totalSize += length * 2; |
| } |
| |
| indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; |
| totalSize += fastLatinTableLength * 2; |
| |
| UnicodeString scripts; |
| indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; |
| if(isBase) { |
| scripts.append((UChar)data.numScripts); |
| scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16); |
| scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength); |
| totalSize += scripts.length() * 2; |
| } |
| |
| indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; |
| if(isBase) { |
| totalSize += 256; |
| } |
| |
| indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; |
| indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; |
| |
| if(totalSize > capacity) { |
| errorCode = U_BUFFER_OVERFLOW_ERROR; |
| return headerSize + totalSize; |
| } |
| |
| uprv_memcpy(dest, indexes, indexesLength * 4); |
| copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); |
| copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); |
| // The trie has already been serialized into the dest buffer. |
| copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); |
| copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); |
| copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); |
| copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); |
| // The unsafeBackwardSet has already been serialized into the dest buffer. |
| copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); |
| copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); |
| copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); |
| |
| return headerSize + totalSize; |
| } |
| |
| void |
| CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, |
| const void *src, uint8_t *dest) { |
| int32_t start = indexes[startIndex]; |
| int32_t limit = indexes[startIndex + 1]; |
| if(start < limit) { |
| uprv_memcpy(dest + start, src, limit - start); |
| } |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |