| // Copyright 2018 Google LLC. |
| // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. |
| |
| #include "src/utils/SkUTF.h" |
| |
| #include <climits> |
| |
// Shifts `value` left by `shift` bits. The shift itself is carried out on the
// unsigned representation so that bits moving into (or out of) the sign bit
// never constitute a signed overflow.
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
| |
// True when the lowest bit of `x` is clear, i.e. `x` is a multiple of 2.
template <typename T>
static constexpr bool is_align2(T x) {
    return !(x & 1);
}
| |
// True when the two lowest bits of `x` are clear, i.e. `x` is a multiple of 4.
template <typename T>
static constexpr bool is_align4(T x) {
    return !(x & 3);
}
| |
// True for UTF-16 code units in [0xD800, 0xDBFF] (the first unit of a surrogate pair).
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
| |
// True for UTF-16 code units in [0xDC00, 0xDFFF] (the second unit of a surrogate pair).
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
| |
/** Classifies a single byte of a UTF-8 stream.

    @returns -1 iff invalid UTF8 byte,
              0 iff UTF8 continuation byte,
              1 iff ASCII byte,
              2 iff leading byte of 2-byte sequence,
              3 iff leading byte of 3-byte sequence, and
              4 iff leading byte of 4-byte sequence.
    I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c >= 0xF5 || (c & 0xFE) == 0xC0) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    // Here c is in [0xC2, 0xF4], so its high nibble is 0xC, 0xD, 0xE or 0xF.
    // 0xE5 == 0b11'10'01'01 is a packed table of (sequence length - 1), two bits
    // per high nibble, placed in the top byte so that shifting right by
    // 2 * nibble (24, 26, 28 or 30) brings the wanted entry into the low two bits.
    // The lookup is done in unsigned arithmetic: 0xE5 << 24 does not fit in a
    // signed int, and right-shifting the resulting negative value would be
    // implementation-defined.
    constexpr uint32_t kPackedLengths = static_cast<uint32_t>(0xE5) << 24;
    const unsigned shift = (static_cast<unsigned>(c) >> 4) << 1;
    const int value = static_cast<int>((kPackedLengths >> shift) & 3u) + 1;
    // assert(value >= 2 && value <= 4);
    return value;
}
// `type` is a utf8_byte_type() result; only the strictly positive values
// (1..4, the sequence lengths) denote a byte that may start a sequence.
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
| |
// True for bytes of the form 0b10xxxxxx, i.e. 0x80..0xBF -- exactly the bytes
// that utf8_byte_type() classifies as 0 (continuation).
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
| |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| int SkUTF::CountUTF8(const char* utf8, size_t byteLength) { |
| if (!utf8) { |
| return -1; |
| } |
| int count = 0; |
| const char* stop = utf8 + byteLength; |
| while (utf8 < stop) { |
| int type = utf8_byte_type(*(const uint8_t*)utf8); |
| if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) { |
| return -1; // Sequence extends beyond end. |
| } |
| while(type-- > 1) { |
| ++utf8; |
| if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { |
| return -1; |
| } |
| } |
| ++utf8; |
| ++count; |
| } |
| return count; |
| } |
| |
| int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) { |
| if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) { |
| return -1; |
| } |
| const uint16_t* src = (const uint16_t*)utf16; |
| const uint16_t* stop = src + (byteLength >> 1); |
| int count = 0; |
| while (src < stop) { |
| unsigned c = *src++; |
| if (utf16_is_low_surrogate(c)) { |
| return -1; |
| } |
| if (utf16_is_high_surrogate(c)) { |
| if (src >= stop) { |
| return -1; |
| } |
| c = *src++; |
| if (!utf16_is_low_surrogate(c)) { |
| return -1; |
| } |
| } |
| count += 1; |
| } |
| return count; |
| } |
| |
| int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) { |
| if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) { |
| return -1; |
| } |
| const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
| const uint32_t* ptr = (const uint32_t*)utf32; |
| const uint32_t* stop = ptr + (byteLength >> 2); |
| while (ptr < stop) { |
| if (*ptr & kInvalidUnicharMask) { |
| return -1; |
| } |
| ptr += 1; |
| } |
| return (int)(byteLength >> 2); |
| } |
| |
| template <typename T> |
| static SkUnichar next_fail(const T** ptr, const T* end) { |
| *ptr = end; |
| return -1; |
| } |
| |
| SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) { |
| if (!ptr || !end ) { |
| return -1; |
| } |
| const uint8_t* p = (const uint8_t*)*ptr; |
| if (!p || p >= (const uint8_t*)end) { |
| return next_fail(ptr, end); |
| } |
| int c = *p; |
| int hic = c << 24; |
| |
| if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) { |
| return next_fail(ptr, end); |
| } |
| if (hic < 0) { |
| uint32_t mask = (uint32_t)~0x3F; |
| hic = left_shift(hic, 1); |
| do { |
| ++p; |
| if (p >= (const uint8_t*)end) { |
| return next_fail(ptr, end); |
| } |
| // check before reading off end of array. |
| uint8_t nextByte = *p; |
| if (!utf8_byte_is_continuation(nextByte)) { |
| return next_fail(ptr, end); |
| } |
| c = (c << 6) | (nextByte & 0x3F); |
| mask <<= 5; |
| } while ((hic = left_shift(hic, 1)) < 0); |
| c &= ~mask; |
| } |
| *ptr = (char*)p + 1; |
| return c; |
| } |
| |
| SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) { |
| if (!ptr || !end ) { |
| return -1; |
| } |
| const uint16_t* src = *ptr; |
| if (!src || src + 1 > end || !is_align2(intptr_t(src))) { |
| return next_fail(ptr, end); |
| } |
| uint16_t c = *src++; |
| SkUnichar result = c; |
| if (utf16_is_low_surrogate(c)) { |
| return next_fail(ptr, end); // srcPtr should never point at low surrogate. |
| } |
| if (utf16_is_high_surrogate(c)) { |
| if (src + 1 > end) { |
| return next_fail(ptr, end); // Truncated string. |
| } |
| uint16_t low = *src++; |
| if (!utf16_is_low_surrogate(low)) { |
| return next_fail(ptr, end); |
| } |
| /* |
| [paraphrased from wikipedia] |
| Take the high surrogate and subtract 0xD800, then multiply by 0x400. |
| Take the low surrogate and subtract 0xDC00. Add these two results |
| together, and finally add 0x10000 to get the final decoded codepoint. |
| |
| unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 |
| unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000 |
| unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000 |
| unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000) |
| */ |
| result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000); |
| } |
| *ptr = src; |
| return result; |
| } |
| |
| SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) { |
| if (!ptr || !end ) { |
| return -1; |
| } |
| const int32_t* s = *ptr; |
| if (!s || s + 1 > end || !is_align4(intptr_t(s))) { |
| return next_fail(ptr, end); |
| } |
| int32_t value = *s; |
| const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
| if (value & kInvalidUnicharMask) { |
| return next_fail(ptr, end); |
| } |
| *ptr = s + 1; |
| return value; |
| } |
| |
| size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) { |
| if ((uint32_t)uni > 0x10FFFF) { |
| return 0; |
| } |
| if (uni <= 127) { |
| if (utf8) { |
| *utf8 = (char)uni; |
| } |
| return 1; |
| } |
| char tmp[4]; |
| char* p = tmp; |
| size_t count = 1; |
| while (uni > 0x7F >> count) { |
| *p++ = (char)(0x80 | (uni & 0x3F)); |
| uni >>= 6; |
| count += 1; |
| } |
| if (utf8) { |
| p = tmp; |
| utf8 += count; |
| while (p < tmp + count - 1) { |
| *--utf8 = *p++; |
| } |
| *--utf8 = (char)(~(0xFF >> count) | uni); |
| } |
| return count; |
| } |
| |
| size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) { |
| if ((uint32_t)uni > 0x10FFFF) { |
| return 0; |
| } |
| int extra = (uni > 0xFFFF); |
| if (utf16) { |
| if (extra) { |
| utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10)); |
| utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF)); |
| } else { |
| utf16[0] = (uint16_t)uni; |
| } |
| } |
| return 1 + extra; |
| } |
| |