| // Copyright 2016 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "src/unicode-decoder.h" |
| #include "src/unicode-inl.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| namespace { |
| |
| using Utf8Decoder = unibrow::Utf8Decoder<512>; |
| |
| void Decode(Utf8Decoder* decoder, const std::string& str) { |
| // Put the string in its own buffer on the heap to make sure that |
| // AddressSanitizer's heap-buffer-overflow logic can see what's going on. |
| std::unique_ptr<char[]> buffer(new char[str.length()]); |
| memcpy(buffer.get(), str.data(), str.length()); |
| decoder->Reset(buffer.get(), str.length()); |
| } |
| |
| void DecodeNormally(const std::vector<byte>& bytes, |
| std::vector<unibrow::uchar>* output) { |
| size_t cursor = 0; |
| while (cursor < bytes.size()) { |
| output->push_back( |
| unibrow::Utf8::ValueOf(bytes.data() + cursor, bytes.size(), &cursor)); |
| } |
| } |
| |
| void DecodeIncrementally(const std::vector<byte>& bytes, |
| std::vector<unibrow::uchar>* output) { |
| unibrow::Utf8::Utf8IncrementalBuffer buffer = 0; |
| for (auto b : bytes) { |
| unibrow::uchar result = unibrow::Utf8::ValueOfIncremental(b, &buffer); |
| if (result != unibrow::Utf8::kIncomplete) { |
| output->push_back(result); |
| } |
| } |
| unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&buffer); |
| if (result != unibrow::Utf8::kBufferEmpty) { |
| output->push_back(result); |
| } |
| } |
| |
| } // namespace |
| |
| TEST(UnicodeTest, ReadOffEndOfUtf8String) { |
| Utf8Decoder decoder; |
| |
| // Not enough continuation bytes before string ends. |
| Decode(&decoder, "\xE0"); |
| Decode(&decoder, "\xED"); |
| Decode(&decoder, "\xF0"); |
| Decode(&decoder, "\xF4"); |
| } |
| |
| TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) { |
| // Unfortunately, V8 has two UTF-8 decoders. This test checks that they |
| // produce the same result. This test was inspired by |
| // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt . |
| typedef struct { |
| std::vector<byte> bytes; |
| std::vector<unibrow::uchar> unicode_expected; |
| } TestCase; |
| |
| TestCase data[] = { |
| // Correct UTF-8 text. |
| {{0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5}, |
| {0x3ba, 0x1f79, 0x3c3, 0x3bc, 0x3b5}}, |
| |
| // First possible sequence of a certain length: |
| // 1 byte |
| {{0x00}, {0x0}}, |
| // 2 bytes |
| {{0xc2, 0x80}, {0x80}}, |
| // 3 bytes |
| {{0xe0, 0xa0, 0x80}, {0x800}}, |
| // 4 bytes |
| {{0xf0, 0x90, 0x80, 0x80}, {0x10000}}, |
| // 5 bytes (not supported) |
| {{0xf8, 0x88, 0x80, 0x80, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6 bytes (not supported) |
| {{0xfc, 0x84, 0x80, 0x80, 0x80, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Last possible sequence of certain length: |
| // 1 byte |
| {{0x7f}, {0x7f}}, |
| // 2 bytes |
| {{0xdf, 0xbf}, {0x7ff}}, |
| // 3 bytes |
| {{0xef, 0xbf, 0xbf}, {0xffff}}, |
| // 4 bytes (this sequence is not a valid code point) |
| {{0xf7, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 5 bytes (not supported) |
| {{0xfb, 0xbf, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6 bytes (not supported) |
| {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // Other boundary conditions: |
| {{0xed, 0x9f, 0xbf}, {0xd7ff}}, |
| {{0xee, 0x80, 0x80}, {0xe000}}, |
| // U+fffd (invalid code point) |
| {{0xef, 0xbf, 0xbd}, {0xfffd}}, |
| // U+10ffff (last valid code point) |
| {{0xf4, 0x8f, 0xbf, 0xbf}, {0x10ffff}}, |
| // First invalid (too large) code point |
| {{0xf4, 0x90, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Malformed sequences: |
| // Unexpected continuation bytes: |
| // First continuation byte |
| {{0x80}, {0xfffd}}, |
| // Last continuation byte |
| {{0xbf}, {0xfffd}}, |
| // 2 continuation bytes |
| {{0x80, 0xbf}, {0xfffd, 0xfffd}}, |
| // 3 continuation bytes |
| {{0x80, 0xbf, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| // 4 continuation bytes |
| {{0x80, 0xbf, 0x80, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 5 continuation bytes |
| {{0x80, 0xbf, 0x80, 0xbf, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6 continuation bytes |
| {{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 7 continuation bytes |
| {{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // Sequence of all 64 possible continuation bytes |
| {{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, |
| 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, |
| 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, |
| 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, |
| 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, |
| 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // Using each possible continuation byte in a two-byte sequence: |
| {{0xd0, 0x80, 0xd0, 0x81, 0xd0, 0x82, 0xd0, 0x83, 0xd0, 0x84, 0xd0, 0x85, |
| 0xd0, 0x86, 0xd0, 0x87, 0xd0, 0x88, 0xd0, 0x89, 0xd0, 0x8a, 0xd0, 0x8b, |
| 0xd0, 0x8c, 0xd0, 0x8d, 0xd0, 0x8e, 0xd0, 0x8f, 0xd0, 0x90, 0xd0, 0x91, |
| 0xd0, 0x92, 0xd0, 0x93, 0xd0, 0x94, 0xd0, 0x95, 0xd0, 0x96, 0xd0, 0x97, |
| 0xd0, 0x98, 0xd0, 0x99, 0xd0, 0x9a, 0xd0, 0x9b, 0xd0, 0x9c, 0xd0, 0x9d, |
| 0xd0, 0x9e, 0xd0, 0x9f, 0xd0, 0xa0, 0xd0, 0xa1, 0xd0, 0xa2, 0xd0, 0xa3, |
| 0xd0, 0xa4, 0xd0, 0xa5, 0xd0, 0xa6, 0xd0, 0xa7, 0xd0, 0xa8, 0xd0, 0xa9, |
| 0xd0, 0xaa, 0xd0, 0xab, 0xd0, 0xac, 0xd0, 0xad, 0xd0, 0xae, 0xd0, 0xaf, |
| 0xd0, 0xb0, 0xd0, 0xb1, 0xd0, 0xb2, 0xd0, 0xb3, 0xd0, 0xb4, 0xd0, 0xb5, |
| 0xd0, 0xb6, 0xd0, 0xb7, 0xd0, 0xb8, 0xd0, 0xb9, 0xd0, 0xba, 0xd0, 0xbb, |
| 0xd0, 0xbc, 0xd0, 0xbd, 0xd0, 0xbe, 0xd0, 0xbf}, |
| {0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409, |
| 0x40a, 0x40b, 0x40c, 0x40d, 0x40e, 0x40f, 0x410, 0x411, 0x412, 0x413, |
| 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d, |
| 0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, |
| 0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431, |
| 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43a, 0x43b, |
| 0x43c, 0x43d, 0x43e, 0x43f}}, |
| |
| // Lonely first bytes: |
| // All 32 first bytes of 32-byte sequences, each followed by a space |
| // (generates 32 invalid char + space sequences. |
| {{0xc0, 0x20, 0xc1, 0x20, 0xc2, 0x20, 0xc3, 0x20, 0xc4, 0x20, 0xc5, |
| 0x20, 0xc6, 0x20, 0xc7, 0x20, 0xc8, 0x20, 0xc9, 0x20, 0xca, 0x20, |
| 0xcb, 0x20, 0xcc, 0x20, 0xcd, 0x20, 0xce, 0x20, 0xcf, 0x20, 0xd0, |
| 0x20, 0xd1, 0x20, 0xd2, 0x20, 0xd3, 0x20, 0xd4, 0x20, 0xd5, 0x20, |
| 0xd6, 0x20, 0xd7, 0x20, 0xd8, 0x20, 0xd9, 0x20, 0xda, 0x20, 0xdb, |
| 0x20, 0xdc, 0x20, 0xdd, 0x20, 0xde, 0x20, 0xdf, 0x20}, |
| {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20}}, |
| // All 16 first bytes of 3-byte sequences, each followed by a space |
| // (generates 16 invalid char + space sequences): |
| {{0xe0, 0x20, 0xe1, 0x20, 0xe2, 0x20, 0xe3, 0x20, 0xe4, 0x20, 0xe5, |
| 0x20, 0xe6, 0x20, 0xe7, 0x20, 0xe8, 0x20, 0xe9, 0x20, 0xea, 0x20, |
| 0xeb, 0x20, 0xec, 0x20, 0xed, 0x20, 0xee, 0x20, 0xef, 0x20}, |
| {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}}, |
| // All 8 first bytes of 4-byte sequences, each followed by a space |
| // (generates 8 invalid char + space sequences): |
| {{0xf0, 0x20, 0xf1, 0x20, 0xf2, 0x20, 0xf3, 0x20, 0xf4, 0x20, 0xf5, 0x20, |
| 0xf6, 0x20, 0xf7, 0x20}, |
| {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, |
| 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}}, |
| // All 4 first bytes of 5-byte sequences (not supported), each followed by |
| // a space (generates 4 invalid char + space sequences): |
| {{0xf8, 0x20, 0xf9, 0x20, 0xfa, 0x20, 0xfb, 0x20}, |
| {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}}, |
| // All 2 first bytes of 6-byte sequences (not supported), each followed by |
| // a space (generates 2 invalid char + space sequences): |
| {{0xfc, 0x20, 0xfd, 0x20}, {0xfffd, 0x20, 0xfffd, 0x20}}, |
| |
| // Sequences with last continuation byte missing. Normally the whole |
| // incomplete sequence generates a single invalid character (exceptions |
| // explained below). |
| |
| // 2-byte sequences with last byte missing |
| {{0xc0}, {0xfffd}}, |
| {{0xdf}, {0xfffd}}, |
| // 3-byte sequences with last byte missing. |
| {{0xe8, 0x80}, {0xfffd}}, |
| {{0xe0, 0xbf}, {0xfffd}}, |
| {{0xef, 0xbf}, {0xfffd}}, |
| // Start of an overlong sequence. The first "maximal subpart" is the first |
| // byte; it creates an invalid character. Each following byte generates an |
| // invalid character too. |
| {{0xe0, 0x80}, {0xfffd, 0xfffd}}, |
| // 4-byte sequences with last byte missing |
| {{0xf1, 0x80, 0x80}, {0xfffd}}, |
| {{0xf4, 0x8f, 0xbf}, {0xfffd}}, |
| // Start of an overlong sequence. The first "maximal subpart" is the first |
| // byte; it creates an invalid character. Each following byte generates an |
| // invalid character too. |
| {{0xf0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| // 5-byte sequences (not supported) with last byte missing |
| {{0xf8, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfb, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6-byte sequences (not supported) with last byte missing |
| {{0xfc, 0x80, 0x80, 0x80, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Concatenation of incomplete sequences: above incomplete sequences |
| // concatenated. |
| {{0xc0, 0xdf, 0xe8, 0x80, 0xe0, 0xbf, 0xef, 0xbf, 0xe0, 0x80, |
| 0xf1, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xf0, 0x80, 0x80, 0xf8, |
| 0x80, 0x80, 0x80, 0xfb, 0xbf, 0xbf, 0xbf, 0xfc, 0x80, 0x80, |
| 0x80, 0x80, 0xfd, 0xbf, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, |
| 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Incomplete sequence tests repeated with a space after the incomplete |
| // sequence. |
| |
| // 2-byte sequences with last byte missing |
| {{0xc0, 0x20}, {0xfffd, 0x20}}, |
| {{0xdf, 0x20}, {0xfffd, 0x20}}, |
| // 3-byte sequences with last byte missing |
| {{0xe8, 0x80, 0x20}, {0xfffd, 0x20}}, |
| {{0xe0, 0xbf, 0x20}, {0xfffd, 0x20}}, |
| {{0xef, 0xbf, 0x20}, {0xfffd, 0x20}}, |
| // Start of overlong 3-byte sequence with last byte missing |
| {{0xe0, 0x80, 0x20}, {0xfffd, 0xfffd, 0x20}}, |
| // 4-byte sequences with last byte missing |
| {{0xf1, 0x80, 0x80, 0x20}, {0xfffd, 0x20}}, |
| {{0xf4, 0x8f, 0xbf, 0x20}, {0xfffd, 0x20}}, |
| // Start of overlong 4-byte sequence with last byte missing |
| {{0xf0, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0x20}}, |
| // 5-byte sequences (not supported) with last byte missing |
| {{0xf8, 0x80, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}}, |
| {{0xfb, 0xbf, 0xbf, 0xbf, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}}, |
| // 6-byte sequences (not supported) with last byte missing |
| {{0xfc, 0x80, 0x80, 0x80, 0x80, 0x20}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}}, |
| {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0x20}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}}, |
| |
| // Impossible bytes |
| {{0xfe}, {0xfffd}}, |
| {{0xff}, {0xfffd}}, |
| {{0xfe, 0xfe, 0xff, 0xff}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // Lead-byte-like bytes which aren't valid lead bytes. |
| {{0xc0}, {0xfffd}}, |
| {{0xc0, 0xaa}, {0xfffd, 0xfffd}}, |
| {{0xc1}, {0xfffd}}, |
| {{0xc1, 0xaa}, {0xfffd, 0xfffd}}, |
| {{0xf5}, {0xfffd}}, |
| {{0xf5, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf6}, {0xfffd}}, |
| {{0xf6, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf7}, {0xfffd}}, |
| {{0xf7, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf8}, {0xfffd}}, |
| {{0xf8, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf9}, {0xfffd}}, |
| {{0xf9, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfa}, {0xfffd}}, |
| {{0xfa, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfb}, {0xfffd}}, |
| {{0xfb, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfc}, {0xfffd}}, |
| {{0xfc, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfd}, {0xfffd}}, |
| {{0xfd, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xfe}, {0xfffd}}, |
| {{0xfe, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xff}, {0xfffd}}, |
| {{0xff, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Overlong sequences: |
| |
| // Overlong encodings for "/" |
| {{0xc0, 0xaf}, {0xfffd, 0xfffd}}, |
| {{0xe0, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf0, 0x80, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 5-byte sequence (not supported anyway) |
| {{0xf8, 0x80, 0x80, 0x80, 0xaf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6-byte sequence (not supported anyway) |
| {{0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Maximum overlong sequences |
| {{0xc1, 0xbf}, {0xfffd, 0xfffd}}, |
| {{0xe0, 0x9f, 0xbf}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf0, 0x8f, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 5-byte sequence (not supported anyway) |
| {{0xf8, 0x87, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6-byte sequence (not supported anyway) |
| {{0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Overlong encodings for 0 |
| {{0xc0, 0x80}, {0xfffd, 0xfffd}}, |
| {{0xe0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xf0, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 5-byte sequence (not supported anyway) |
| {{0xf8, 0x80, 0x80, 0x80, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| // 6-byte sequence (not supported anyway) |
| {{0xfc, 0x80, 0x80, 0x80, 0x80, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Illegal code positions: |
| |
| // Single UTF-16 surrogates |
| {{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xad, 0xbf}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xae, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xaf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xb0, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xbe, 0x80}, {0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Paired surrogates |
| {{0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xae, 0x80, 0xed, 0xb0, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| {{0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf}, |
| {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}}, |
| |
| // Surrogates with the last byte missing. |
| {{0xed, 0xa0}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xa0}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xad}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xae}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xaf}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xb0}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xbe}, {0xfffd, 0xfffd}}, |
| {{0xed, 0xbf}, {0xfffd, 0xfffd}}, |
| |
| // Other non-characters |
| {{0xef, 0xbf, 0xbe}, {0xfffe}}, |
| {{0xef, 0xbf, 0xbf}, {0xffff}}, |
| {{0xef, 0xb7, 0x90, 0xef, 0xb7, 0x91, 0xef, 0xb7, 0x92, 0xef, 0xb7, 0x93, |
| 0xef, 0xb7, 0x94, 0xef, 0xb7, 0x95, 0xef, 0xb7, 0x96, 0xef, 0xb7, 0x97, |
| 0xef, 0xb7, 0x98, 0xef, 0xb7, 0x99, 0xef, 0xb7, 0x9a, 0xef, 0xb7, 0x9b, |
| 0xef, 0xb7, 0x9c, 0xef, 0xb7, 0x9d, 0xef, 0xb7, 0x9e, 0xef, 0xb7, 0x9f, |
| 0xef, 0xb7, 0xa0, 0xef, 0xb7, 0xa1, 0xef, 0xb7, 0xa2, 0xef, 0xb7, 0xa3, |
| 0xef, 0xb7, 0xa4, 0xef, 0xb7, 0xa5, 0xef, 0xb7, 0xa6, 0xef, 0xb7, 0xa7, |
| 0xef, 0xb7, 0xa8, 0xef, 0xb7, 0xa9, 0xef, 0xb7, 0xaa, 0xef, 0xb7, 0xab, |
| 0xef, 0xb7, 0xac, 0xef, 0xb7, 0xad, 0xef, 0xb7, 0xae, 0xef, 0xb7, 0xaf}, |
| {0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, |
| 0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, |
| 0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, |
| 0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef}}, |
| {{0xf0, 0x9f, 0xbf, 0xbe, 0xf0, 0x9f, 0xbf, 0xbf, 0xf0, 0xaf, 0xbf, |
| 0xbe, 0xf0, 0xaf, 0xbf, 0xbf, 0xf0, 0xbf, 0xbf, 0xbe, 0xf0, 0xbf, |
| 0xbf, 0xbf, 0xf1, 0x8f, 0xbf, 0xbe, 0xf1, 0x8f, 0xbf, 0xbf, 0xf1, |
| 0x9f, 0xbf, 0xbe, 0xf1, 0x9f, 0xbf, 0xbf, 0xf1, 0xaf, 0xbf, 0xbe, |
| 0xf1, 0xaf, 0xbf, 0xbf, 0xf1, 0xbf, 0xbf, 0xbe, 0xf1, 0xbf, 0xbf, |
| 0xbf, 0xf2, 0x8f, 0xbf, 0xbe, 0xf2, 0x8f, 0xbf, 0xbf}, |
| {0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, 0x4fffe, 0x4ffff, |
| 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, 0x8fffe, |
| 0x8ffff}}, |
| }; |
| |
| for (auto test : data) { |
| // For figuring out which test fails: |
| fprintf(stderr, "test: "); |
| for (auto b : test.bytes) { |
| fprintf(stderr, "%x ", b); |
| } |
| fprintf(stderr, "\n"); |
| |
| std::vector<unibrow::uchar> output_normal; |
| DecodeNormally(test.bytes, &output_normal); |
| |
| CHECK_EQ(output_normal.size(), test.unicode_expected.size()); |
| for (size_t i = 0; i < output_normal.size(); ++i) { |
| CHECK_EQ(output_normal[i], test.unicode_expected[i]); |
| } |
| |
| std::vector<unibrow::uchar> output_incremental; |
| DecodeIncrementally(test.bytes, &output_incremental); |
| |
| CHECK_EQ(output_incremental.size(), test.unicode_expected.size()); |
| for (size_t i = 0; i < output_incremental.size(); ++i) { |
| CHECK_EQ(output_incremental[i], test.unicode_expected[i]); |
| } |
| } |
| } |
| |
| } // namespace internal |
| } // namespace v8 |