src/v8/test/unittests/unicode-unittest.cc - cobalt - Git at Google

 // Copyright 2016 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include <memory>
 #include <string>
 #include <vector>

 #include "src/unicode-decoder.h"
 #include "src/unicode-inl.h"
 #include "testing/gtest/include/gtest/gtest.h"

 namespace v8 {
 namespace internal {

 namespace {

 using Utf8Decoder = unibrow::Utf8Decoder<512>;

 void Decode(Utf8Decoder* decoder, const std::string& str) {
   // Copy the input into a heap allocation of exactly the same length so that
   // AddressSanitizer's heap-buffer-overflow detection can observe any read
   // past the end of the bytes.
   const size_t byte_count = str.length();
   std::unique_ptr<char[]> heap_copy(new char[byte_count]);
   memcpy(heap_copy.get(), str.data(), byte_count);
   decoder->Reset(heap_copy.get(), byte_count);
 }

 // Decodes |bytes| with the non-incremental decoder, unibrow::Utf8::ValueOf,
 // and appends every value it returns to |output|. The loop relies on ValueOf
 // advancing |cursor| on each call.
 void DecodeNormally(const std::vector<byte>& bytes,
                     std::vector<unibrow::uchar>* output) {
   size_t cursor = 0;
   while (cursor < bytes.size()) {
     // The data pointer is already advanced by |cursor|, so only
     // |bytes.size() - cursor| bytes remain readable from it. Pass that
     // remaining count instead of the full size so the decoder is never told
     // it may look past the end of the vector.
     // NOTE(review): assumes the length argument counts bytes starting at the
     // given pointer -- confirm against the declaration of Utf8::ValueOf.
     const size_t remaining = bytes.size() - cursor;
     output->push_back(
         unibrow::Utf8::ValueOf(bytes.data() + cursor, remaining, &cursor));
   }
 }

 void DecodeIncrementally(const std::vector<byte>& bytes,
                          std::vector<unibrow::uchar>* output) {
   unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
   for (auto b : bytes) {
     unibrow::uchar result = unibrow::Utf8::ValueOfIncremental(b, &buffer);
     if (result != unibrow::Utf8::kIncomplete) {
       output->push_back(result);
     }
   }
   unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&buffer);
   if (result != unibrow::Utf8::kBufferEmpty) {
     output->push_back(result);
   }
 }

 }  // namespace

 TEST(UnicodeTest, ReadOffEndOfUtf8String) {
   // Each input ends before it has supplied enough continuation bytes.
   const char* const kTruncatedInputs[] = {"\xE0", "\xED", "\xF0", "\xF4"};

   Utf8Decoder decoder;
   for (const char* input : kTruncatedInputs) {
     Decode(&decoder, input);
   }
 }

 TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
   // Unfortunately, V8 has two UTF-8 decoders. This test checks that they
   // produce the same result. This test was inspired by
   // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt .
   //
   // Each test case pairs an input byte sequence with the values both decoders
   // are expected to produce for it.
   struct TestCase {
     std::vector<byte> bytes;
     std::vector<unibrow::uchar> unicode_expected;
   };

   const TestCase data[] = {
       // Correct UTF-8 text.
       {{0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5},
        {0x3ba, 0x1f79, 0x3c3, 0x3bc, 0x3b5}},

       // First possible sequence of a certain length:
       // 1 byte
       {{0x00}, {0x0}},
       // 2 bytes
       {{0xc2, 0x80}, {0x80}},
       // 3 bytes
       {{0xe0, 0xa0, 0x80}, {0x800}},
       // 4 bytes
       {{0xf0, 0x90, 0x80, 0x80}, {0x10000}},
       // 5 bytes (not supported)
       {{0xf8, 0x88, 0x80, 0x80, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6 bytes (not supported)
       {{0xfc, 0x84, 0x80, 0x80, 0x80, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Last possible sequence of certain length:
       // 1 byte
       {{0x7f}, {0x7f}},
       // 2 bytes
       {{0xdf, 0xbf}, {0x7ff}},
       // 3 bytes
       {{0xef, 0xbf, 0xbf}, {0xffff}},
       // 4 bytes (this sequence is not a valid code point)
       {{0xf7, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 5 bytes (not supported)
       {{0xfb, 0xbf, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6 bytes (not supported)
       {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // Other boundary conditions:
       {{0xed, 0x9f, 0xbf}, {0xd7ff}},
       {{0xee, 0x80, 0x80}, {0xe000}},
       // U+fffd (the replacement character, correctly encoded; decodes to
       // itself)
       {{0xef, 0xbf, 0xbd}, {0xfffd}},
       // U+10ffff (last valid code point)
       {{0xf4, 0x8f, 0xbf, 0xbf}, {0x10ffff}},
       // First invalid (too large) code point
       {{0xf4, 0x90, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Malformed sequences:
       // Unexpected continuation bytes:
       // First continuation byte
       {{0x80}, {0xfffd}},
       // Last continuation byte
       {{0xbf}, {0xfffd}},
       // 2 continuation bytes
       {{0x80, 0xbf}, {0xfffd, 0xfffd}},
       // 3 continuation bytes
       {{0x80, 0xbf, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       // 4 continuation bytes
       {{0x80, 0xbf, 0x80, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 5 continuation bytes
       {{0x80, 0xbf, 0x80, 0xbf, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6 continuation bytes
       {{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 7 continuation bytes
       {{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // Sequence of all 64 possible continuation bytes
       {{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
         0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
         0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0,
         0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
         0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
         0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // Using each possible continuation byte in a two-byte sequence:
       {{0xd0, 0x80, 0xd0, 0x81, 0xd0, 0x82, 0xd0, 0x83, 0xd0, 0x84, 0xd0, 0x85,
         0xd0, 0x86, 0xd0, 0x87, 0xd0, 0x88, 0xd0, 0x89, 0xd0, 0x8a, 0xd0, 0x8b,
         0xd0, 0x8c, 0xd0, 0x8d, 0xd0, 0x8e, 0xd0, 0x8f, 0xd0, 0x90, 0xd0, 0x91,
         0xd0, 0x92, 0xd0, 0x93, 0xd0, 0x94, 0xd0, 0x95, 0xd0, 0x96, 0xd0, 0x97,
         0xd0, 0x98, 0xd0, 0x99, 0xd0, 0x9a, 0xd0, 0x9b, 0xd0, 0x9c, 0xd0, 0x9d,
         0xd0, 0x9e, 0xd0, 0x9f, 0xd0, 0xa0, 0xd0, 0xa1, 0xd0, 0xa2, 0xd0, 0xa3,
         0xd0, 0xa4, 0xd0, 0xa5, 0xd0, 0xa6, 0xd0, 0xa7, 0xd0, 0xa8, 0xd0, 0xa9,
         0xd0, 0xaa, 0xd0, 0xab, 0xd0, 0xac, 0xd0, 0xad, 0xd0, 0xae, 0xd0, 0xaf,
         0xd0, 0xb0, 0xd0, 0xb1, 0xd0, 0xb2, 0xd0, 0xb3, 0xd0, 0xb4, 0xd0, 0xb5,
         0xd0, 0xb6, 0xd0, 0xb7, 0xd0, 0xb8, 0xd0, 0xb9, 0xd0, 0xba, 0xd0, 0xbb,
         0xd0, 0xbc, 0xd0, 0xbd, 0xd0, 0xbe, 0xd0, 0xbf},
        {0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409,
         0x40a, 0x40b, 0x40c, 0x40d, 0x40e, 0x40f, 0x410, 0x411, 0x412, 0x413,
         0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d,
         0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
         0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431,
         0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43a, 0x43b,
         0x43c, 0x43d, 0x43e, 0x43f}},

       // Lonely first bytes:
       // All 32 first bytes of 2-byte sequences, each followed by a space
       // (generates 32 invalid char + space sequences):
       {{0xc0, 0x20, 0xc1, 0x20, 0xc2, 0x20, 0xc3, 0x20, 0xc4, 0x20, 0xc5,
         0x20, 0xc6, 0x20, 0xc7, 0x20, 0xc8, 0x20, 0xc9, 0x20, 0xca, 0x20,
         0xcb, 0x20, 0xcc, 0x20, 0xcd, 0x20, 0xce, 0x20, 0xcf, 0x20, 0xd0,
         0x20, 0xd1, 0x20, 0xd2, 0x20, 0xd3, 0x20, 0xd4, 0x20, 0xd5, 0x20,
         0xd6, 0x20, 0xd7, 0x20, 0xd8, 0x20, 0xd9, 0x20, 0xda, 0x20, 0xdb,
         0x20, 0xdc, 0x20, 0xdd, 0x20, 0xde, 0x20, 0xdf, 0x20},
        {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20}},
       // All 16 first bytes of 3-byte sequences, each followed by a space
       // (generates 16 invalid char + space sequences):
       {{0xe0, 0x20, 0xe1, 0x20, 0xe2, 0x20, 0xe3, 0x20, 0xe4, 0x20, 0xe5,
         0x20, 0xe6, 0x20, 0xe7, 0x20, 0xe8, 0x20, 0xe9, 0x20, 0xea, 0x20,
         0xeb, 0x20, 0xec, 0x20, 0xed, 0x20, 0xee, 0x20, 0xef, 0x20},
        {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
       // All 8 first bytes of 4-byte sequences, each followed by a space
       // (generates 8 invalid char + space sequences):
       {{0xf0, 0x20, 0xf1, 0x20, 0xf2, 0x20, 0xf3, 0x20, 0xf4, 0x20, 0xf5, 0x20,
         0xf6, 0x20, 0xf7, 0x20},
        {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
         0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
       // All 4 first bytes of 5-byte sequences (not supported), each followed by
       // a space (generates 4 invalid char + space sequences):
       {{0xf8, 0x20, 0xf9, 0x20, 0xfa, 0x20, 0xfb, 0x20},
        {0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
       // All 2 first bytes of 6-byte sequences (not supported), each followed by
       // a space (generates 2 invalid char + space sequences):
       {{0xfc, 0x20, 0xfd, 0x20}, {0xfffd, 0x20, 0xfffd, 0x20}},

       // Sequences with last continuation byte missing. Normally the whole
       // incomplete sequence generates a single invalid character (exceptions
       // explained below).

       // 2-byte sequences with last byte missing
       {{0xc0}, {0xfffd}},
       {{0xdf}, {0xfffd}},
       // 3-byte sequences with last byte missing.
       {{0xe8, 0x80}, {0xfffd}},
       {{0xe0, 0xbf}, {0xfffd}},
       {{0xef, 0xbf}, {0xfffd}},
       // Start of an overlong sequence. The first "maximal subpart" is the first
       // byte; it creates an invalid character. Each following byte generates an
       // invalid character too.
       {{0xe0, 0x80}, {0xfffd, 0xfffd}},
       // 4-byte sequences with last byte missing
       {{0xf1, 0x80, 0x80}, {0xfffd}},
       {{0xf4, 0x8f, 0xbf}, {0xfffd}},
       // Start of an overlong sequence. The first "maximal subpart" is the first
       // byte; it creates an invalid character. Each following byte generates an
       // invalid character too.
       {{0xf0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       // 5-byte sequences (not supported) with last byte missing
       {{0xf8, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfb, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6-byte sequences (not supported) with last byte missing
       {{0xfc, 0x80, 0x80, 0x80, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Concatenation of incomplete sequences: above incomplete sequences
       // concatenated.
       {{0xc0, 0xdf, 0xe8, 0x80, 0xe0, 0xbf, 0xef, 0xbf, 0xe0, 0x80,
         0xf1, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xf0, 0x80, 0x80, 0xf8,
         0x80, 0x80, 0x80, 0xfb, 0xbf, 0xbf, 0xbf, 0xfc, 0x80, 0x80,
         0x80, 0x80, 0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Incomplete sequence tests repeated with a space after the incomplete
       // sequence.

       // 2-byte sequences with last byte missing
       {{0xc0, 0x20}, {0xfffd, 0x20}},
       {{0xdf, 0x20}, {0xfffd, 0x20}},
       // 3-byte sequences with last byte missing
       {{0xe8, 0x80, 0x20}, {0xfffd, 0x20}},
       {{0xe0, 0xbf, 0x20}, {0xfffd, 0x20}},
       {{0xef, 0xbf, 0x20}, {0xfffd, 0x20}},
       // Start of overlong 3-byte sequence with last byte missing
       {{0xe0, 0x80, 0x20}, {0xfffd, 0xfffd, 0x20}},
       // 4-byte sequences with last byte missing
       {{0xf1, 0x80, 0x80, 0x20}, {0xfffd, 0x20}},
       {{0xf4, 0x8f, 0xbf, 0x20}, {0xfffd, 0x20}},
       // Start of overlong 4-byte sequence with last byte missing
       {{0xf0, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0x20}},
       // 5-byte sequences (not supported) with last byte missing
       {{0xf8, 0x80, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
       {{0xfb, 0xbf, 0xbf, 0xbf, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
       // 6-byte sequences (not supported) with last byte missing
       {{0xfc, 0x80, 0x80, 0x80, 0x80, 0x20},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
       {{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0x20},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},

       // Impossible bytes
       {{0xfe}, {0xfffd}},
       {{0xff}, {0xfffd}},
       {{0xfe, 0xfe, 0xff, 0xff}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // Lead-byte-like bytes which aren't valid lead bytes.
       {{0xc0}, {0xfffd}},
       {{0xc0, 0xaa}, {0xfffd, 0xfffd}},
       {{0xc1}, {0xfffd}},
       {{0xc1, 0xaa}, {0xfffd, 0xfffd}},
       {{0xf5}, {0xfffd}},
       {{0xf5, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xf6}, {0xfffd}},
       {{0xf6, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xf7}, {0xfffd}},
       {{0xf7, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xf8}, {0xfffd}},
       {{0xf8, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xf9}, {0xfffd}},
       {{0xf9, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfa}, {0xfffd}},
       {{0xfa, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfb}, {0xfffd}},
       {{0xfb, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfc}, {0xfffd}},
       {{0xfc, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfd}, {0xfffd}},
       {{0xfd, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xfe}, {0xfffd}},
       {{0xfe, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xff}, {0xfffd}},
       {{0xff, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Overlong sequences:

       // Overlong encodings for "/"
       {{0xc0, 0xaf}, {0xfffd, 0xfffd}},
       {{0xe0, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xf0, 0x80, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 5-byte sequence (not supported anyway)
       {{0xf8, 0x80, 0x80, 0x80, 0xaf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6-byte sequence (not supported anyway)
       {{0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Maximum overlong sequences
       {{0xc1, 0xbf}, {0xfffd, 0xfffd}},
       {{0xe0, 0x9f, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xf0, 0x8f, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 5-byte sequence (not supported anyway)
       {{0xf8, 0x87, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6-byte sequence (not supported anyway)
       {{0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Overlong encodings for 0
       {{0xc0, 0x80}, {0xfffd, 0xfffd}},
       {{0xe0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xf0, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 5-byte sequence (not supported anyway)
       {{0xf8, 0x80, 0x80, 0x80, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       // 6-byte sequence (not supported anyway)
       {{0xfc, 0x80, 0x80, 0x80, 0x80, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Illegal code positions:

       // Single UTF-16 surrogates
       {{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xad, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xae, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xaf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xb0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xbe, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},

       // Paired surrogates
       {{0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
       {{0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf},
        {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

       // Surrogates with the last byte missing.
       {{0xed, 0xa0}, {0xfffd, 0xfffd}},
       {{0xed, 0xa0}, {0xfffd, 0xfffd}},
       {{0xed, 0xad}, {0xfffd, 0xfffd}},
       {{0xed, 0xae}, {0xfffd, 0xfffd}},
       {{0xed, 0xaf}, {0xfffd, 0xfffd}},
       {{0xed, 0xb0}, {0xfffd, 0xfffd}},
       {{0xed, 0xbe}, {0xfffd, 0xfffd}},
       {{0xed, 0xbf}, {0xfffd, 0xfffd}},

       // Other non-characters
       {{0xef, 0xbf, 0xbe}, {0xfffe}},
       {{0xef, 0xbf, 0xbf}, {0xffff}},
       {{0xef, 0xb7, 0x90, 0xef, 0xb7, 0x91, 0xef, 0xb7, 0x92, 0xef, 0xb7, 0x93,
         0xef, 0xb7, 0x94, 0xef, 0xb7, 0x95, 0xef, 0xb7, 0x96, 0xef, 0xb7, 0x97,
         0xef, 0xb7, 0x98, 0xef, 0xb7, 0x99, 0xef, 0xb7, 0x9a, 0xef, 0xb7, 0x9b,
         0xef, 0xb7, 0x9c, 0xef, 0xb7, 0x9d, 0xef, 0xb7, 0x9e, 0xef, 0xb7, 0x9f,
         0xef, 0xb7, 0xa0, 0xef, 0xb7, 0xa1, 0xef, 0xb7, 0xa2, 0xef, 0xb7, 0xa3,
         0xef, 0xb7, 0xa4, 0xef, 0xb7, 0xa5, 0xef, 0xb7, 0xa6, 0xef, 0xb7, 0xa7,
         0xef, 0xb7, 0xa8, 0xef, 0xb7, 0xa9, 0xef, 0xb7, 0xaa, 0xef, 0xb7, 0xab,
         0xef, 0xb7, 0xac, 0xef, 0xb7, 0xad, 0xef, 0xb7, 0xae, 0xef, 0xb7, 0xaf},
        {0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7,
         0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf,
         0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7,
         0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef}},
       {{0xf0, 0x9f, 0xbf, 0xbe, 0xf0, 0x9f, 0xbf, 0xbf, 0xf0, 0xaf, 0xbf,
         0xbe, 0xf0, 0xaf, 0xbf, 0xbf, 0xf0, 0xbf, 0xbf, 0xbe, 0xf0, 0xbf,
         0xbf, 0xbf, 0xf1, 0x8f, 0xbf, 0xbe, 0xf1, 0x8f, 0xbf, 0xbf, 0xf1,
         0x9f, 0xbf, 0xbe, 0xf1, 0x9f, 0xbf, 0xbf, 0xf1, 0xaf, 0xbf, 0xbe,
         0xf1, 0xaf, 0xbf, 0xbf, 0xf1, 0xbf, 0xbf, 0xbe, 0xf1, 0xbf, 0xbf,
         0xbf, 0xf2, 0x8f, 0xbf, 0xbe, 0xf2, 0x8f, 0xbf, 0xbf},
        {0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, 0x4fffe, 0x4ffff,
         0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, 0x8fffe,
         0x8ffff}},
   };

   // Iterate by const reference: copying a TestCase would duplicate both of
   // its vectors on every iteration for no benefit.
   for (const auto& test : data) {
     // For figuring out which test fails:
     fprintf(stderr, "test: ");
     for (auto b : test.bytes) {
       fprintf(stderr, "%x ", b);
     }
     fprintf(stderr, "\n");

     // Non-incremental decoder: same number of values, then same values.
     std::vector<unibrow::uchar> output_normal;
     DecodeNormally(test.bytes, &output_normal);

     CHECK_EQ(output_normal.size(), test.unicode_expected.size());
     for (size_t i = 0; i < output_normal.size(); ++i) {
       CHECK_EQ(output_normal[i], test.unicode_expected[i]);
     }

     // Incremental decoder must match the very same expectation.
     std::vector<unibrow::uchar> output_incremental;
     DecodeIncrementally(test.bytes, &output_incremental);

     CHECK_EQ(output_incremental.size(), test.unicode_expected.size());
     for (size_t i = 0; i < output_incremental.size(); ++i) {
       CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
     }
   }
 }

 }  // namespace internal
 }  // namespace v8
	// Copyright 2016 the V8 project authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include <memory>
	#include <string>
	#include <vector>

	#include "src/unicode-decoder.h"
	#include "src/unicode-inl.h"
	#include "testing/gtest/include/gtest/gtest.h"

	namespace v8 {
	namespace internal {

	namespace {

	using Utf8Decoder = unibrow::Utf8Decoder<512>;

	void Decode(Utf8Decoder* decoder, const std::string& str) {
	  // Copy the input into a heap allocation of exactly the same length so
	  // that AddressSanitizer's heap-buffer-overflow detection can observe any
	  // read past the end of the bytes.
	  const size_t byte_count = str.length();
	  std::unique_ptr<char[]> heap_copy(new char[byte_count]);
	  memcpy(heap_copy.get(), str.data(), byte_count);
	  decoder->Reset(heap_copy.get(), byte_count);
	}

	// Decodes |bytes| with the non-incremental decoder, unibrow::Utf8::ValueOf,
	// and appends every value it returns to |output|. The loop relies on ValueOf
	// advancing |cursor| on each call.
	void DecodeNormally(const std::vector<byte>& bytes,
	                    std::vector<unibrow::uchar>* output) {
	  size_t cursor = 0;
	  while (cursor < bytes.size()) {
	    // The data pointer is already advanced by |cursor|, so only
	    // |bytes.size() - cursor| bytes remain readable from it. Pass that
	    // remaining count instead of the full size so the decoder is never told
	    // it may look past the end of the vector.
	    // NOTE(review): assumes the length argument counts bytes starting at the
	    // given pointer -- confirm against the declaration of Utf8::ValueOf.
	    const size_t remaining = bytes.size() - cursor;
	    output->push_back(
	        unibrow::Utf8::ValueOf(bytes.data() + cursor, remaining, &cursor));
	  }
	}

	void DecodeIncrementally(const std::vector<byte>& bytes,
	std::vector<unibrow::uchar>* output) {
	unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
	for (auto b : bytes) {
	unibrow::uchar result = unibrow::Utf8::ValueOfIncremental(b, &buffer);
	if (result != unibrow::Utf8::kIncomplete) {
	output->push_back(result);
	}
	}
	unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&buffer);
	if (result != unibrow::Utf8::kBufferEmpty) {
	output->push_back(result);
	}
	}

	} // namespace

	TEST(UnicodeTest, ReadOffEndOfUtf8String) {
	  // Each input ends before it has supplied enough continuation bytes.
	  const char* const kTruncatedInputs[] = {"\xE0", "\xED", "\xF0", "\xF4"};

	  Utf8Decoder decoder;
	  for (const char* input : kTruncatedInputs) {
	    Decode(&decoder, input);
	  }
	}

	TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
	// Unfortunately, V8 has two UTF-8 decoders. This test checks that they
	// produce the same result. This test was inspired by
	// https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt .
	typedef struct {
	std::vector<byte> bytes;
	std::vector<unibrow::uchar> unicode_expected;
	} TestCase;

	TestCase data[] = {
	// Correct UTF-8 text.
	{{0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5},
	{0x3ba, 0x1f79, 0x3c3, 0x3bc, 0x3b5}},

	// First possible sequence of a certain length:
	// 1 byte
	{{0x00}, {0x0}},
	// 2 bytes
	{{0xc2, 0x80}, {0x80}},
	// 3 bytes
	{{0xe0, 0xa0, 0x80}, {0x800}},
	// 4 bytes
	{{0xf0, 0x90, 0x80, 0x80}, {0x10000}},
	// 5 bytes (not supported)
	{{0xf8, 0x88, 0x80, 0x80, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6 bytes (not supported)
	{{0xfc, 0x84, 0x80, 0x80, 0x80, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Last possible sequence of certain length:
	// 1 byte
	{{0x7f}, {0x7f}},
	// 2 bytes
	{{0xdf, 0xbf}, {0x7ff}},
	// 3 bytes
	{{0xef, 0xbf, 0xbf}, {0xffff}},
	// 4 bytes (this sequence is not a valid code point)
	{{0xf7, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 5 bytes (not supported)
	{{0xfb, 0xbf, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6 bytes (not supported)
	{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// Other boundary conditions:
	{{0xed, 0x9f, 0xbf}, {0xd7ff}},
	{{0xee, 0x80, 0x80}, {0xe000}},
	// U+fffd (invalid code point)
	{{0xef, 0xbf, 0xbd}, {0xfffd}},
	// U+10ffff (last valid code point)
	{{0xf4, 0x8f, 0xbf, 0xbf}, {0x10ffff}},
	// First invalid (too large) code point
	{{0xf4, 0x90, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Malformed sequences:
	// Unexpected continuation bytes:
	// First continuation byte
	{{0x80}, {0xfffd}},
	// Last continuation byte
	{{0xbf}, {0xfffd}},
	// 2 continuation bytes
	{{0x80, 0xbf}, {0xfffd, 0xfffd}},
	// 3 continuation bytes
	{{0x80, 0xbf, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	// 4 continuation bytes
	{{0x80, 0xbf, 0x80, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 5 continuation bytes
	{{0x80, 0xbf, 0x80, 0xbf, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6 continuation bytes
	{{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 7 continuation bytes
	{{0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// Sequence of all 64 possible continuation bytes
	{{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
	0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
	0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0,
	0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
	0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
	0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// Using each possible continuation byte in a two-byte sequence:
	{{0xd0, 0x80, 0xd0, 0x81, 0xd0, 0x82, 0xd0, 0x83, 0xd0, 0x84, 0xd0, 0x85,
	0xd0, 0x86, 0xd0, 0x87, 0xd0, 0x88, 0xd0, 0x89, 0xd0, 0x8a, 0xd0, 0x8b,
	0xd0, 0x8c, 0xd0, 0x8d, 0xd0, 0x8e, 0xd0, 0x8f, 0xd0, 0x90, 0xd0, 0x91,
	0xd0, 0x92, 0xd0, 0x93, 0xd0, 0x94, 0xd0, 0x95, 0xd0, 0x96, 0xd0, 0x97,
	0xd0, 0x98, 0xd0, 0x99, 0xd0, 0x9a, 0xd0, 0x9b, 0xd0, 0x9c, 0xd0, 0x9d,
	0xd0, 0x9e, 0xd0, 0x9f, 0xd0, 0xa0, 0xd0, 0xa1, 0xd0, 0xa2, 0xd0, 0xa3,
	0xd0, 0xa4, 0xd0, 0xa5, 0xd0, 0xa6, 0xd0, 0xa7, 0xd0, 0xa8, 0xd0, 0xa9,
	0xd0, 0xaa, 0xd0, 0xab, 0xd0, 0xac, 0xd0, 0xad, 0xd0, 0xae, 0xd0, 0xaf,
	0xd0, 0xb0, 0xd0, 0xb1, 0xd0, 0xb2, 0xd0, 0xb3, 0xd0, 0xb4, 0xd0, 0xb5,
	0xd0, 0xb6, 0xd0, 0xb7, 0xd0, 0xb8, 0xd0, 0xb9, 0xd0, 0xba, 0xd0, 0xbb,
	0xd0, 0xbc, 0xd0, 0xbd, 0xd0, 0xbe, 0xd0, 0xbf},
	{0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409,
	0x40a, 0x40b, 0x40c, 0x40d, 0x40e, 0x40f, 0x410, 0x411, 0x412, 0x413,
	0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41a, 0x41b, 0x41c, 0x41d,
	0x41e, 0x41f, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
	0x428, 0x429, 0x42a, 0x42b, 0x42c, 0x42d, 0x42e, 0x42f, 0x430, 0x431,
	0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43a, 0x43b,
	0x43c, 0x43d, 0x43e, 0x43f}},

	// Lonely first bytes:
	// All 32 first bytes of 32-byte sequences, each followed by a space
	// (generates 32 invalid char + space sequences.
	{{0xc0, 0x20, 0xc1, 0x20, 0xc2, 0x20, 0xc3, 0x20, 0xc4, 0x20, 0xc5,
	0x20, 0xc6, 0x20, 0xc7, 0x20, 0xc8, 0x20, 0xc9, 0x20, 0xca, 0x20,
	0xcb, 0x20, 0xcc, 0x20, 0xcd, 0x20, 0xce, 0x20, 0xcf, 0x20, 0xd0,
	0x20, 0xd1, 0x20, 0xd2, 0x20, 0xd3, 0x20, 0xd4, 0x20, 0xd5, 0x20,
	0xd6, 0x20, 0xd7, 0x20, 0xd8, 0x20, 0xd9, 0x20, 0xda, 0x20, 0xdb,
	0x20, 0xdc, 0x20, 0xdd, 0x20, 0xde, 0x20, 0xdf, 0x20},
	{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20}},
	// All 16 first bytes of 3-byte sequences, each followed by a space
	// (generates 16 invalid char + space sequences):
	{{0xe0, 0x20, 0xe1, 0x20, 0xe2, 0x20, 0xe3, 0x20, 0xe4, 0x20, 0xe5,
	0x20, 0xe6, 0x20, 0xe7, 0x20, 0xe8, 0x20, 0xe9, 0x20, 0xea, 0x20,
	0xeb, 0x20, 0xec, 0x20, 0xed, 0x20, 0xee, 0x20, 0xef, 0x20},
	{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
	// All 8 first bytes of 4-byte sequences, each followed by a space
	// (generates 8 invalid char + space sequences):
	{{0xf0, 0x20, 0xf1, 0x20, 0xf2, 0x20, 0xf3, 0x20, 0xf4, 0x20, 0xf5, 0x20,
	0xf6, 0x20, 0xf7, 0x20},
	{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20,
	0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
	// All 4 first bytes of 5-byte sequences (not supported), each followed by
	// a space (generates 4 invalid char + space sequences):
	{{0xf8, 0x20, 0xf9, 0x20, 0xfa, 0x20, 0xfb, 0x20},
	{0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20, 0xfffd, 0x20}},
	// All 2 first bytes of 6-byte sequences (not supported), each followed by
	// a space (generates 2 invalid char + space sequences):
	{{0xfc, 0x20, 0xfd, 0x20}, {0xfffd, 0x20, 0xfffd, 0x20}},

	// Sequences with last continuation byte missing. Normally the whole
	// incomplete sequence generates a single invalid character (exceptions
	// explained below).

	// 2-byte sequences with last byte missing
	{{0xc0}, {0xfffd}},
	{{0xdf}, {0xfffd}},
	// 3-byte sequences with last byte missing.
	{{0xe8, 0x80}, {0xfffd}},
	{{0xe0, 0xbf}, {0xfffd}},
	{{0xef, 0xbf}, {0xfffd}},
	// Start of an overlong sequence. The first "maximal subpart" is the first
	// byte; it creates an invalid character. Each following byte generates an
	// invalid character too.
	{{0xe0, 0x80}, {0xfffd, 0xfffd}},
	// 4-byte sequences with last byte missing
	{{0xf1, 0x80, 0x80}, {0xfffd}},
	{{0xf4, 0x8f, 0xbf}, {0xfffd}},
	// Start of an overlong sequence. The first "maximal subpart" is the first
	// byte; it creates an invalid character. Each following byte generates an
	// invalid character too.
	{{0xf0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	// 5-byte sequences (not supported) with last byte missing
	{{0xf8, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfb, 0xbf, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6-byte sequences (not supported) with last byte missing
	{{0xfc, 0x80, 0x80, 0x80, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Concatenation of incomplete sequences: above incomplete sequences
	// concatenated.
	{{0xc0, 0xdf, 0xe8, 0x80, 0xe0, 0xbf, 0xef, 0xbf, 0xe0, 0x80,
	0xf1, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xf0, 0x80, 0x80, 0xf8,
	0x80, 0x80, 0x80, 0xfb, 0xbf, 0xbf, 0xbf, 0xfc, 0x80, 0x80,
	0x80, 0x80, 0xfd, 0xbf, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
	0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Incomplete sequence tests repeated with a space after the incomplete
	// sequence.

	// 2-byte sequences with last byte missing
	{{0xc0, 0x20}, {0xfffd, 0x20}},
	{{0xdf, 0x20}, {0xfffd, 0x20}},
	// 3-byte sequences with last byte missing
	{{0xe8, 0x80, 0x20}, {0xfffd, 0x20}},
	{{0xe0, 0xbf, 0x20}, {0xfffd, 0x20}},
	{{0xef, 0xbf, 0x20}, {0xfffd, 0x20}},
	// Start of overlong 3-byte sequence with last byte missing
	{{0xe0, 0x80, 0x20}, {0xfffd, 0xfffd, 0x20}},
	// 4-byte sequences with last byte missing
	{{0xf1, 0x80, 0x80, 0x20}, {0xfffd, 0x20}},
	{{0xf4, 0x8f, 0xbf, 0x20}, {0xfffd, 0x20}},
	// Start of overlong 4-byte sequence with last byte missing
	{{0xf0, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0x20}},
	// 5-byte sequences (not supported) with last byte missing
	{{0xf8, 0x80, 0x80, 0x80, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
	{{0xfb, 0xbf, 0xbf, 0xbf, 0x20}, {0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
	// 6-byte sequences (not supported) with last byte missing
	{{0xfc, 0x80, 0x80, 0x80, 0x80, 0x20},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},
	{{0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0x20},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x20}},

	// Impossible bytes
	{{0xfe}, {0xfffd}},
	{{0xff}, {0xfffd}},
	{{0xfe, 0xfe, 0xff, 0xff}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// Lead-byte-like bytes which aren't valid lead bytes.
	{{0xc0}, {0xfffd}},
	{{0xc0, 0xaa}, {0xfffd, 0xfffd}},
	{{0xc1}, {0xfffd}},
	{{0xc1, 0xaa}, {0xfffd, 0xfffd}},
	{{0xf5}, {0xfffd}},
	{{0xf5, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xf6}, {0xfffd}},
	{{0xf6, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xf7}, {0xfffd}},
	{{0xf7, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xf8}, {0xfffd}},
	{{0xf8, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xf9}, {0xfffd}},
	{{0xf9, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfa}, {0xfffd}},
	{{0xfa, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfb}, {0xfffd}},
	{{0xfb, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfc}, {0xfffd}},
	{{0xfc, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfd}, {0xfffd}},
	{{0xfd, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xfe}, {0xfffd}},
	{{0xfe, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xff}, {0xfffd}},
	{{0xff, 0xaa, 0xaa, 0xaa}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Overlong sequences:

	// Overlong encodings for "/"
	{{0xc0, 0xaf}, {0xfffd, 0xfffd}},
	{{0xe0, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xf0, 0x80, 0x80, 0xaf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 5-byte sequence (not supported anyway)
	{{0xf8, 0x80, 0x80, 0x80, 0xaf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6-byte sequence (not supported anyway)
	{{0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Maximum overlong sequences
	{{0xc1, 0xbf}, {0xfffd, 0xfffd}},
	{{0xe0, 0x9f, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xf0, 0x8f, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 5-byte sequence (not supported anyway)
	{{0xf8, 0x87, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6-byte sequence (not supported anyway)
	{{0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Overlong encodings for 0
	{{0xc0, 0x80}, {0xfffd, 0xfffd}},
	{{0xe0, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xf0, 0x80, 0x80, 0x80}, {0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 5-byte sequence (not supported anyway)
	{{0xf8, 0x80, 0x80, 0x80, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	// 6-byte sequence (not supported anyway)
	{{0xfc, 0x80, 0x80, 0x80, 0x80, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Illegal code positions:

	// Single UTF-16 surrogates
	{{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xa0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xad, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xae, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xaf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xb0, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xbe, 0x80}, {0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xbf, 0xbf}, {0xfffd, 0xfffd, 0xfffd}},

	// Paired surrogates
	{{0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},
	{{0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf},
	{0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd}},

	// Surrogates with the last byte missing.
	{{0xed, 0xa0}, {0xfffd, 0xfffd}},
	{{0xed, 0xa0}, {0xfffd, 0xfffd}},
	{{0xed, 0xad}, {0xfffd, 0xfffd}},
	{{0xed, 0xae}, {0xfffd, 0xfffd}},
	{{0xed, 0xaf}, {0xfffd, 0xfffd}},
	{{0xed, 0xb0}, {0xfffd, 0xfffd}},
	{{0xed, 0xbe}, {0xfffd, 0xfffd}},
	{{0xed, 0xbf}, {0xfffd, 0xfffd}},

	// Other non-characters
	{{0xef, 0xbf, 0xbe}, {0xfffe}},
	{{0xef, 0xbf, 0xbf}, {0xffff}},
	{{0xef, 0xb7, 0x90, 0xef, 0xb7, 0x91, 0xef, 0xb7, 0x92, 0xef, 0xb7, 0x93,
	0xef, 0xb7, 0x94, 0xef, 0xb7, 0x95, 0xef, 0xb7, 0x96, 0xef, 0xb7, 0x97,
	0xef, 0xb7, 0x98, 0xef, 0xb7, 0x99, 0xef, 0xb7, 0x9a, 0xef, 0xb7, 0x9b,
	0xef, 0xb7, 0x9c, 0xef, 0xb7, 0x9d, 0xef, 0xb7, 0x9e, 0xef, 0xb7, 0x9f,
	0xef, 0xb7, 0xa0, 0xef, 0xb7, 0xa1, 0xef, 0xb7, 0xa2, 0xef, 0xb7, 0xa3,
	0xef, 0xb7, 0xa4, 0xef, 0xb7, 0xa5, 0xef, 0xb7, 0xa6, 0xef, 0xb7, 0xa7,
	0xef, 0xb7, 0xa8, 0xef, 0xb7, 0xa9, 0xef, 0xb7, 0xaa, 0xef, 0xb7, 0xab,
	0xef, 0xb7, 0xac, 0xef, 0xb7, 0xad, 0xef, 0xb7, 0xae, 0xef, 0xb7, 0xaf},
	{0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7,
	0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf,
	0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7,
	0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef}},
	{{0xf0, 0x9f, 0xbf, 0xbe, 0xf0, 0x9f, 0xbf, 0xbf, 0xf0, 0xaf, 0xbf,
	0xbe, 0xf0, 0xaf, 0xbf, 0xbf, 0xf0, 0xbf, 0xbf, 0xbe, 0xf0, 0xbf,
	0xbf, 0xbf, 0xf1, 0x8f, 0xbf, 0xbe, 0xf1, 0x8f, 0xbf, 0xbf, 0xf1,
	0x9f, 0xbf, 0xbe, 0xf1, 0x9f, 0xbf, 0xbf, 0xf1, 0xaf, 0xbf, 0xbe,
	0xf1, 0xaf, 0xbf, 0xbf, 0xf1, 0xbf, 0xbf, 0xbe, 0xf1, 0xbf, 0xbf,
	0xbf, 0xf2, 0x8f, 0xbf, 0xbe, 0xf2, 0x8f, 0xbf, 0xbf},
	{0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff, 0x4fffe, 0x4ffff,
	0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff, 0x8fffe,
	0x8ffff}},
	};

	for (auto test : data) {
	// For figuring out which test fails:
	fprintf(stderr, "test: ");
	for (auto b : test.bytes) {
	fprintf(stderr, "%x ", b);
	}
	fprintf(stderr, "\n");

	std::vector<unibrow::uchar> output_normal;
	DecodeNormally(test.bytes, &output_normal);

	CHECK_EQ(output_normal.size(), test.unicode_expected.size());
	for (size_t i = 0; i < output_normal.size(); ++i) {
	CHECK_EQ(output_normal[i], test.unicode_expected[i]);
	}

	std::vector<unibrow::uchar> output_incremental;
	DecodeIncrementally(test.bytes, &output_incremental);

	CHECK_EQ(output_incremental.size(), test.unicode_expected.size());
	for (size_t i = 0; i < output_incremental.size(); ++i) {
	CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
	}
	}
	}

	} // namespace internal
	} // namespace v8