| // Copyright 2014 The Chromium Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style license that can be | 
 | // found in the LICENSE file. | 
 |  | 
 | #include "base/i18n/streaming_utf8_validator.h" | 
 |  | 
 | #include <stdio.h> | 
 | #include <string.h> | 
 |  | 
 | #include <string> | 
 |  | 
 | #include "base/macros.h" | 
 | #include "base/strings/string_piece.h" | 
 | #include "starboard/string.h" | 
 | #include "testing/gtest/include/gtest/gtest.h" | 
 |  | 
 | // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class | 
 | // accepts exactly the same set of 4-byte strings as ICU-based validation. This | 
 | // tests every possible 4-byte string, so it is too slow to run routinely on | 
 | // low-powered machines. | 
 | // | 
 | // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | 
 |  | 
 | #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | 
 |  | 
 | #include "base/bind.h" | 
 | #include "base/location.h" | 
 | #include "base/logging.h" | 
 | #include "base/memory/ref_counted.h" | 
 | #include "base/strings/string_util.h" | 
 | #include "base/strings/stringprintf.h" | 
 | #include "base/strings/utf_string_conversion_utils.h" | 
 | #include "base/synchronization/lock.h" | 
 | #include "base/task/post_task.h" | 
 | #include "base/task/task_scheduler/task_scheduler.h" | 
 | #include "starboard/memory.h" | 
 | #include "starboard/string.h" | 
 | #include "starboard/types.h" | 
 | #include "third_party/icu/source/common/unicode/utf8.h" | 
 |  | 
 | #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | 
 |  | 
 | namespace base { | 
 | namespace { | 
 |  | 
 | // Local aliases for the validator's result states, so the tests below can | 
 | // name them without the StreamingUtf8Validator:: qualifier. | 
 | constexpr StreamingUtf8Validator::State VALID_ENDPOINT = | 
 |     StreamingUtf8Validator::VALID_ENDPOINT; | 
 | constexpr StreamingUtf8Validator::State VALID_MIDPOINT = | 
 |     StreamingUtf8Validator::VALID_MIDPOINT; | 
 | constexpr StreamingUtf8Validator::State INVALID = | 
 |     StreamingUtf8Validator::INVALID; | 
 |  | 
 | #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | 
 |  | 
 | // Number of consecutive 32-bit values handed to each posted TestRange() task. | 
 | constexpr uint32_t kThoroughTestChunkSize = 1u << 24; | 
 |  | 
 | // Fixture for the exhaustive comparison of StreamingUtf8Validator against an | 
 | // ICU-based reference check. The input space is split into ranges that run as | 
 | // posted tasks; the counters below exist only for progress logging. | 
 | class StreamingUtf8ValidatorThoroughTest : public ::testing::Test { | 
 |  protected: | 
 |   StreamingUtf8ValidatorThoroughTest() | 
 |       : tasks_dispatched_(0), tasks_finished_(0) {} | 
 |  | 
 |   // Reference check: decodes |src| with ICU's U8_NEXT and accepts the buffer | 
 |   // only if every decoded value passes base::IsValidCodepoint(). This is the | 
 |   // same logic as base::IsStringUTF8 except that non-characters are treated | 
 |   // as valid and the input is a raw buffer rather than a string. | 
 |   static bool IsStringUtf8(const char* src, int32_t src_len) { | 
 |     for (int32_t offset = 0; offset < src_len;) { | 
 |       int32_t decoded; | 
 |       // U8_NEXT moves |offset| past the bytes it consumed. | 
 |       U8_NEXT(src, offset, src_len, decoded); | 
 |       if (!base::IsValidCodepoint(decoded)) | 
 |         return false; | 
 |     } | 
 |     return true; | 
 |   } | 
 |  | 
 |   // Copies the in-memory bytes of |n| into a 4-byte buffer and expects | 
 |   // IsStringUtf8() and StreamingUtf8Validator to agree on whether that buffer | 
 |   // is complete, valid UTF-8. | 
 |   void TestNumber(uint32_t n) const { | 
 |     char bytes[sizeof n]; | 
 |     SbMemoryCopy(bytes, &n, sizeof n); | 
 |     const bool reference_valid = IsStringUtf8(bytes, sizeof n); | 
 |     StreamingUtf8Validator validator; | 
 |     const bool streaming_valid = | 
 |         validator.AddBytes(bytes, sizeof n) == VALID_ENDPOINT; | 
 |     EXPECT_EQ(reference_valid, streaming_valid) | 
 |         << "Difference of opinion for \"" | 
 |         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X", | 
 |                               bytes[0] & 0xFF, | 
 |                               bytes[1] & 0xFF, | 
 |                               bytes[2] & 0xFF, | 
 |                               bytes[3] & 0xFF) | 
 |         << "\""; | 
 |   } | 
 |  | 
 |  public: | 
 |   // Runs TestNumber() on each of the |size| consecutive integers starting at | 
 |   // |begin|. Afterwards it records, under |lock_|, that one more task has | 
 |   // finished and logs the progress. Intended to be run as a posted task. | 
 |   void TestRange(uint32_t begin, uint32_t size) { | 
 |     uint32_t value = begin; | 
 |     for (uint32_t remaining = size; remaining != 0; --remaining, ++value) { | 
 |       TestNumber(value); | 
 |     } | 
 |     base::AutoLock auto_lock(lock_); | 
 |     ++tasks_finished_; | 
 |     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_ | 
 |               << " tasks done\n"; | 
 |   } | 
 |  | 
 |  protected: | 
 |   // Guards both task counters. | 
 |   base::Lock lock_; | 
 |   // Number of TestRange() tasks posted so far. | 
 |   int tasks_dispatched_; | 
 |   // Number of TestRange() tasks that have run to completion. | 
 |   int tasks_finished_; | 
 | }; | 
 |  | 
 | // Posts one TestRange() task per chunk until the 32-bit start offset wraps | 
 | // back to zero, i.e. until every uint32_t value has been assigned to a task. | 
 | // Then shuts the scheduler down and joins it before tearing it down. | 
 | TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) { | 
 |   base::TaskScheduler::CreateAndStartWithDefaultParams( | 
 |       "StreamingUtf8ValidatorThoroughTest"); | 
 |   { | 
 |     // Tasks read |tasks_dispatched_| under |lock_| when they finish, so the | 
 |     // lock is held for the whole dispatch loop. | 
 |     base::AutoLock auto_lock(lock_); | 
 |     uint32_t chunk_start = 0; | 
 |     for (bool more = true; more; more = (chunk_start != 0)) { | 
 |       base::PostTaskWithTraits( | 
 |           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN}, | 
 |           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange, | 
 |                          base::Unretained(this), chunk_start, | 
 |                          kThoroughTestChunkSize)); | 
 |       ++tasks_dispatched_; | 
 |       // Unsigned overflow is well defined; wrapping to 0 ends the loop. | 
 |       chunk_start += kThoroughTestChunkSize; | 
 |     } | 
 |   } | 
 |   base::TaskScheduler* const scheduler = base::TaskScheduler::GetInstance(); | 
 |   scheduler->Shutdown(); | 
 |   scheduler->JoinForTesting(); | 
 |   base::TaskScheduler::SetInstance(nullptr); | 
 | } | 
 |  | 
 | #endif  // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | 
 |  | 
 | // These valid and invalid UTF-8 sequences are based on the tests from | 
 | // base/strings/string_util_unittest.cc | 
 |  | 
 | // Complete, well-formed UTF-8 sequences. Each entry must encode exactly one | 
 | // code point, because PartialIterator derives its incomplete inputs from the | 
 | // non-empty prefixes of these strings. | 
 | const char* const valid[] = { | 
 |     "\r",                // U+000D | 
 |     "\n",                // U+000A | 
 |     "a",                 // U+0061 | 
 |     "\xc2\x81",          // U+0081, 2 bytes | 
 |     "\xe1\x80\xbf",      // U+103F, 3 bytes | 
 |     "\xf1\x80\xa0\xbf",  // U+4083F, 4 bytes | 
 |     "\xef\xbb\xbf",      // U+FEFF, the UTF-8 BOM | 
 | }; | 
 |  | 
 | // One-past-the-end pointer, so |valid| can be passed as an iterator range. | 
 | const char* const* const valid_end = valid + arraysize(valid); | 
 |  | 
 | // Byte sequences that are never valid UTF-8. The tests expect the validator | 
 | // to report INVALID once a whole entry has been fed to it, whether on its | 
 | // own or combined with another sequence. | 
 | const char* const invalid[] = { | 
 |     // always invalid bytes | 
 |     "\xc0", "\xc1", | 
 |     "\xf5", "\xf6", "\xf7", | 
 |     "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff", | 
 |     // surrogate code points | 
 |     "\xed\xa0\x80",  // U+D800 | 
 |     "\xed\xa0\x8f",  // U+D80F | 
 |     "\xed\xbf\xbf",  // U+DFFF | 
 |     // | 
 |     // overlong sequences | 
 |     "\xc0\x80",              // U+0000 | 
 |     "\xc1\x80",              // U+0040 "@" | 
 |     "\xc1\x81",              // U+0041 "A" | 
 |     "\xe0\x80\x80",          // U+0000 | 
 |     "\xe0\x82\x80",          // U+0080 | 
 |     "\xe0\x9f\xbf",          // U+07ff | 
 |     "\xf0\x80\x80\x8D",      // U+000D | 
 |     "\xf0\x80\x82\x91",      // U+0091 | 
 |     "\xf0\x80\xa0\x80",      // U+0800 | 
 |     "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM) | 
 |     "\xf8\x80\x80\x80\xbf",  // U+003F | 
 |     "\xfc\x80\x80\x80\xa0\xa5",  // U+0825 | 
 |     // | 
 |     // Beyond U+10FFFF | 
 |     "\xf4\x90\x80\x80",          // U+110000 | 
 |     "\xf8\xa0\xbf\x80\xbf",      // 5 bytes | 
 |     "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes | 
 |     // | 
 |     // BOMs in UTF-16(BE|LE) | 
 |     "\xfe\xff", "\xff\xfe", | 
 | }; | 
 |  | 
 | // One-past-the-end pointer, so |invalid| can be passed as an iterator range. | 
 | const char* const* const invalid_end = invalid + arraysize(invalid); | 
 |  | 
 | // Minimal forward-style iterator (pre-increment, dereference, equality) over | 
 | // the non-empty prefixes of the elements of |valid| that are shorter than the | 
 | // element itself. Elements are visited in array order, prefixes from shortest | 
 | // to longest; one-byte elements contribute nothing. | 
 | class PartialIterator { | 
 |  public: | 
 |   // Default construction yields the first position, i.e. the equivalent of | 
 |   // begin(). | 
 |   PartialIterator() : index_(0), prefix_length_(0) { Advance(); } | 
 |   // This is a value type: the destructor, copy constructor and assignment | 
 |   // operator are the compiler-generated ones. | 
 |  | 
 |   // The past-the-end position, which is the state Advance() leaves behind | 
 |   // once it has moved beyond the last element of |valid|. | 
 |   static PartialIterator end() { return PartialIterator(arraysize(valid), 1); } | 
 |  | 
 |   PartialIterator& operator++() { | 
 |     Advance(); | 
 |     return *this; | 
 |   } | 
 |  | 
 |   // Returns the current prefix as a view into the |valid| element. | 
 |   base::StringPiece operator*() const { | 
 |     const char* const element = valid[index_]; | 
 |     return base::StringPiece(element, prefix_length_); | 
 |   } | 
 |  | 
 |   bool operator==(const PartialIterator& rhs) const { | 
 |     if (index_ != rhs.index_) | 
 |       return false; | 
 |     return prefix_length_ == rhs.prefix_length_; | 
 |   } | 
 |  | 
 |   bool operator!=(const PartialIterator& rhs) const { return !(*this == rhs); } | 
 |  | 
 |  private: | 
 |   // Used only by end() to build an arbitrary position. | 
 |   PartialIterator(size_t index, size_t prefix_length) | 
 |       : index_(index), prefix_length_(prefix_length) {} | 
 |  | 
 |   // Moves to the next prefix. First lengthens the prefix within the current | 
 |   // element; then, while the prefix would be the whole element, moves on to | 
 |   // the one-byte prefix of the following element. | 
 |   void Advance() { | 
 |     const size_t element_count = arraysize(valid); | 
 |     if (index_ < element_count && | 
 |         prefix_length_ < SbStringGetLength(valid[index_])) { | 
 |       ++prefix_length_; | 
 |     } | 
 |     for (; index_ < element_count && | 
 |            prefix_length_ == SbStringGetLength(valid[index_]); | 
 |          ++index_) { | 
 |       prefix_length_ = 1; | 
 |     } | 
 |   } | 
 |  | 
 |   // Index of the current element within the |valid| array. | 
 |   size_t index_; | 
 |   // Number of leading bytes of that element exposed by operator*(). | 
 |   size_t prefix_length_; | 
 | }; | 
 |  | 
 | // Fixture for tests that feed a single UTF-8 sequence (or invalid byte | 
 | // sequence) to a fresh validator and check the resulting state. | 
 | class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test { | 
 |  protected: | 
 |   // Feeds each element of [begin, end) to its own validator in one AddBytes() | 
 |   // call and expects |expected| back. Dereferencing Iterator must yield | 
 |   // something convertible to base::StringPiece. | 
 |   template <typename Iterator> | 
 |   void CheckRange(Iterator begin, | 
 |                   Iterator end, | 
 |                   StreamingUtf8Validator::State expected) { | 
 |     for (; begin != end; ++begin) { | 
 |       const base::StringPiece input = *begin; | 
 |       StreamingUtf8Validator validator; | 
 |       const StreamingUtf8Validator::State actual = | 
 |           validator.AddBytes(input.data(), input.size()); | 
 |       EXPECT_EQ(expected, actual) << "Failed for \"" << input << "\""; | 
 |     } | 
 |   } | 
 |  | 
 |   // Same check as CheckRange(), but each element is fed one byte per | 
 |   // AddBytes() call; only the state after the last byte is compared. Chunking | 
 |   // the input must make no difference to the result. | 
 |   template <typename Iterator> | 
 |   void CheckRangeByteAtATime(Iterator begin, | 
 |                              Iterator end, | 
 |                              StreamingUtf8Validator::State expected) { | 
 |     for (; begin != end; ++begin) { | 
 |       const base::StringPiece input = *begin; | 
 |       StreamingUtf8Validator validator; | 
 |       // Stays VALID_ENDPOINT if |input| is empty and nothing is fed. | 
 |       StreamingUtf8Validator::State state = VALID_ENDPOINT; | 
 |       for (size_t offset = 0; offset < input.size(); ++offset) { | 
 |         state = validator.AddBytes(input.data() + offset, 1); | 
 |       } | 
 |       EXPECT_EQ(expected, state) << "Failed for \"" << input << "\""; | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | // Fixture for tests that feed two byte sequences back to back. | 
 | class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test { | 
 |  protected: | 
 |   // For every pair (a, b) with a from [begin1, end1) and b from | 
 |   // [begin2, end2), feeds a and then b to the validator and expects the state | 
 |   // returned for b to equal |expected|. One validator is reused for all pairs | 
 |   // and Reset() after each, so Reset() is exercised as well. | 
 |   template <typename Iterator1, typename Iterator2> | 
 |   void CheckCombinations(Iterator1 begin1, | 
 |                          Iterator1 end1, | 
 |                          Iterator2 begin2, | 
 |                          Iterator2 end2, | 
 |                          StreamingUtf8Validator::State expected) { | 
 |     StreamingUtf8Validator validator; | 
 |     for (; begin1 != end1; ++begin1) { | 
 |       const base::StringPiece first = *begin1; | 
 |       for (Iterator2 inner = begin2; inner != end2; ++inner) { | 
 |         const base::StringPiece second = *inner; | 
 |         validator.AddBytes(first.data(), first.size()); | 
 |         const StreamingUtf8Validator::State actual = | 
 |             validator.AddBytes(second.data(), second.size()); | 
 |         EXPECT_EQ(expected, actual) | 
 |             << "Failed for \"" << first << second << "\""; | 
 |         validator.Reset(); | 
 |       } | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | // Feeding zero bytes to a fresh validator must report a valid endpoint. | 
 | TEST(StreamingUtf8ValidatorTest, NothingIsValid) { | 
 |   static const char kNothing[] = ""; | 
 |   StreamingUtf8Validator validator; | 
 |   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes(kNothing, 0)); | 
 | } | 
 |  | 
 | // The entries of |valid| must be non-empty and their lengths are measured | 
 | // with a NUL-terminated string length function, so |valid| cannot be used to | 
 | // test the NUL character '\0'. It gets its own test, with an explicit length. | 
 | TEST(StreamingUtf8ValidatorTest, NulIsValid) { | 
 |   static const char kNul[] = "\x00"; | 
 |   StreamingUtf8Validator validator; | 
 |   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes(kNul, 1)); | 
 | } | 
 |  | 
 | // Basic sanity check with plain ASCII before the more elaborate tests. | 
 | TEST(StreamingUtf8ValidatorTest, HelloWorld) { | 
 |   static const char kHelloWorld[] = "Hello, World!"; | 
 |   const size_t length = SbStringGetLength(kHelloWorld); | 
 |   StreamingUtf8Validator validator; | 
 |   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes(kHelloWorld, length)); | 
 | } | 
 |  | 
 | // Once the validator has seen invalid input it keeps reporting INVALID, even | 
 | // for input that would otherwise be fine, until Reset() is called. | 
 | TEST(StreamingUtf8ValidatorTest, ResetWorks) { | 
 |   static const char kInvalidByte[] = "\xC0"; | 
 |   static const char kValidByte[] = "a"; | 
 |   StreamingUtf8Validator validator; | 
 |   EXPECT_EQ(INVALID, validator.AddBytes(kInvalidByte, 1)); | 
 |   EXPECT_EQ(INVALID, validator.AddBytes(kValidByte, 1)); | 
 |   validator.Reset(); | 
 |   EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes(kValidByte, 1)); | 
 | } | 
 |  | 
 | // Each complete sequence, fed in one call, ends on a character boundary. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) { | 
 |   const StreamingUtf8Validator::State expected = VALID_ENDPOINT; | 
 |   CheckRange(valid, valid_end, expected); | 
 | } | 
 |  | 
 | // Each truncated sequence, fed in one call, stops in the middle of a | 
 | // character. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) { | 
 |   const PartialIterator first; | 
 |   const PartialIterator last = PartialIterator::end(); | 
 |   CheckRange(first, last, VALID_MIDPOINT); | 
 | } | 
 |  | 
 | // Each malformed sequence, fed in one call, is rejected. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) { | 
 |   const StreamingUtf8Validator::State expected = INVALID; | 
 |   CheckRange(invalid, invalid_end, expected); | 
 | } | 
 |  | 
 | // Each complete sequence, fed one byte per call, ends on a character | 
 | // boundary. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) { | 
 |   const StreamingUtf8Validator::State expected = VALID_ENDPOINT; | 
 |   CheckRangeByteAtATime(valid, valid_end, expected); | 
 | } | 
 |  | 
 | // Each truncated sequence, fed one byte per call, stops in the middle of a | 
 | // character. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) { | 
 |   const PartialIterator first; | 
 |   const PartialIterator last = PartialIterator::end(); | 
 |   CheckRangeByteAtATime(first, last, VALID_MIDPOINT); | 
 | } | 
 |  | 
 | // Each malformed sequence, fed one byte per call, is rejected. | 
 | TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) { | 
 |   const StreamingUtf8Validator::State expected = INVALID; | 
 |   CheckRangeByteAtATime(invalid, invalid_end, expected); | 
 | } | 
 |  | 
 | // A complete sequence followed by another complete sequence is valid. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) { | 
 |   const StreamingUtf8Validator::State expected = VALID_ENDPOINT; | 
 |   CheckCombinations(valid, valid_end, valid, valid_end, expected); | 
 | } | 
 |  | 
 | // A complete sequence followed by a truncated one stops mid-character. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) { | 
 |   const PartialIterator partial_begin; | 
 |   const PartialIterator partial_end = PartialIterator::end(); | 
 |   CheckCombinations( | 
 |       valid, valid_end, partial_begin, partial_end, VALID_MIDPOINT); | 
 | } | 
 |  | 
 | // A truncated sequence followed by a complete one is rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) { | 
 |   const PartialIterator partial_begin; | 
 |   const PartialIterator partial_end = PartialIterator::end(); | 
 |   CheckCombinations(partial_begin, partial_end, valid, valid_end, INVALID); | 
 | } | 
 |  | 
 | // A truncated sequence followed by another truncated one is rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) { | 
 |   const PartialIterator partial_begin; | 
 |   const PartialIterator partial_end = PartialIterator::end(); | 
 |   CheckCombinations( | 
 |       partial_begin, partial_end, partial_begin, partial_end, INVALID); | 
 | } | 
 |  | 
 | // A complete sequence followed by a malformed one is rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) { | 
 |   const StreamingUtf8Validator::State expected = INVALID; | 
 |   CheckCombinations(valid, valid_end, invalid, invalid_end, expected); | 
 | } | 
 |  | 
 | // A malformed sequence followed by a complete one stays rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) { | 
 |   const StreamingUtf8Validator::State expected = INVALID; | 
 |   CheckCombinations(invalid, invalid_end, valid, valid_end, expected); | 
 | } | 
 |  | 
 | // Two malformed sequences in a row are rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) { | 
 |   const StreamingUtf8Validator::State expected = INVALID; | 
 |   CheckCombinations(invalid, invalid_end, invalid, invalid_end, expected); | 
 | } | 
 |  | 
 | // A malformed sequence followed by a truncated one stays rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) { | 
 |   const PartialIterator partial_begin; | 
 |   const PartialIterator partial_end = PartialIterator::end(); | 
 |   CheckCombinations( | 
 |       invalid, invalid_end, partial_begin, partial_end, INVALID); | 
 | } | 
 |  | 
 | // A truncated sequence followed by a malformed one is rejected. | 
 | TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) { | 
 |   const PartialIterator partial_begin; | 
 |   const PartialIterator partial_end = PartialIterator::end(); | 
 |   CheckCombinations( | 
 |       partial_begin, partial_end, invalid, invalid_end, INVALID); | 
 | } | 
 |  | 
 | // Validate() accepts the empty string. | 
 | TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) { | 
 |   const std::string empty; | 
 |   EXPECT_TRUE(StreamingUtf8Validator::Validate(empty)); | 
 | } | 
 |  | 
 | // Validate() accepts a complete two-byte sequence (U+0081). | 
 | TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) { | 
 |   const std::string input("\xc2\x81"); | 
 |   EXPECT_TRUE(StreamingUtf8Validator::Validate(input)); | 
 | } | 
 |  | 
 | // Validate() rejects an overlong encoding of U+0000. | 
 | TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) { | 
 |   const std::string input("\xc0\x80"); | 
 |   EXPECT_FALSE(StreamingUtf8Validator::Validate(input)); | 
 | } | 
 |  | 
 | // Validate() rejects input that ends in the middle of a character: a lead | 
 | // byte with no continuation byte. | 
 | TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) { | 
 |   const std::string input("\xc2"); | 
 |   EXPECT_FALSE(StreamingUtf8Validator::Validate(input)); | 
 | } | 
 |  | 
 | }  // namespace | 
 | }  // namespace base |