| // Copyright 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // All data that is passed through a WebSocket with type "Text" needs to be |
| // validated as UTF8. Since this is done on the IO thread, it needs to be |
| // reasonably fast. |
| |
| // We are only interested in the performance on valid UTF8. Invalid UTF8 will |
| // result in a connection failure, so is unlikely to become a source of |
| // performance issues. |
| |
| #include "base/i18n/streaming_utf8_validator.h" |
| |
| #include <string> |
| |
| #include "base/bind.h" |
| #include "base/callback.h" |
| #include "base/macros.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "base/test/perf_time_logger.h" |
| #include "starboard/types.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| |
| namespace base { |
| namespace { |
| |
// Inclusive ranges of valid UTF-8 sequences used to build the test input.
// Each range is meant to be wide enough that the validator has to do
// meaningful work, while still being in some sense "realistic" (e.g. control
// characters are left out).

// 1-byte sequences.
constexpr char kOneByteSeqRangeStart[] = " ";  // U+0020
constexpr char kOneByteSeqRangeEnd[] = "~";    // U+007E

// 2-byte sequences.
constexpr char kTwoByteSeqRangeStart[] = "\xc2\xa0";  // U+00A0 non-breaking space
constexpr char kTwoByteSeqRangeEnd[] = "\xc9\x8f";    // U+024F small y with stroke

// 3-byte sequences.
constexpr char kThreeByteSeqRangeStart[] = "\xe3\x81\x82";  // U+3042 Hiragana "a"
constexpr char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83";    // U+9FC3 "to blink"

// 4-byte sequences.
constexpr char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b";  // U+2000B
constexpr char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2";    // U+2A6B2
| |
// Target lengths, in bytes, of the strings each test is run against:
// 1 B, 32 B, 256 B, 32 KiB and 1 MiB.
constexpr size_t kTestLengths[] = {1, 1 << 5, 1 << 8, 1 << 15, 1 << 20};
| |
// Baseline for comparison: the simplest possible byte-at-a-time check.
// Returns true iff no byte of |s| has its top bit set. It is only run on
// 1-byte UTF-8 sequences, because the result is not meaningful for input
// that contains top-bit-set bytes.
bool IsString7Bit(const std::string& s) {
  for (const char byte : s) {
    const bool top_bit_set = (byte & 0x80) != 0;
    if (top_bit_set)
      return false;
  }
  return true;
}
| |
| // Assumes that |previous| is a valid UTF-8 sequence, and attempts to return |
| // the next one. Is just barely smart enough to iterate through the ranges |
| // defined about. |
| std::string NextUtf8Sequence(const std::string& previous) { |
| DCHECK(StreamingUtf8Validator::Validate(previous)); |
| std::string next = previous; |
| for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) { |
| // All bytes in a UTF-8 sequence except the first one are |
| // constrained to the range 0x80 to 0xbf, inclusive. When we |
| // increment past 0xbf, we carry into the previous byte. |
| if (i > 0 && next[i] == '\xbf') { |
| next[i] = '\x80'; |
| continue; // carry |
| } |
| ++next[i]; |
| break; // no carry |
| } |
| DCHECK(StreamingUtf8Validator::Validate(next)) |
| << "Result \"" << next << "\" failed validation"; |
| return next; |
| } |
| |
// Signature shared by every validation function that is benchmarked: takes
// the string to check and returns whether it was accepted.
using TestTargetType = bool (*)(const std::string&);
| |
| // Run fuction |target| over |test_string| |times| times, and report the results |
| // using |description|. |
| bool RunTest(const std::string& description, |
| TestTargetType target, |
| const std::string& test_string, |
| int times) { |
| base::PerfTimeLogger timer(description.c_str()); |
| bool result = true; |
| for (int i = 0; i < times; ++i) { |
| result = target(test_string) && result; |
| } |
| timer.Done(); |
| return result; |
| } |
| |
// Returns |input| repeated the smallest whole number of times that makes the
// result at least |length| bytes long. At least one copy is always produced,
// so a |length| of 0 yields |input| itself.
//
// An empty |input| can never reach a non-zero |length|; an empty string is
// returned in that case instead of looping forever.
std::string ConstructRepeatedTestString(const std::string& input,
                                        size_t length) {
  const size_t unit = input.length();
  if (unit == 0)
    return std::string();

  // ceil(length / unit), written without an addition that could overflow.
  size_t copies = length / unit + (length % unit != 0 ? 1 : 0);
  if (copies == 0)
    copies = 1;

  std::string output;
  // The final size is known up front, so allocate once.
  output.reserve(copies * unit);
  for (size_t i = 0; i < copies; ++i)
    output += input;
  return output;
}
| |
| // Construct a string by expanding the range of UTF-8 sequences |
| // between |input_start| and |input_end|, inclusive, and then |
| // repeating the resulting string until it equals or exceeds |length| |
| // bytes. |input_start| and |input_end| must be valid UTF-8 |
| // sequences. |
| std::string ConstructRangedTestString(const std::string& input_start, |
| const std::string& input_end, |
| size_t length) { |
| std::string output = input_start; |
| std::string input = input_start; |
| while (output.length() < length && input != input_end) { |
| input = NextUtf8Sequence(input); |
| output += input; |
| } |
| if (output.length() < length) { |
| output = ConstructRepeatedTestString(output, length); |
| } |
| return output; |
| } |
| |
| struct TestFunctionDescription { |
| TestTargetType function; |
| const char* function_name; |
| }; |
| |
| bool IsStringUTF8(const std::string& str) { |
| return base::IsStringUTF8(base::StringPiece(str)); |
| } |
| |
| // IsString7Bit is intentionally placed last so it can be excluded easily. |
| const TestFunctionDescription kTestFunctions[] = { |
| {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"}, |
| {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}}; |
| |
| // Construct a test string from |construct_test_string| for each of the lengths |
| // in |kTestLengths| in turn. For each string, run each test in |test_functions| |
| // for a number of iterations such that the total number of bytes validated |
| // is around 16MB. |
| void RunSomeTests( |
| const char format[], |
| base::Callback<std::string(size_t length)> construct_test_string, |
| const TestFunctionDescription* test_functions, |
| size_t test_count) { |
| for (size_t i = 0; i < arraysize(kTestLengths); ++i) { |
| const size_t length = kTestLengths[i]; |
| const std::string test_string = construct_test_string.Run(length); |
| const int real_length = static_cast<int>(test_string.length()); |
| const int times = (1 << 24) / real_length; |
| for (size_t test_index = 0; test_index < test_count; ++test_index) { |
| EXPECT_TRUE(RunTest(StringPrintf(format, |
| test_functions[test_index].function_name, |
| real_length, |
| times), |
| test_functions[test_index].function, |
| test_string, |
| times)); |
| } |
| } |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) { |
| RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d", |
| base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart), |
| kTestFunctions, |
| 3); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) { |
| RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d", |
| base::Bind(ConstructRangedTestString, |
| kOneByteSeqRangeStart, |
| kOneByteSeqRangeEnd), |
| kTestFunctions, |
| 3); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) { |
| RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d", |
| base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart), |
| kTestFunctions, |
| 2); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) { |
| RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d", |
| base::Bind(ConstructRangedTestString, |
| kTwoByteSeqRangeStart, |
| kTwoByteSeqRangeEnd), |
| kTestFunctions, |
| 2); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) { |
| RunSomeTests( |
| "%s: bytes=3 repeated length=%d repeat=%d", |
| base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart), |
| kTestFunctions, |
| 2); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) { |
| RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d", |
| base::Bind(ConstructRangedTestString, |
| kThreeByteSeqRangeStart, |
| kThreeByteSeqRangeEnd), |
| kTestFunctions, |
| 2); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) { |
| RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d", |
| base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart), |
| kTestFunctions, |
| 2); |
| } |
| |
| TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) { |
| RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d", |
| base::Bind(ConstructRangedTestString, |
| kFourByteSeqRangeStart, |
| kFourByteSeqRangeEnd), |
| kTestFunctions, |
| 2); |
| } |
| |
| } // namespace |
| } // namespace base |