| // Copyright 2014 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Create a state machine for validating UTF-8. The algorithm in brief: |
| // 1. Convert the complete unicode range of code points, except for the |
| // surrogate code points, to an ordered array of sequences of bytes in |
| // UTF-8. |
| // 2. Convert individual bytes to ranges, starting from the right of each byte |
| // sequence. For each range, ensure the bytes on the left and the ranges |
| // on the right are the identical. |
| // 3. Convert the resulting list of ranges into a state machine, collapsing |
| // identical states. |
| // 4. Convert the state machine to an array of bytes. |
| // 5. Output as a C++ file. |
| // |
| // To use: |
| // $ ninja -C out/Release build_utf8_validator_tables |
| // $ out/Release/build_utf8_validator_tables |
| // --output=base/i18n/utf8_validator_tables.cc |
| // $ git add base/i18n/utf8_validator_tables.cc |
| // |
| // Because the table is not expected to ever change, it is checked into the |
| // repository rather than being regenerated at build time. |
| // |
| // This code uses type uint8_t throughout to represent bytes, to avoid |
| // signed/unsigned char confusion. |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include <algorithm> |
| #include <map> |
| #include <string> |
| #include <vector> |
| |
| #include "base/command_line.h" |
| #include "base/files/file_path.h" |
| #include "base/files/file_util.h" |
| #include "base/logging.h" |
| #include "base/memory/raw_ptr.h" |
| #include "base/numerics/safe_conversions.h" |
| #include "base/strings/stringprintf.h" |
| #include "third_party/icu/source/common/unicode/utf8.h" |
| |
| namespace { |
| |
| const char kHelpText[] = |
| "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; |
| |
| const char kProlog[] = |
| "// Copyright 2013 The Chromium Authors\n" |
| "// Use of this source code is governed by a BSD-style license that can " |
| "be\n" |
| "// found in the LICENSE file.\n" |
| "\n" |
| "// This file is auto-generated by build_utf8_validator_tables.\n" |
| "// DO NOT EDIT.\n" |
| "\n" |
| "#include \"base/i18n/utf8_validator_tables.h\"\n" |
| "\n" |
| "namespace base {\n" |
| "namespace internal {\n" |
| "\n" |
| "const uint8_t kUtf8ValidatorTables[] = {\n"; |
| |
| const char kEpilog[] = |
| "};\n" |
| "\n" |
| "const size_t kUtf8ValidatorTablesSize = " |
| "std::size(kUtf8ValidatorTables);\n" |
| "\n" |
| "} // namespace internal\n" |
| "} // namespace base\n"; |
| |
| // Ranges are inclusive at both ends--they represent [from, to] |
| class Range { |
| public: |
| // Ranges always start with just one byte. |
| explicit Range(uint8_t value) : from_(value), to_(value) {} |
| |
| // Range objects are copyable and assignable to be used in STL |
| // containers. Since they only contain non-pointer POD types, the default copy |
| // constructor, assignment operator and destructor will work. |
| |
| // Add a byte to the range. We intentionally only support adding a byte at the |
| // end, since that is the only operation the code needs. |
| void AddByte(uint8_t to) { |
| CHECK(to == to_ + 1); |
| to_ = to; |
| } |
| |
| uint8_t from() const { return from_; } |
| uint8_t to() const { return to_; } |
| |
| bool operator<(const Range& rhs) const { |
| return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); |
| } |
| |
| bool operator==(const Range& rhs) const { |
| return from() == rhs.from() && to() == rhs.to(); |
| } |
| |
| private: |
| uint8_t from_; |
| uint8_t to_; |
| }; |
| |
| // A vector of Ranges is like a simple regular expression--it corresponds to |
| // a set of strings of the same length that have bytes in each position in |
| // the appropriate range. |
| typedef std::vector<Range> StringSet; |
| |
| // A UTF-8 "character" is represented by a sequence of bytes. |
| typedef std::vector<uint8_t> Character; |
| |
| // In the second stage of the algorithm, we want to convert a large list of |
| // Characters into a small list of StringSets. |
| struct Pair { |
| Character character; |
| StringSet set; |
| }; |
| |
| typedef std::vector<Pair> PairVector; |
| |
| // A class to print a table of numbers in the same style as clang-format. |
| class TablePrinter { |
| public: |
| explicit TablePrinter(FILE* stream) |
| : stream_(stream), values_on_this_line_(0), current_offset_(0) {} |
| |
| TablePrinter(const TablePrinter&) = delete; |
| TablePrinter& operator=(const TablePrinter&) = delete; |
| |
| void PrintValue(uint8_t value) { |
| if (values_on_this_line_ == 0) { |
| fputs(" ", stream_); |
| } else if (values_on_this_line_ == kMaxValuesPerLine) { |
| fprintf(stream_.get(), " // 0x%02x\n ", current_offset_); |
| values_on_this_line_ = 0; |
| } |
| fprintf(stream_.get(), " 0x%02x,", static_cast<int>(value)); |
| ++values_on_this_line_; |
| ++current_offset_; |
| } |
| |
| void NewLine() { |
| while (values_on_this_line_ < kMaxValuesPerLine) { |
| fputs(" ", stream_); |
| ++values_on_this_line_; |
| } |
| fprintf(stream_.get(), " // 0x%02x\n", current_offset_); |
| values_on_this_line_ = 0; |
| } |
| |
| private: |
| // stdio stream. Not owned. |
| raw_ptr<FILE> stream_; |
| |
| // Number of values so far printed on this line. |
| int values_on_this_line_; |
| |
| // Total values printed so far. |
| int current_offset_; |
| |
| static const int kMaxValuesPerLine = 8; |
| }; |
| |
| // Start by filling a PairVector with characters. The resulting vector goes from |
| // "\x00" to "\xf4\x8f\xbf\xbf". |
| PairVector InitializeCharacters() { |
| PairVector vector; |
| for (int i = 0; i <= 0x10FFFF; ++i) { |
| if (i >= 0xD800 && i < 0xE000) { |
| // Surrogate codepoints are not permitted. Non-character code points are |
| // explicitly permitted. |
| continue; |
| } |
| uint8_t bytes[4]; |
| unsigned int offset = 0; |
| UBool is_error = false; |
| U8_APPEND(bytes, offset, std::size(bytes), i, is_error); |
| DCHECK(!is_error); |
| DCHECK_GT(offset, 0u); |
| DCHECK_LE(offset, std::size(bytes)); |
| Pair pair = {Character(bytes, bytes + offset), StringSet()}; |
| vector.push_back(pair); |
| } |
| return vector; |
| } |
| |
| // Construct a new Pair from |character| and the concatenation of |new_range| |
| // and |existing_set|, and append it to |pairs|. |
| void ConstructPairAndAppend(const Character& character, |
| const Range& new_range, |
| const StringSet& existing_set, |
| PairVector* pairs) { |
| Pair new_pair = {character, StringSet(1, new_range)}; |
| new_pair.set.insert( |
| new_pair.set.end(), existing_set.begin(), existing_set.end()); |
| pairs->push_back(new_pair); |
| } |
| |
| // Each pass over the PairVector strips one byte off the right-hand-side of the |
| // characters and adds a range to the set on the right. For example, the first |
| // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", |
| // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", |
| // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", |
| // [\xa0-\xbf][\x80-\xbf]). |
| void MoveRightMostCharToSet(PairVector* pairs) { |
| PairVector new_pairs; |
| PairVector::const_iterator it = pairs->begin(); |
| while (it != pairs->end() && it->character.empty()) { |
| new_pairs.push_back(*it); |
| ++it; |
| } |
| CHECK(it != pairs->end()); |
| Character unconverted_bytes(it->character.begin(), it->character.end() - 1); |
| Range new_range(it->character.back()); |
| StringSet converted = it->set; |
| ++it; |
| while (it != pairs->end()) { |
| const Pair& current_pair = *it++; |
| if (current_pair.character.size() == unconverted_bytes.size() + 1 && |
| std::equal(unconverted_bytes.begin(), |
| unconverted_bytes.end(), |
| current_pair.character.begin()) && |
| converted == current_pair.set) { |
| // The particular set of UTF-8 codepoints we are validating guarantees |
| // that each byte range will be contiguous. This would not necessarily be |
| // true for an arbitrary set of UTF-8 codepoints. |
| DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); |
| new_range.AddByte(current_pair.character.back()); |
| continue; |
| } |
| ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); |
| unconverted_bytes = Character(current_pair.character.begin(), |
| current_pair.character.end() - 1); |
| new_range = Range(current_pair.character.back()); |
| converted = current_pair.set; |
| } |
| ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); |
| new_pairs.swap(*pairs); |
| } |
| |
| void MoveAllCharsToSets(PairVector* pairs) { |
| // Since each pass of the function moves one character, and UTF-8 sequences |
| // are at most 4 characters long, this simply runs the algorithm four times. |
| for (int i = 0; i < 4; ++i) { |
| MoveRightMostCharToSet(pairs); |
| } |
| #if DCHECK_IS_ON() |
| for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); |
| ++it) { |
| DCHECK(it->character.empty()); |
| } |
| #endif |
| } |
| |
| // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], |
| // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the |
| // algorithm is working. Use the command-line option |
| // --vmodule=build_utf8_validator_tables=1 to see this output. |
| void LogStringSets(const PairVector& pairs) { |
| for (const auto& pair_it : pairs) { |
| std::string set_as_string; |
| for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end(); |
| ++set_it) { |
| set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", |
| static_cast<int>(set_it->from()), |
| static_cast<int>(set_it->to())); |
| } |
| VLOG(1) << set_as_string; |
| } |
| } |
| |
| // A single state in the state machine is represented by a sorted vector of |
| // start bytes and target states. All input bytes in the range between the start |
| // byte and the next entry in the vector (or 0xFF) result in a transition to the |
| // target state. |
| struct StateRange { |
| uint8_t from; |
| uint8_t target_state; |
| }; |
| |
| typedef std::vector<StateRange> State; |
| |
| // Generates a state where all bytes go to state 1 (invalid). This is also used |
| // as an initialiser for other states (since bytes from outside the desired |
| // range are invalid). |
| State GenerateInvalidState() { |
| const StateRange range = {0, 1}; |
| return State(1, range); |
| } |
| |
| // A map from a state (ie. a set of strings which will match from this state) to |
| // a number (which is an index into the array of states). |
| typedef std::map<StringSet, uint8_t> StateMap; |
| |
| // Create a new state corresponding to |set|, add it |states| and |state_map| |
| // and return the index it was given in |states|. |
| uint8_t MakeState(const StringSet& set, |
| std::vector<State>* states, |
| StateMap* state_map) { |
| DCHECK(!set.empty()); |
| const Range& range = set.front(); |
| const StringSet rest(set.begin() + 1, set.end()); |
| const StateMap::const_iterator where = state_map->find(rest); |
| const uint8_t target_state = where == state_map->end() |
| ? MakeState(rest, states, state_map) |
| : where->second; |
| DCHECK_LT(0, range.from()); |
| DCHECK_LT(range.to(), 0xFF); |
| const StateRange new_state_initializer[] = { |
| {0, 1}, |
| {range.from(), target_state}, |
| {static_cast<uint8_t>(range.to() + 1), 1}}; |
| states->push_back( |
| State(new_state_initializer, |
| new_state_initializer + std::size(new_state_initializer))); |
| const uint8_t new_state_number = |
| base::checked_cast<uint8_t>(states->size() - 1); |
| CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); |
| return new_state_number; |
| } |
| |
| std::vector<State> GenerateStates(const PairVector& pairs) { |
| // States 0 and 1 are the initial/valid state and invalid state, respectively. |
| std::vector<State> states(2, GenerateInvalidState()); |
| StateMap state_map; |
| state_map.insert(std::make_pair(StringSet(), 0)); |
| for (auto it = pairs.begin(); it != pairs.end(); ++it) { |
| DCHECK(it->character.empty()); |
| DCHECK(!it->set.empty()); |
| const Range& range = it->set.front(); |
| const StringSet rest(it->set.begin() + 1, it->set.end()); |
| const StateMap::const_iterator where = state_map.find(rest); |
| const uint8_t target_state = where == state_map.end() |
| ? MakeState(rest, &states, &state_map) |
| : where->second; |
| if (states[0].back().from == range.from()) { |
| DCHECK_EQ(1, states[0].back().target_state); |
| states[0].back().target_state = target_state; |
| DCHECK_LT(range.to(), 0xFF); |
| const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1}; |
| states[0].push_back(new_range); |
| } else { |
| DCHECK_LT(range.to(), 0xFF); |
| const StateRange new_range_initializer[] = { |
| {range.from(), target_state}, |
| {static_cast<uint8_t>(range.to() + 1), 1}}; |
| states[0].insert( |
| states[0].end(), new_range_initializer, |
| new_range_initializer + std::size(new_range_initializer)); |
| } |
| } |
| return states; |
| } |
| |
| // Output the generated states as a C++ table. Two tricks are used to compact |
| // the table: each state in the table starts with a shift value which indicates |
| // how many bits we can discard from the right-hand-side of the byte before |
| // doing the table lookup. Secondly, only the state-transitions for bytes |
| // with the top-bit set are included in the table; bytes without the top-bit set |
| // are just ASCII and are handled directly by the code. |
| void PrintStates(const std::vector<State>& states, FILE* stream) { |
| // First calculate the start-offset of each state. This allows the state |
| // machine to jump directly to the correct offset, avoiding an extra |
| // indirection. State 0 starts at offset 0. |
| std::vector<uint8_t> state_offset(1, 0); |
| std::vector<uint8_t> shifts; |
| uint8_t pos = 0; |
| |
| for (const auto& state_it : states) { |
| // We want to set |shift| to the (0-based) index of the least-significant |
| // set bit in any of the ranges for this state, since this tells us how many |
| // bits we can discard and still determine what range a byte lies in. Sadly |
| // it appears that ffs() is not portable, so we do it clumsily. |
| uint8_t shift = 7; |
| for (auto range_it = state_it.begin(); range_it != state_it.end(); |
| ++range_it) { |
| while (shift > 0 && range_it->from % (1 << shift) != 0) { |
| --shift; |
| } |
| } |
| shifts.push_back(shift); |
| pos += 1 + (1 << (7 - shift)); |
| state_offset.push_back(pos); |
| } |
| |
| DCHECK_EQ(129, state_offset[1]); |
| |
| fputs(kProlog, stream); |
| TablePrinter table_printer(stream); |
| |
| for (uint8_t state_index = 0; state_index < states.size(); ++state_index) { |
| const uint8_t shift = shifts[state_index]; |
| uint8_t next_range = 0; |
| uint8_t target_state = 1; |
| fprintf(stream, |
| " // State %d, offset 0x%02x\n", |
| static_cast<int>(state_index), |
| static_cast<int>(state_offset[state_index])); |
| table_printer.PrintValue(shift); |
| for (int i = 0; i < 0x100; i += (1 << shift)) { |
| if (next_range < states[state_index].size() && |
| states[state_index][next_range].from == i) { |
| target_state = states[state_index][next_range].target_state; |
| ++next_range; |
| } |
| if (i >= 0x80) { |
| table_printer.PrintValue(state_offset[target_state]); |
| } |
| } |
| table_printer.NewLine(); |
| } |
| |
| fputs(kEpilog, stream); |
| } |
| |
| } // namespace |
| |
| int main(int argc, char* argv[]) { |
| base::CommandLine::Init(argc, argv); |
| logging::LoggingSettings settings; |
| settings.logging_dest = |
| logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR; |
| logging::InitLogging(settings); |
| if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) { |
| fwrite(kHelpText, 1, std::size(kHelpText), stdout); |
| exit(EXIT_SUCCESS); |
| } |
| base::FilePath filename = |
| base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); |
| |
| FILE* output = stdout; |
| if (!filename.empty()) { |
| output = base::OpenFile(filename, "wb"); |
| if (!output) |
| PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() |
| << "' for writing"; |
| } |
| |
| // Step 1: Enumerate the characters |
| PairVector pairs = InitializeCharacters(); |
| // Step 2: Convert to sets. |
| MoveAllCharsToSets(&pairs); |
| if (VLOG_IS_ON(1)) { |
| LogStringSets(pairs); |
| } |
| // Step 3: Generate states. |
| std::vector<State> states = GenerateStates(pairs); |
| // Step 4/5: Print output |
| PrintStates(states, output); |
| |
| if (!filename.empty()) { |
| if (!base::CloseFile(output)) |
| PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() |
| << "'"; |
| } |
| |
| return EXIT_SUCCESS; |
| } |