|  | // Copyright 2014 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | // Create a state machine for validating UTF-8. The algorithm in brief: | 
|  | // 1. Convert the complete unicode range of code points, except for the | 
|  | //    surrogate code points, to an ordered array of sequences of bytes in | 
|  | //    UTF-8. | 
|  | // 2. Convert individual bytes to ranges, starting from the right of each byte | 
|  | //    sequence. For each range, ensure the bytes on the left and the ranges | 
|  | //    on the right are the identical. | 
|  | // 3. Convert the resulting list of ranges into a state machine, collapsing | 
|  | //    identical states. | 
|  | // 4. Convert the state machine to an array of bytes. | 
|  | // 5. Output as a C++ file. | 
|  | // | 
|  | // To use: | 
|  | //  $ ninja -C out/Release build_utf8_validator_tables | 
|  | //  $ out/Release/build_utf8_validator_tables | 
|  | //                                   --output=base/i18n/utf8_validator_tables.cc | 
|  | //  $ git add base/i18n/utf8_validator_tables.cc | 
|  | // | 
|  | // Because the table is not expected to ever change, it is checked into the | 
|  | // repository rather than being regenerated at build time. | 
|  | // | 
|  | // This code uses type uint8 throughout to represent bytes, to avoid | 
|  | // signed/unsigned char confusion. | 
|  |  | 
|  | #include <stdio.h> | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  |  | 
|  | #include <algorithm> | 
|  | #include <map> | 
|  | #include <string> | 
|  | #include <vector> | 
|  |  | 
|  | #include "base/basictypes.h" | 
|  | #include "base/command_line.h" | 
|  | #include "base/files/file_path.h" | 
|  | #include "base/files/file_util.h" | 
|  | #include "base/logging.h" | 
|  | #include "base/numerics/safe_conversions.h" | 
|  | #include "base/strings/stringprintf.h" | 
|  | #include "third_party/icu/source/common/unicode/utf8.h" | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | const char kHelpText[] = | 
|  | "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; | 
|  |  | 
|  | const char kProlog[] = | 
|  | "// Copyright 2013 The Chromium Authors. All rights reserved.\n" | 
|  | "// Use of this source code is governed by a BSD-style license that can " | 
|  | "be\n" | 
|  | "// found in the LICENSE file.\n" | 
|  | "\n" | 
|  | "// This file is auto-generated by build_utf8_validator_tables.\n" | 
|  | "// DO NOT EDIT.\n" | 
|  | "\n" | 
|  | "#include \"base/i18n/utf8_validator_tables.h\"\n" | 
|  | "\n" | 
|  | "namespace base {\n" | 
|  | "namespace internal {\n" | 
|  | "\n" | 
|  | "const uint8 kUtf8ValidatorTables[] = {\n"; | 
|  |  | 
|  | const char kEpilog[] = | 
|  | "};\n" | 
|  | "\n" | 
|  | "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" | 
|  | "\n" | 
|  | "}  // namespace internal\n" | 
|  | "}  // namespace base\n"; | 
|  |  | 
|  | // Ranges are inclusive at both ends--they represent [from, to] | 
|  | class Range { | 
|  | public: | 
|  | // Ranges always start with just one byte. | 
|  | explicit Range(uint8 value) : from_(value), to_(value) {} | 
|  |  | 
|  | // Range objects are copyable and assignable to be used in STL | 
|  | // containers. Since they only contain non-pointer POD types, the default copy | 
|  | // constructor, assignment operator and destructor will work. | 
|  |  | 
|  | // Add a byte to the range. We intentionally only support adding a byte at the | 
|  | // end, since that is the only operation the code needs. | 
|  | void AddByte(uint8 to) { | 
|  | CHECK(to == to_ + 1); | 
|  | to_ = to; | 
|  | } | 
|  |  | 
|  | uint8 from() const { return from_; } | 
|  | uint8 to() const { return to_; } | 
|  |  | 
|  | bool operator<(const Range& rhs) const { | 
|  | return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); | 
|  | } | 
|  |  | 
|  | bool operator==(const Range& rhs) const { | 
|  | return from() == rhs.from() && to() == rhs.to(); | 
|  | } | 
|  |  | 
|  | private: | 
|  | uint8 from_; | 
|  | uint8 to_; | 
|  | }; | 
|  |  | 
|  | // A vector of Ranges is like a simple regular expression--it corresponds to | 
|  | // a set of strings of the same length that have bytes in each position in | 
|  | // the appropriate range. | 
|  | typedef std::vector<Range> StringSet; | 
|  |  | 
|  | // A UTF-8 "character" is represented by a sequence of bytes. | 
|  | typedef std::vector<uint8> Character; | 
|  |  | 
|  | // In the second stage of the algorithm, we want to convert a large list of | 
|  | // Characters into a small list of StringSets. | 
|  | struct Pair { | 
|  | Character character; | 
|  | StringSet set; | 
|  | }; | 
|  |  | 
|  | typedef std::vector<Pair> PairVector; | 
|  |  | 
|  | // A class to print a table of numbers in the same style as clang-format. | 
|  | class TablePrinter { | 
|  | public: | 
|  | explicit TablePrinter(FILE* stream) | 
|  | : stream_(stream), values_on_this_line_(0), current_offset_(0) {} | 
|  |  | 
|  | void PrintValue(uint8 value) { | 
|  | if (values_on_this_line_ == 0) { | 
|  | fputs("   ", stream_); | 
|  | } else if (values_on_this_line_ == kMaxValuesPerLine) { | 
|  | fprintf(stream_, "  // 0x%02x\n   ", current_offset_); | 
|  | values_on_this_line_ = 0; | 
|  | } | 
|  | fprintf(stream_, " 0x%02x,", static_cast<int>(value)); | 
|  | ++values_on_this_line_; | 
|  | ++current_offset_; | 
|  | } | 
|  |  | 
|  | void NewLine() { | 
|  | while (values_on_this_line_ < kMaxValuesPerLine) { | 
|  | fputs("      ", stream_); | 
|  | ++values_on_this_line_; | 
|  | } | 
|  | fprintf(stream_, "  // 0x%02x\n", current_offset_); | 
|  | values_on_this_line_ = 0; | 
|  | } | 
|  |  | 
|  | private: | 
|  | // stdio stream. Not owned. | 
|  | FILE* stream_; | 
|  |  | 
|  | // Number of values so far printed on this line. | 
|  | int values_on_this_line_; | 
|  |  | 
|  | // Total values printed so far. | 
|  | int current_offset_; | 
|  |  | 
|  | static const int kMaxValuesPerLine = 8; | 
|  |  | 
|  | DISALLOW_COPY_AND_ASSIGN(TablePrinter); | 
|  | }; | 
|  |  | 
|  | // Start by filling a PairVector with characters. The resulting vector goes from | 
|  | // "\x00" to "\xf4\x8f\xbf\xbf". | 
|  | PairVector InitializeCharacters() { | 
|  | PairVector vector; | 
|  | for (int i = 0; i <= 0x10FFFF; ++i) { | 
|  | if (i >= 0xD800 && i < 0xE000) { | 
|  | // Surrogate codepoints are not permitted. Non-character code points are | 
|  | // explicitly permitted. | 
|  | continue; | 
|  | } | 
|  | uint8 bytes[4]; | 
|  | unsigned int offset = 0; | 
|  | UBool is_error = false; | 
|  | U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); | 
|  | DCHECK(!is_error); | 
|  | DCHECK_GT(offset, 0u); | 
|  | DCHECK_LE(offset, arraysize(bytes)); | 
|  | Pair pair = {Character(bytes, bytes + offset), StringSet()}; | 
|  | vector.push_back(pair); | 
|  | } | 
|  | return vector; | 
|  | } | 
|  |  | 
|  | // Construct a new Pair from |character| and the concatenation of |new_range| | 
|  | // and |existing_set|, and append it to |pairs|. | 
|  | void ConstructPairAndAppend(const Character& character, | 
|  | const Range& new_range, | 
|  | const StringSet& existing_set, | 
|  | PairVector* pairs) { | 
|  | Pair new_pair = {character, StringSet(1, new_range)}; | 
|  | new_pair.set.insert( | 
|  | new_pair.set.end(), existing_set.begin(), existing_set.end()); | 
|  | pairs->push_back(new_pair); | 
|  | } | 
|  |  | 
|  | // Each pass over the PairVector strips one byte off the right-hand-side of the | 
|  | // characters and adds a range to the set on the right. For example, the first | 
|  | // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", | 
|  | // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", | 
|  | // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", | 
|  | // [\xa0-\xbf][\x80-\xbf]). | 
|  | void MoveRightMostCharToSet(PairVector* pairs) { | 
|  | PairVector new_pairs; | 
|  | PairVector::const_iterator it = pairs->begin(); | 
|  | while (it != pairs->end() && it->character.empty()) { | 
|  | new_pairs.push_back(*it); | 
|  | ++it; | 
|  | } | 
|  | CHECK(it != pairs->end()); | 
|  | Character unconverted_bytes(it->character.begin(), it->character.end() - 1); | 
|  | Range new_range(it->character.back()); | 
|  | StringSet converted = it->set; | 
|  | ++it; | 
|  | while (it != pairs->end()) { | 
|  | const Pair& current_pair = *it++; | 
|  | if (current_pair.character.size() == unconverted_bytes.size() + 1 && | 
|  | std::equal(unconverted_bytes.begin(), | 
|  | unconverted_bytes.end(), | 
|  | current_pair.character.begin()) && | 
|  | converted == current_pair.set) { | 
|  | // The particular set of UTF-8 codepoints we are validating guarantees | 
|  | // that each byte range will be contiguous. This would not necessarily be | 
|  | // true for an arbitrary set of UTF-8 codepoints. | 
|  | DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); | 
|  | new_range.AddByte(current_pair.character.back()); | 
|  | continue; | 
|  | } | 
|  | ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); | 
|  | unconverted_bytes = Character(current_pair.character.begin(), | 
|  | current_pair.character.end() - 1); | 
|  | new_range = Range(current_pair.character.back()); | 
|  | converted = current_pair.set; | 
|  | } | 
|  | ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); | 
|  | new_pairs.swap(*pairs); | 
|  | } | 
|  |  | 
|  | void MoveAllCharsToSets(PairVector* pairs) { | 
|  | // Since each pass of the function moves one character, and UTF-8 sequences | 
|  | // are at most 4 characters long, this simply runs the algorithm four times. | 
|  | for (int i = 0; i < 4; ++i) { | 
|  | MoveRightMostCharToSet(pairs); | 
|  | } | 
|  | #if DCHECK_IS_ON() | 
|  | for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); | 
|  | ++it) { | 
|  | DCHECK(it->character.empty()); | 
|  | } | 
|  | #endif | 
|  | } | 
|  |  | 
|  | // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], | 
|  | // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the | 
|  | // algorithm is working. Use the command-line option | 
|  | // --vmodule=build_utf8_validator_tables=1 to see this output. | 
|  | void LogStringSets(const PairVector& pairs) { | 
|  | for (PairVector::const_iterator pair_it = pairs.begin(); | 
|  | pair_it != pairs.end(); | 
|  | ++pair_it) { | 
|  | std::string set_as_string; | 
|  | for (StringSet::const_iterator set_it = pair_it->set.begin(); | 
|  | set_it != pair_it->set.end(); | 
|  | ++set_it) { | 
|  | set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", | 
|  | static_cast<int>(set_it->from()), | 
|  | static_cast<int>(set_it->to())); | 
|  | } | 
|  | VLOG(1) << set_as_string; | 
|  | } | 
|  | } | 
|  |  | 
|  | // A single state in the state machine is represented by a sorted vector of | 
|  | // start bytes and target states. All input bytes in the range between the start | 
|  | // byte and the next entry in the vector (or 0xFF) result in a transition to the | 
|  | // target state. | 
|  | struct StateRange { | 
|  | uint8 from; | 
|  | uint8 target_state; | 
|  | }; | 
|  |  | 
|  | typedef std::vector<StateRange> State; | 
|  |  | 
|  | // Generates a state where all bytes go to state 1 (invalid). This is also used | 
|  | // as an initialiser for other states (since bytes from outside the desired | 
|  | // range are invalid). | 
|  | State GenerateInvalidState() { | 
|  | const StateRange range = {0, 1}; | 
|  | return State(1, range); | 
|  | } | 
|  |  | 
|  | // A map from a state (ie. a set of strings which will match from this state) to | 
|  | // a number (which is an index into the array of states). | 
|  | typedef std::map<StringSet, uint8> StateMap; | 
|  |  | 
|  | // Create a new state corresponding to |set|, add it |states| and |state_map| | 
|  | // and return the index it was given in |states|. | 
|  | uint8 MakeState(const StringSet& set, | 
|  | std::vector<State>* states, | 
|  | StateMap* state_map) { | 
|  | DCHECK(!set.empty()); | 
|  | const Range& range = set.front(); | 
|  | const StringSet rest(set.begin() + 1, set.end()); | 
|  | const StateMap::const_iterator where = state_map->find(rest); | 
|  | const uint8 target_state = where == state_map->end() | 
|  | ? MakeState(rest, states, state_map) | 
|  | : where->second; | 
|  | DCHECK_LT(0, range.from()); | 
|  | DCHECK_LT(range.to(), 0xFF); | 
|  | const StateRange new_state_initializer[] = { | 
|  | {0, 1}, {range.from(), target_state}, | 
|  | {static_cast<uint8>(range.to() + 1), 1}}; | 
|  | states->push_back( | 
|  | State(new_state_initializer, | 
|  | new_state_initializer + arraysize(new_state_initializer))); | 
|  | const uint8 new_state_number = | 
|  | base::checked_cast<uint8>(states->size() - 1); | 
|  | CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); | 
|  | return new_state_number; | 
|  | } | 
|  |  | 
|  | std::vector<State> GenerateStates(const PairVector& pairs) { | 
|  | // States 0 and 1 are the initial/valid state and invalid state, respectively. | 
|  | std::vector<State> states(2, GenerateInvalidState()); | 
|  | StateMap state_map; | 
|  | state_map.insert(std::make_pair(StringSet(), 0)); | 
|  | for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { | 
|  | DCHECK(it->character.empty()); | 
|  | DCHECK(!it->set.empty()); | 
|  | const Range& range = it->set.front(); | 
|  | const StringSet rest(it->set.begin() + 1, it->set.end()); | 
|  | const StateMap::const_iterator where = state_map.find(rest); | 
|  | const uint8 target_state = where == state_map.end() | 
|  | ? MakeState(rest, &states, &state_map) | 
|  | : where->second; | 
|  | if (states[0].back().from == range.from()) { | 
|  | DCHECK_EQ(1, states[0].back().target_state); | 
|  | states[0].back().target_state = target_state; | 
|  | DCHECK_LT(range.to(), 0xFF); | 
|  | const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; | 
|  | states[0].push_back(new_range); | 
|  | } else { | 
|  | DCHECK_LT(range.to(), 0xFF); | 
|  | const StateRange new_range_initializer[] = {{range.from(), target_state}, | 
|  | {static_cast<uint8>(range.to() + 1), 1}}; | 
|  | states[0] | 
|  | .insert(states[0].end(), | 
|  | new_range_initializer, | 
|  | new_range_initializer + arraysize(new_range_initializer)); | 
|  | } | 
|  | } | 
|  | return states; | 
|  | } | 
|  |  | 
|  | // Output the generated states as a C++ table. Two tricks are used to compact | 
|  | // the table: each state in the table starts with a shift value which indicates | 
|  | // how many bits we can discard from the right-hand-side of the byte before | 
|  | // doing the table lookup. Secondly, only the state-transitions for bytes | 
|  | // with the top-bit set are included in the table; bytes without the top-bit set | 
|  | // are just ASCII and are handled directly by the code. | 
|  | void PrintStates(const std::vector<State>& states, FILE* stream) { | 
|  | // First calculate the start-offset of each state. This allows the state | 
|  | // machine to jump directly to the correct offset, avoiding an extra | 
|  | // indirection. State 0 starts at offset 0. | 
|  | std::vector<uint8> state_offset(1, 0); | 
|  | std::vector<uint8> shifts; | 
|  | uint8 pos = 0; | 
|  |  | 
|  | for (std::vector<State>::const_iterator state_it = states.begin(); | 
|  | state_it != states.end(); | 
|  | ++state_it) { | 
|  | // We want to set |shift| to the (0-based) index of the least-significant | 
|  | // set bit in any of the ranges for this state, since this tells us how many | 
|  | // bits we can discard and still determine what range a byte lies in. Sadly | 
|  | // it appears that ffs() is not portable, so we do it clumsily. | 
|  | uint8 shift = 7; | 
|  | for (State::const_iterator range_it = state_it->begin(); | 
|  | range_it != state_it->end(); | 
|  | ++range_it) { | 
|  | while (shift > 0 && range_it->from % (1 << shift) != 0) { | 
|  | --shift; | 
|  | } | 
|  | } | 
|  | shifts.push_back(shift); | 
|  | pos += 1 + (1 << (7 - shift)); | 
|  | state_offset.push_back(pos); | 
|  | } | 
|  |  | 
|  | DCHECK_EQ(129, state_offset[1]); | 
|  |  | 
|  | fputs(kProlog, stream); | 
|  | TablePrinter table_printer(stream); | 
|  |  | 
|  | for (uint8 state_index = 0; state_index < states.size(); ++state_index) { | 
|  | const uint8 shift = shifts[state_index]; | 
|  | uint8 next_range = 0; | 
|  | uint8 target_state = 1; | 
|  | fprintf(stream, | 
|  | "    // State %d, offset 0x%02x\n", | 
|  | static_cast<int>(state_index), | 
|  | static_cast<int>(state_offset[state_index])); | 
|  | table_printer.PrintValue(shift); | 
|  | for (int i = 0; i < 0x100; i += (1 << shift)) { | 
|  | if (next_range < states[state_index].size() && | 
|  | states[state_index][next_range].from == i) { | 
|  | target_state = states[state_index][next_range].target_state; | 
|  | ++next_range; | 
|  | } | 
|  | if (i >= 0x80) { | 
|  | table_printer.PrintValue(state_offset[target_state]); | 
|  | } | 
|  | } | 
|  | table_printer.NewLine(); | 
|  | } | 
|  |  | 
|  | fputs(kEpilog, stream); | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | int main(int argc, char* argv[]) { | 
|  | base::CommandLine::Init(argc, argv); | 
|  | logging::LoggingSettings settings; | 
|  | settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; | 
|  | logging::InitLogging(settings); | 
|  | if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) { | 
|  | fwrite(kHelpText, 1, arraysize(kHelpText), stdout); | 
|  | exit(EXIT_SUCCESS); | 
|  | } | 
|  | base::FilePath filename = | 
|  | base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); | 
|  |  | 
|  | FILE* output = stdout; | 
|  | if (!filename.empty()) { | 
|  | output = base::OpenFile(filename, "wb"); | 
|  | if (!output) | 
|  | PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() | 
|  | << "' for writing"; | 
|  | } | 
|  |  | 
|  | // Step 1: Enumerate the characters | 
|  | PairVector pairs = InitializeCharacters(); | 
|  | // Step 2: Convert to sets. | 
|  | MoveAllCharsToSets(&pairs); | 
|  | if (VLOG_IS_ON(1)) { | 
|  | LogStringSets(pairs); | 
|  | } | 
|  | // Step 3: Generate states. | 
|  | std::vector<State> states = GenerateStates(pairs); | 
|  | // Step 4/5: Print output | 
|  | PrintStates(states, output); | 
|  |  | 
|  | if (!filename.empty()) { | 
|  | if (!base::CloseFile(output)) | 
|  | PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() | 
|  | << "'"; | 
|  | } | 
|  |  | 
|  | return EXIT_SUCCESS; | 
|  | } |