// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

#include "upb/io/tokenizer.h"

#include <gtest/gtest.h>
#include "absl/strings/escaping.h"
#include "absl/strings/str_format.h"
#include "upb/io/chunked_input_stream.h"
#include "upb/io/string.h"
#include "upb/lex/unicode.h"
#include "upb/mem/arena.hpp"

// Must be last.
#include "upb/port/def.inc"

namespace google {
namespace protobuf {
namespace io {
namespace {

#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof(a[0]))
#endif

static bool StringEquals(const char* a, const char* b) {
  return strcmp(a, b) == 0;
}

// ===================================================================
// Data-Driven Test Infrastructure

// TODO:  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the arraysize() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
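//
// A hypothetical TEST_2D usage, for symmetry with the example above (the
// names kRows and kCols are illustrative only, not defined in this file):
//
// int kRows[] = {1, 2};
// int kCols[] = {10, 20};
// TEST_2D(MyFixture, MyTest2, kRows, kCols) {
//   EXPECT_LT(kRows_case, kCols_case);
// }
//
// This runs the body once per (row, col) pair, four times in all, and on
// failure prints both case values.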

#define TEST_1D(FIXTURE, NAME, CASES)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                  \
   protected:                                                     \
    template <typename CaseType>                                  \
    void DoSingleCase(const CaseType& CASES##_case);              \
  };                                                              \
                                                                  \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                           \
    for (size_t i = 0; i < arraysize(CASES); i++) {               \
      SCOPED_TRACE(testing::Message()                             \
                   << #CASES " case #" << i << ": " << CASES[i]); \
      DoSingleCase(CASES[i]);                                     \
    }                                                             \
  }                                                               \
                                                                  \
  template <typename CaseType>                                    \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (size_t i = 0; i < arraysize(CASES1); i++) {                        \
      for (size_t j = 0; j < arraysize(CASES2); j++) {                      \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// In C, a size of zero from ZCIS_Next() means EOF so we can't play the same
// trick here that happens in the C++ version. Use ChunkedInputStream instead.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
                                         size_t block_size, upb_Arena* arena) {
  return upb_ChunkedInputStream_New(data, size, block_size, arena);
}
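
// For illustration, a typical call site looks like this (a sketch; kText is
// a stand-in name, not a constant defined in this file):
//
//   upb::Arena arena;
//   const char* kText = "foo bar";
//   upb_ZeroCopyInputStream* input =
//       TestInputStream(kText, strlen(kText), /*block_size=*/3, arena.ptr());
//
// With block_size == 3 the stream hands the data back in chunks of at most
// three bytes, so tokens can straddle chunk boundaries.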

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64_t ParseInteger(const std::string& text) {
    uint64_t result;
    EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result))
        << "'" << text << "'";
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  std::string input;
  upb_TokenType type;
};

inline std::ostream& operator<<(std::ostream& out,
                                const SimpleTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
    // Test identifiers.
    {"hello", kUpb_TokenType_Identifier},

    // Test integers.
    {"123", kUpb_TokenType_Integer},
    {"0xab6", kUpb_TokenType_Integer},
    {"0XAB6", kUpb_TokenType_Integer},
    {"0X1234567", kUpb_TokenType_Integer},
    {"0x89abcdef", kUpb_TokenType_Integer},
    {"0x89ABCDEF", kUpb_TokenType_Integer},
    {"01234567", kUpb_TokenType_Integer},

    // Test floats.
    {"123.45", kUpb_TokenType_Float},
    {"1.", kUpb_TokenType_Float},
    {"1e3", kUpb_TokenType_Float},
    {"1E3", kUpb_TokenType_Float},
    {"1e-3", kUpb_TokenType_Float},
    {"1e+3", kUpb_TokenType_Float},
    {"1.e3", kUpb_TokenType_Float},
    {"1.2e3", kUpb_TokenType_Float},
    {".1", kUpb_TokenType_Float},
    {".1e3", kUpb_TokenType_Float},
    {".1e-3", kUpb_TokenType_Float},
    {".1e+3", kUpb_TokenType_Float},

    // Test strings.
    {"'hello'", kUpb_TokenType_String},
    {"\"foo\"", kUpb_TokenType_String},
    {"'a\"b'", kUpb_TokenType_String},
    {"\"a'b\"", kUpb_TokenType_String},
    {"'a\\'b'", kUpb_TokenType_String},
    {"\"a\\\"b\"", kUpb_TokenType_String},
    {"'\\xf'", kUpb_TokenType_String},
    {"'\\0'", kUpb_TokenType_String},

    // Test symbols.
    {"+", kUpb_TokenType_Symbol},
    {".", kUpb_TokenType_Symbol},
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  upb::Arena arena;

  // Set up the tokenizer.
  auto input = TestInputStream(kSimpleTokenCases_case.input.data(),
                               kSimpleTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Parse the token.
  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  // Check that it has the right type.
  EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
  // Check that it contains the complete input text.
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                           kSimpleTokenCases_case.input.data()));

  // Check that it is located at the beginning of the input.
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size());
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  upb::Arena arena;
  const char* text = "1f 2.5f 6e3f 7F";
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_AllowFAfterFloat;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f"));

  EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F"));

  upb_Status status;
  upb_Status_Clear(&status);

  // There should be no more input and no errors.
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

SimpleTokenCase kWhitespaceTokenCases[] = {
    {" ", kUpb_TokenType_Whitespace},
    {"    ", kUpb_TokenType_Whitespace},
    {"\t", kUpb_TokenType_Whitespace},
    {"\v", kUpb_TokenType_Whitespace},
    {"\t ", kUpb_TokenType_Whitespace},
    {"\v\t", kUpb_TokenType_Whitespace},
    {"   \t\r", kUpb_TokenType_Whitespace},
    // Newlines:
    {"\n", kUpb_TokenType_Newline},
};

TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
  upb::Arena arena;
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
  {
    auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
                                 kWhitespaceTokenCases_case.input.size(),
                                 kBlockSizes_case, arena.ptr());
    const int options = kUpb_TokenizerOption_ReportNewlines;
    auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));

    EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
                             kWhitespaceTokenCases_case.input.data()));
    EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
  }
}

#endif

// -------------------------------------------------------------------

struct TokenFields {
  upb_TokenType type;
  std::string text;
  size_t line;
  size_t column;
  size_t end_column;
};

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  std::string input;
  std::vector<TokenFields> output;
};

inline std::ostream& operator<<(std::ostream& out,
                                const MultiTokenCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
    // Test empty input.
    {"",
     {
         {kUpb_TokenType_End, "", 0, 0, 0},
     }},

    // Test all token types at the same time.
    {"foo 1 1.2 + 'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Float, "1.2", 0, 6, 9},
         {kUpb_TokenType_Symbol, "+", 0, 10, 11},
         {kUpb_TokenType_String, "'bar'", 0, 12, 17},
         {kUpb_TokenType_End, "", 0, 17, 17},
     }},

    // Test that consecutive symbols are parsed as separate tokens.
    {"!@+%",
     {
         {kUpb_TokenType_Symbol, "!", 0, 0, 1},
         {kUpb_TokenType_Symbol, "@", 0, 1, 2},
         {kUpb_TokenType_Symbol, "+", 0, 2, 3},
         {kUpb_TokenType_Symbol, "%", 0, 3, 4},
         {kUpb_TokenType_End, "", 0, 4, 4},
     }},

    // Test that newlines affect line numbers correctly.
    {"foo bar\nrab oof",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 4, 7},
         {kUpb_TokenType_Identifier, "rab", 1, 0, 3},
         {kUpb_TokenType_Identifier, "oof", 1, 4, 7},
         {kUpb_TokenType_End, "", 1, 7, 7},
     }},

    // Test that tabs affect column numbers correctly.
    {"foo\tbar  \tbaz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 8, 11},
         {kUpb_TokenType_Identifier, "baz", 0, 16, 19},
         {kUpb_TokenType_End, "", 0, 19, 19},
     }},

    // Test that tabs in string literals affect column numbers correctly.
    {"\"foo\tbar\" baz",
     {
         {kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
         {kUpb_TokenType_Identifier, "baz", 0, 13, 16},
         {kUpb_TokenType_End, "", 0, 16, 16},
     }},

    // Test that line comments are ignored.
    {"foo // This is a comment\n"
     "bar // This is another comment",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 30, 30},
     }},

    // Test that block comments are ignored.
    {"foo /* This is a block comment */ bar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 0, 34, 37},
         {kUpb_TokenType_End, "", 0, 37, 37},
     }},

    // Test that sh-style comments are not ignored by default.
    {"foo # bar\n"
     "baz",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Symbol, "#", 0, 4, 5},
         {kUpb_TokenType_Identifier, "bar", 0, 6, 9},
         {kUpb_TokenType_Identifier, "baz", 1, 0, 3},
         {kUpb_TokenType_End, "", 1, 3, 3},
     }},

    // Test all whitespace chars.
    {"foo\n\t\r\v\fbar",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Identifier, "bar", 1, 11, 14},
         {kUpb_TokenType_End, "", 1, 14, 14},
     }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiTokenCases_case.input.data(),
                               kMultiTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << absl::CEscape(token_fields.text));

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size());
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

MultiTokenCase kMultiWhitespaceTokenCases[] = {
    // Test all token types at the same time.
    {"foo 1 \t1.2  \n   +\v'bar'",
     {
         {kUpb_TokenType_Identifier, "foo", 0, 0, 3},
         {kUpb_TokenType_Whitespace, " ", 0, 3, 4},
         {kUpb_TokenType_Integer, "1", 0, 4, 5},
         {kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
         {kUpb_TokenType_Float, "1.2", 0, 8, 11},
         {kUpb_TokenType_Whitespace, "  ", 0, 11, 13},
         {kUpb_TokenType_Newline, "\n", 0, 13, 0},
         {kUpb_TokenType_Whitespace, "   ", 1, 0, 3},
         {kUpb_TokenType_Symbol, "+", 1, 3, 4},
         {kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
         {kUpb_TokenType_String, "'bar'", 1, 5, 10},
         {kUpb_TokenType_End, "", 1, 10, 10},
     }},
};

TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
        kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
                               kMultiWhitespaceTokenCases_case.input.size(),
                               kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_ReportNewlines;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
  EXPECT_EQ(upb_Tokenizer_Line(t), 0);
  EXPECT_EQ(upb_Tokenizer_Column(t), 0);
  EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
  EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));

  // Loop through all expected tokens.
  TokenFields token_fields;
  upb_Status status;
  upb_Status_Clear(&status);
  int i = 0;
  do {
    token_fields = kMultiWhitespaceTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message()
                 << "Token #" << i << ": " << token_fields.text);

    // Next() should only return false when it hits the end token.
    if (token_fields.type == kUpb_TokenType_End) {
      EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
      EXPECT_TRUE(upb_Status_IsOk(&status));
    } else {
      EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
    EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
    EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
    EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
    EXPECT_TRUE(
        StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
  } while (token_fields.type != kUpb_TokenType_End);
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/",      "/", "qux", "corge", "/",
                                 "*",   "grault", "*", "/",   "garply"};

  // Set up the tokenizer.
  upb::Arena arena;
  auto input =
      TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
  const int options = kUpb_TokenizerOption_CommentStyleShell;
  auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());

  // Advance through tokens and check that they are parsed as expected.
  for (size_t i = 0; i < arraysize(kTokens); i++) {
    EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
    EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i]));
  }

  // There should be no more input and no errors.
  upb_Status status;
  upb_Status_Clear(&status);
  EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
  EXPECT_TRUE(upb_Status_IsOk(&status));
}

#endif

// -------------------------------------------------------------------

#if 0  // TODO: Extended comments are currently unimplemented.

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  std::string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline std::ostream& operator<<(std::ostream& out,
                                const DocCommentCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
    {"prev next",

     "",
     {},
     ""},

    {"prev /* ignored */ next",

     "",
     {},
     ""},

    {"prev // trailing comment\n"
     "next",

     " trailing comment\n",
     {},
     ""},

    {"prev\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     "",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev\n"
     "// trailing comment\n"
     "// line 2\n"
     "\n"
     "next",

     " trailing comment\n"
     " line 2\n",
     {},
     ""},

    {"prev // trailing comment\n"
     "// leading comment\n"
     "// line 2\n"
     "next",

     " trailing comment\n",
     {},
     " leading comment\n"
     " line 2\n"},

    {"prev /* trailing block comment */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment ",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "/* trailing block comment\n"
     " * line 2\n"
     " * line 3\n"
     " */\n"
     "/* leading block comment\n"
     " * line 2\n"
     " * line 3 */"
     "next",

     " trailing block comment\n"
     " line 2\n"
     " line 3\n",
     {},
     " leading block comment\n"
     " line 2\n"
     " line 3 "},

    {"prev\n"
     "// trailing comment\n"
     "\n"
     "// detached comment\n"
     "// line 2\n"
     "\n"
     "// second detached comment\n"
     "/* third detached comment\n"
     " * line 2 */\n"
     "// leading comment\n"
     "next",

     " trailing comment\n",
     {" detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "},
     " leading comment\n"},

    {"prev /**/\n"
     "\n"
     "// detached comment\n"
     "\n"
     "// leading comment\n"
     "next",

     "",
     {" detached comment\n"},
     " leading comment\n"},

    {"prev /**/\n"
     "// leading comment\n"
     "next",

     "",
     {},
     " leading comment\n"},
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(), kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  std::string prev_trailing_comments;
  std::vector<std::string> detached_comments;
  std::string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(nullptr, nullptr, nullptr);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    EXPECT_LT(i, arraysize(kDocCommentCases));
    EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(nullptr,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

#endif  // 0

// -------------------------------------------------------------------

// Test parse helpers.
// TODO: Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64_t i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));

  // Test overflows.
  EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
  EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
  EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
  EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
  EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
  EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
  EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));

  // Test near the limits of signed parsing (values in INT64_MAX +/- 1600)
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    // We make sure to perform an unsigned addition so that we avoid signed
    // overflow, which would be undefined behavior.
    uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
    char decimal[32];
    snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
          << decimal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char octal[32];
    snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
          << octal << "=>" << parsed;
      EXPECT_EQ(parsed, i);
    }
    char hex[32];
    snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
    if (offset > 0) {
      uint64_t parsed = -1;
      EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
          << hex << "=>" << parsed;
    } else {
      uint64_t parsed = -1;
      EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
      EXPECT_EQ(parsed, i);
    }
  }

  // Test near the limits of unsigned parsing (values in UINT64_MAX +/- 1600)
  // By definition, values greater than UINT64_MAX cannot be held in a uint64_t
  // variable, so printing them is a little tricky; fortunately all but the
  // last four digits are known, so we can hard-code them in the printf string,
  // and we only need to format the last 4.
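  // For example, UINT64_MAX is 18446744073709551615, so for offset == -1 the
  // decimal case below prints "1844674407370955" followed by "%04llu" of
  // 1615 + (-1), producing "18446744073709551614".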
  for (int64_t offset = -1600; offset <= 1600; ++offset) {
    {
      uint64_t i = 18446744073709551615u + offset;
      char decimal[32];
      snprintf(decimal, 32, "1844674407370955%04llu",
               static_cast<unsigned long long>(1615 + offset));
      if (offset > 0) {
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal << "=>" << parsed;
      } else {
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
            << decimal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t i = 01777777777777777777777u + offset;
      if (offset > 0) {
        char octal[32];
        snprintf(octal, 32, "0200000000000000000%04llo",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
            << octal << "=>" << parsed;
      } else {
        char octal[32];
        snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
        EXPECT_EQ(parsed, i);
      }
    }
    {
      uint64_t ui = 0xffffffffffffffffu + offset;
      char hex[32];
      if (offset > 0) {
        snprintf(hex, 32, "0x1000000000000%04llx",
                 static_cast<unsigned long long>(offset - 1));
        uint64_t parsed = -1;
        EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
            << hex << "=>" << parsed;
      } else {
        snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
        uint64_t parsed = -1;
        EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
        EXPECT_EQ(parsed, ui);
      }
    }
  }
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1."));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3"));
  EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1"));
  EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25"));
  EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3"));
  EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5"));
  EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f"));
  EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));

#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      upb_Parse_Float("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  const std::string inputs[] = {
      "'hello'",
      "\"blah\\nblah2\"",
      "'\\1x\\1\\123\\739\\52\\334n\\3'",
      "'\\x20\\x4'",

      // Test invalid strings that may still be tokenized as strings.
      "\"\\a\\l\\v\\t",  // \l is invalid
      "'",
      "'\\",

      // Experiment with Unicode escapes.
      // Here are one-, two-, three- and four-byte Unicode characters.
      "'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
      "'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'",  // Same, encoded using UTF16.

      // Here's some broken UTF16: a head surrogate with no tail surrogate.
      // We just output this as if it were UTF8; it's not a defined code point,
      // but it has a defined encoding.
      "'\\ud852XX'",

      // Malformed escape: Demons may fly out of the nose.
      "'\\u0'",

      // Beyond the range of valid UTF-32 code units.
      "'\\U00110000\\U00200000\\UFFFFFFFF'",
  };

  const std::string outputs[] = {
      "hello",
      "blah\nblah2",
      "\1x\1\123\739\52\334n\3",
      "\x20\x4",

      "\a?\v\t",
      "",
      "\\",

      "$¢€𤭢XX",
      "$¢€𤭢XX",

      "\xed\xa1\x92XX",

      "u0",

      "\\U00110000\\U00200000\\Uffffffff",
  };

  upb::Arena arena;

  for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
    auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
    EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
  }

  // Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      upb_Parse_String("", arena.ptr()),
      "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());

  upb_String_Assign(&output, "stuff+", 6);
  auto sv = upb_Parse_String("'hello'", arena.ptr());
  EXPECT_TRUE(StringEquals(sv.data, "hello"));
  upb_String_Append(&output, sv.data, sv.size);
  EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  std::string input;
  const char* errors;
};

inline std::ostream& operator<<(std::ostream& out,
                                const ErrorCase& test_case) {
  return out << absl::CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
    // String errors.
    {"'\\l'", "0:2: Invalid escape sequence in string literal."},
    {"'\\X'", "0:2: Invalid escape sequence in string literal."},
    {"'\\x'", "0:3: Expected hex digits for escape sequence."},
    {"'foo", "0:4: Unexpected end of string."},
    {"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
    {"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
    {"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},

    // Integer errors.
    {"123foo", "0:3: Need space between number and identifier."},

    // Hex/octal errors.
    {"0x foo", "0:2: \"0x\" must be followed by hex digits."},
    {"0541823", "0:4: Numbers starting with leading zero must be in octal."},
    {"0x123z", "0:5: Need space between number and identifier."},
    {"0x123.4", "0:5: Hex and octal numbers must be integers."},
    {"0123.4", "0:4: Hex and octal numbers must be integers."},

    // Float errors.
    {"1e foo", "0:2: \"e\" must be followed by exponent."},
    {"1e- foo", "0:3: \"e\" must be followed by exponent."},
    {"1.2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"1e2.3",
     "0:3: Already saw decimal point or exponent; can't have another one."},
    {"a.1", "0:1: Need space between identifier and decimal point."},
    // allow_f_after_float not enabled, so this should be an error.
    {"1.0f", "0:3: Need space between number and identifier."},

    // Block comment errors.
    {"/*",
     "0:2: End-of-file inside block comment.\n0:0: Comment started here."},
    {"/*/*/ foo",
     "0:3: \"/*\" inside block comment.  Block comments cannot be nested."},

    // Control characters.  Multiple consecutive control characters should only
    // produce one error.
    {"\b foo", "0:0: Invalid control characters encountered in text."},
    {"\b\b foo", "0:0: Invalid control characters encountered in text."},

    // Check that control characters at end of input don't result in an
    // infinite loop.
    {"\b", "0:0: Invalid control characters encountered in text."},

    // Check recovery from '\0'.  We have to explicitly specify the length of
    // these strings because otherwise the string constructor will just call
    // strlen() which will see the first '\0' and think that is the end of the
    // string.
    {std::string("\0foo", 4),
     "0:0: Invalid control characters encountered in text."},
    {std::string("\0\0foo", 5),
     "0:0: Invalid control characters encountered in text."},

    // Check error from high order bits set
    {"\300", "0:0: Interpreting non ascii codepoint 192."},
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  upb::Arena arena;
  auto input = TestInputStream(kErrorCases_case.input.data(),
                               kErrorCases_case.input.size(), kBlockSizes_case,
                               arena.ptr());
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());

  upb_Status status;
  upb_Status_Clear(&status);

  while (upb_Tokenizer_Next(t, &status))
    ;  // just keep looping
  EXPECT_TRUE(
      StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors));
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  const std::string text = "foo bar";
  upb::Arena arena;
  auto input =
      TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());

  // Create a tokenizer, read one token, then destroy it.
  auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
  upb_Tokenizer_Next(t, nullptr);
  upb_Tokenizer_Fini(t);

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(input));
}

static const char* kParseBenchmark[] = {
    "\"partner-google-mobile-modes-print\"",
    "\"partner-google-mobile-modes-products\"",
    "\"partner-google-mobile-modes-realtime\"",
    "\"partner-google-mobile-modes-video\"",
    "\"partner-google-modes-news\"",
    "\"partner-google-modes-places\"",
    "\"partner-google-news\"",
    "\"partner-google-print\"",
    "\"partner-google-products\"",
    "\"partner-google-realtime\"",
    "\"partner-google-video\"",
    "\"true\"",
    "\"BigImagesHover__js_list\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Available versions of the big images hover javascript\"",
    "\"Version: {\n\"",
    "\"  script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
    "\"  version_number: 0\n\"",
    "\"}\"",
    "\"BigImagesHover__js_selection\"",
    "\"XFEExternJsVersionParameters\"",
    "\"Versioning info for the big images hover javascript.\"",
    "\"current_version: 0\"",
    "\"BigImagesHover__js_suppressed\"",
    "\"Indicates if the client-side javascript associated with big images.\"",
    "\"true\"",
    "\"BrowserAnyOf\"",
    "\"IsChrome5OrAbove\"",
    "\"IsFirefox3OrAbove\"",
    "IsIE8OrAboveBinary",
    "\"Abe \"Sausage King\" Froman\"",
    "\"Frank \"Meatball\" Febbraro\"",
};

TEST(Benchmark, ParseStringAppendAccumulate) {
  upb::Arena arena;
  size_t outsize = 0;
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    outsize += sv.size;
  }
  EXPECT_NE(0, outsize);
}

TEST(Benchmark, ParseStringAppend) {
  upb::Arena arena;
  upb_String output;
  upb_String_Init(&output, arena.ptr());
  int benchmark_len = arraysize(kParseBenchmark);
  for (int i = 0; i < benchmark_len; i++) {
    auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
    upb_String_Append(&output, sv.data, sv.size);
  }
  EXPECT_NE(0, upb_String_Size(&output));
}

// These tests validate the Tokenizer's handling of Unicode escapes.

// Encode a single code point as UTF8.
static std::string StandardUTF8(uint32_t code_point) {
  char buffer[4];
  int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);

  EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
  return std::string(reinterpret_cast<const char*>(buffer), count);
}
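
// For reference, a few expected encodings (standard UTF-8 facts, not
// upb-specific behavior):
//   StandardUTF8(0x24)    == "\x24"              ('$', one byte)
//   StandardUTF8(0xa2)    == "\xc2\xa2"          ('¢', two bytes)
//   StandardUTF8(0x20ac)  == "\xe2\x82\xac"      ('€', three bytes)
//   StandardUTF8(0x24b62) == "\xf0\xa4\xad\xa2"  ('𤭢', four bytes)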

static std::string DisplayHex(const std::string& data) {
  std::string output;
  for (size_t i = 0; i < data.size(); ++i) {
    absl::StrAppendFormat(&output, "%02x ", data[i]);
  }
  return output;
}

static void ExpectFormat(const std::string& expectation,
                         const std::string& formatted) {
  upb::Arena arena;
  auto sv = upb_Parse_String(formatted.data(), arena.ptr());
  EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
      << ": Incorrectly parsed " << formatted << ":\nGot      "
      << DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
}

TEST(TokenizerHandlesUnicode, BMPCodes) {
  for (uint32_t code_point = 0; code_point < 0x10000; ++code_point) {
    // The UTF8 encoding of surrogates as single entities is not defined.
    if (upb_Unicode_IsHigh(code_point)) continue;
    if (upb_Unicode_IsLow(code_point)) continue;

    const std::string expectation = StandardUTF8(code_point);

    // Points in the BMP pages can be encoded using either \u with four hex
    // digits, or \U with eight hex digits.
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
  }
}

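// For non-BMP points the test below relies on upb_Unicode_ToHigh/ToLow,
// which are expected to implement the standard UTF-16 surrogate mapping
// (a sketch of the math, not the upb implementation itself):
//   high = 0xD800 + ((cp - 0x10000) >> 10)
//   low  = 0xDC00 + ((cp - 0x10000) & 0x3FF)
// e.g. cp 0x24b62 yields the pair d852/df62 used in ParseString above.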
TEST(TokenizerHandlesUnicode, NonBMPCodes) {
  for (uint32_t code_point = 0x10000; code_point < 0x110000; ++code_point) {
    const std::string expectation = StandardUTF8(code_point);

    // Points in the non-BMP pages can be encoded using either \U with eight
    // hex digits, or using UTF-16 surrogate pairs.
    ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
    ExpectFormat(expectation, absl::StrFormat("'\\u%04x\\u%04x'",
                                              upb_Unicode_ToHigh(code_point),
                                              upb_Unicode_ToLow(code_point)));
  }
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google