// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
#include "upb/io/tokenizer.h"
#include <gtest/gtest.h>
#include "absl/strings/escaping.h"
#include "absl/strings/str_format.h"
#include "upb/io/chunked_input_stream.h"
#include "upb/io/string.h"
#include "upb/lex/unicode.h"
#include "upb/mem/arena.hpp"
// Must be last.
#include "upb/port/def.inc"
namespace google {
namespace protobuf {
namespace io {
namespace {
#ifndef arraysize
#define arraysize(a) (sizeof(a) / sizeof(a[0]))
#endif
static bool StringEquals(const char* a, const char* b) {
return strcmp(a, b) == 0;
}
// ===================================================================
// Data-Driven Test Infrastructure
// TODO: This is copied from coded_stream_unittest. This is
// temporary until these features are integrated into gTest itself.
// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest. These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array. TEST_1D
// tests all cases in a single input array. TEST_2D tests all
// combinations of cases from two arrays. The arrays must be statically
// defined such that the arraysize() macro works on them. Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
// EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero. In case of failure, the exact case
// which failed will be printed. The case type must be printable using
// ostream::operator<<.
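//
// TEST_2D works the same way but takes two case arrays and runs the body
// once per combination of cases. A sketch (kCases as above; kSizes is
// hypothetical):
//
// int kSizes[] = {1, 2};
// TEST_2D(MyFixture, MyOtherTest, kCases, kSizes) {
// EXPECT_GT(kCases_case * kSizes_case, 0);
// }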
#define TEST_1D(FIXTURE, NAME, CASES) \
class FIXTURE##_##NAME##_DD : public FIXTURE { \
protected: \
template <typename CaseType> \
void DoSingleCase(const CaseType& CASES##_case); \
}; \
\
TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
for (size_t i = 0; i < arraysize(CASES); i++) { \
SCOPED_TRACE(testing::Message() \
<< #CASES " case #" << i << ": " << CASES[i]); \
DoSingleCase(CASES[i]); \
} \
} \
\
template <typename CaseType> \
void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2) \
class FIXTURE##_##NAME##_DD : public FIXTURE { \
protected: \
template <typename CaseType1, typename CaseType2> \
void DoSingleCase(const CaseType1& CASES1##_case, \
const CaseType2& CASES2##_case); \
}; \
\
TEST_F(FIXTURE##_##NAME##_DD, NAME) { \
for (size_t i = 0; i < arraysize(CASES1); i++) { \
for (size_t j = 0; j < arraysize(CASES2); j++) { \
SCOPED_TRACE(testing::Message() \
<< #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
<< #CASES2 " case #" << j << ": " << CASES2[j]); \
DoSingleCase(CASES1[i], CASES2[j]); \
} \
} \
} \
\
template <typename CaseType1, typename CaseType2> \
void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
const CaseType2& CASES2##_case)
// -------------------------------------------------------------------
// In C, a size of zero from ZCIS_Next() means EOF, so we can't use the same
// trick here that the C++ version uses. Use ChunkedInputStream instead.
upb_ZeroCopyInputStream* TestInputStream(const void* data, size_t size,
size_t block_size, upb_Arena* arena) {
return upb_ChunkedInputStream_New(data, size, block_size, arena);
}
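// For example, with size 7 and block_size 3 the stream is expected to return
// "foo bar" as the chunks "foo", " ba", "r", so the tests below routinely
// exercise tokens that straddle chunk boundaries.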
// -------------------------------------------------------------------
// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't. This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
class TokenizerTest : public testing::Test {
protected:
// For easy testing.
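// e.g. ParseInteger("123") == 123 and ParseInteger("0x123") == 0x123.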
uint64_t ParseInteger(const std::string& text) {
uint64_t result;
EXPECT_TRUE(upb_Parse_Integer(text.data(), UINT64_MAX, &result))
<< "'" << text << "'";
return result;
}
};
// ===================================================================
// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
std::string input;
upb_TokenType type;
};
inline std::ostream& operator<<(std::ostream& out,
const SimpleTokenCase& test_case) {
return out << absl::CEscape(test_case.input);
}
SimpleTokenCase kSimpleTokenCases[] = {
// Test identifiers.
{"hello", kUpb_TokenType_Identifier},
// Test integers.
{"123", kUpb_TokenType_Integer},
{"0xab6", kUpb_TokenType_Integer},
{"0XAB6", kUpb_TokenType_Integer},
{"0X1234567", kUpb_TokenType_Integer},
{"0x89abcdef", kUpb_TokenType_Integer},
{"0x89ABCDEF", kUpb_TokenType_Integer},
{"01234567", kUpb_TokenType_Integer},
// Test floats.
{"123.45", kUpb_TokenType_Float},
{"1.", kUpb_TokenType_Float},
{"1e3", kUpb_TokenType_Float},
{"1E3", kUpb_TokenType_Float},
{"1e-3", kUpb_TokenType_Float},
{"1e+3", kUpb_TokenType_Float},
{"1.e3", kUpb_TokenType_Float},
{"1.2e3", kUpb_TokenType_Float},
{".1", kUpb_TokenType_Float},
{".1e3", kUpb_TokenType_Float},
{".1e-3", kUpb_TokenType_Float},
{".1e+3", kUpb_TokenType_Float},
// Test strings.
{"'hello'", kUpb_TokenType_String},
{"\"foo\"", kUpb_TokenType_String},
{"'a\"b'", kUpb_TokenType_String},
{"\"a'b\"", kUpb_TokenType_String},
{"'a\\'b'", kUpb_TokenType_String},
{"\"a\\\"b\"", kUpb_TokenType_String},
{"'\\xf'", kUpb_TokenType_String},
{"'\\0'", kUpb_TokenType_String},
// Test symbols.
{"+", kUpb_TokenType_Symbol},
{".", kUpb_TokenType_Symbol},
};
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
upb::Arena arena;
// Set up the tokenizer.
auto input = TestInputStream(kSimpleTokenCases_case.input.data(),
kSimpleTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
EXPECT_EQ(upb_Tokenizer_Line(t), 0);
EXPECT_EQ(upb_Tokenizer_Column(t), 0);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
// Parse the token.
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
// Check that it has the right type.
EXPECT_EQ(upb_Tokenizer_Type(t), kSimpleTokenCases_case.type);
// Check that it contains the complete input text.
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
kSimpleTokenCases_case.input.data()));
// Check that it is located at the beginning of the input.
EXPECT_EQ(upb_Tokenizer_Line(t), 0);
EXPECT_EQ(upb_Tokenizer_Column(t), 0);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
upb_Status status;
upb_Status_Clear(&status);
// There should be no more input and no errors.
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
// After Next() returns false, the token should have type TYPE_END.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_End);
EXPECT_EQ(upb_Tokenizer_Line(t), 0);
EXPECT_EQ(upb_Tokenizer_Column(t), kSimpleTokenCases_case.input.size());
EXPECT_EQ(upb_Tokenizer_EndColumn(t), kSimpleTokenCases_case.input.size());
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
}
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
// Test the "allow_f_after_float" option.
// Set up the tokenizer.
upb::Arena arena;
const char* text = "1f 2.5f 6e3f 7F";
auto input =
TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
const int options = kUpb_TokenizerOption_AllowFAfterFloat;
auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());
// Advance through tokens and check that they are parsed as expected.
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "1f"));
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "2.5f"));
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "6e3f"));
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Float);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), "7F"));
upb_Status status;
upb_Status_Clear(&status);
// There should be no more input and no errors.
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
}
SimpleTokenCase kWhitespaceTokenCases[] = {
{" ", kUpb_TokenType_Whitespace},
{" ", kUpb_TokenType_Whitespace},
{"\t", kUpb_TokenType_Whitespace},
{"\v", kUpb_TokenType_Whitespace},
{"\t ", kUpb_TokenType_Whitespace},
{"\v\t", kUpb_TokenType_Whitespace},
{" \t\r", kUpb_TokenType_Whitespace},
// Newlines:
{"\n", kUpb_TokenType_Newline},
};
TEST_2D(TokenizerTest, Whitespace, kWhitespaceTokenCases, kBlockSizes) {
upb::Arena arena;
{
auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
kWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
}
{
auto input = TestInputStream(kWhitespaceTokenCases_case.input.data(),
kWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
const int options = kUpb_TokenizerOption_ReportNewlines;
auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_EQ(upb_Tokenizer_Type(t), kWhitespaceTokenCases_case.type);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t),
kWhitespaceTokenCases_case.input.data()));
EXPECT_FALSE(upb_Tokenizer_Next(t, nullptr));
}
}
#endif
// -------------------------------------------------------------------
struct TokenFields {
upb_TokenType type;
std::string text;
size_t line;
size_t column;
size_t end_column;
};
// In each case, the input is parsed to produce a list of tokens. The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
std::string input;
std::vector<TokenFields> output;
};
inline std::ostream& operator<<(std::ostream& out,
const MultiTokenCase& test_case) {
return out << absl::CEscape(test_case.input);
}
MultiTokenCase kMultiTokenCases[] = {
// Test empty input.
{"",
{
{kUpb_TokenType_End, "", 0, 0, 0},
}},
// Test all token types at the same time.
{"foo 1 1.2 + 'bar'",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Integer, "1", 0, 4, 5},
{kUpb_TokenType_Float, "1.2", 0, 6, 9},
{kUpb_TokenType_Symbol, "+", 0, 10, 11},
{kUpb_TokenType_String, "'bar'", 0, 12, 17},
{kUpb_TokenType_End, "", 0, 17, 17},
}},
// Test that consecutive symbols are parsed as separate tokens.
{"!@+%",
{
{kUpb_TokenType_Symbol, "!", 0, 0, 1},
{kUpb_TokenType_Symbol, "@", 0, 1, 2},
{kUpb_TokenType_Symbol, "+", 0, 2, 3},
{kUpb_TokenType_Symbol, "%", 0, 3, 4},
{kUpb_TokenType_End, "", 0, 4, 4},
}},
// Test that newlines affect line numbers correctly.
{"foo bar\nrab oof",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Identifier, "bar", 0, 4, 7},
{kUpb_TokenType_Identifier, "rab", 1, 0, 3},
{kUpb_TokenType_Identifier, "oof", 1, 4, 7},
{kUpb_TokenType_End, "", 1, 7, 7},
}},
// Test that tabs affect column numbers correctly.
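// (Tab stops are every 8 columns: the tab after "foo" advances the column
// from 3 to 8, and the tab after "bar " advances it from 12 to 16.)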
{"foo\tbar \tbaz",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Identifier, "bar", 0, 8, 11},
{kUpb_TokenType_Identifier, "baz", 0, 16, 19},
{kUpb_TokenType_End, "", 0, 19, 19},
}},
// Test that tabs in string literals affect column numbers correctly.
{"\"foo\tbar\" baz",
{
{kUpb_TokenType_String, "\"foo\tbar\"", 0, 0, 12},
{kUpb_TokenType_Identifier, "baz", 0, 13, 16},
{kUpb_TokenType_End, "", 0, 16, 16},
}},
// Test that line comments are ignored.
{"foo // This is a comment\n"
"bar // This is another comment",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Identifier, "bar", 1, 0, 3},
{kUpb_TokenType_End, "", 1, 30, 30},
}},
// Test that block comments are ignored.
{"foo /* This is a block comment */ bar",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Identifier, "bar", 0, 34, 37},
{kUpb_TokenType_End, "", 0, 37, 37},
}},
// Test that sh-style comments are not ignored by default.
{"foo # bar\n"
"baz",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Symbol, "#", 0, 4, 5},
{kUpb_TokenType_Identifier, "bar", 0, 6, 9},
{kUpb_TokenType_Identifier, "baz", 1, 0, 3},
{kUpb_TokenType_End, "", 1, 3, 3},
}},
// Test all whitespace chars
{"foo\n\t\r\v\fbar",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Identifier, "bar", 1, 11, 14},
{kUpb_TokenType_End, "", 1, 14, 14},
}},
};
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
// Set up the tokenizer.
upb::Arena arena;
auto input = TestInputStream(kMultiTokenCases_case.input.data(),
kMultiTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
EXPECT_EQ(upb_Tokenizer_Line(t), 0);
EXPECT_EQ(upb_Tokenizer_Column(t), 0);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
// Loop through all expected tokens.
TokenFields token_fields;
upb_Status status;
upb_Status_Clear(&status);
int i = 0;
do {
token_fields = kMultiTokenCases_case.output[i++];
SCOPED_TRACE(testing::Message()
<< "Token #" << i << ": " << absl::CEscape(token_fields.text));
// Next() should only return false when it hits the end token.
if (token_fields.type == kUpb_TokenType_End) {
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
} else {
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
}
// Check that the token matches the expected one.
EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
EXPECT_EQ(upb_Tokenizer_TextSize(t), token_fields.text.size());
EXPECT_TRUE(
StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
} while (token_fields.type != kUpb_TokenType_End);
}
MultiTokenCase kMultiWhitespaceTokenCases[] = {
// Test all token types at the same time.
{"foo 1 \t1.2 \n +\v'bar'",
{
{kUpb_TokenType_Identifier, "foo", 0, 0, 3},
{kUpb_TokenType_Whitespace, " ", 0, 3, 4},
{kUpb_TokenType_Integer, "1", 0, 4, 5},
{kUpb_TokenType_Whitespace, " \t", 0, 5, 8},
{kUpb_TokenType_Float, "1.2", 0, 8, 11},
{kUpb_TokenType_Whitespace, " ", 0, 11, 13},
{kUpb_TokenType_Newline, "\n", 0, 13, 0},
{kUpb_TokenType_Whitespace, " ", 1, 0, 3},
{kUpb_TokenType_Symbol, "+", 1, 3, 4},
{kUpb_TokenType_Whitespace, "\v", 1, 4, 5},
{kUpb_TokenType_String, "'bar'", 1, 5, 10},
{kUpb_TokenType_End, "", 1, 10, 10},
}},
};
TEST_2D(TokenizerTest, MultipleWhitespaceTokens, kMultiWhitespaceTokenCases,
kBlockSizes) {
// Set up the tokenizer.
upb::Arena arena;
auto input = TestInputStream(kMultiWhitespaceTokenCases_case.input.data(),
kMultiWhitespaceTokenCases_case.input.size(),
kBlockSizes_case, arena.ptr());
const int options = kUpb_TokenizerOption_ReportNewlines;
auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());
// Before Next() is called, the initial token should always be TYPE_START.
EXPECT_EQ(upb_Tokenizer_Type(t), kUpb_TokenType_Start);
EXPECT_EQ(upb_Tokenizer_Line(t), 0);
EXPECT_EQ(upb_Tokenizer_Column(t), 0);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), 0);
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), ""));
// Loop through all expected tokens.
TokenFields token_fields;
upb_Status status;
upb_Status_Clear(&status);
int i = 0;
do {
token_fields = kMultiWhitespaceTokenCases_case.output[i++];
SCOPED_TRACE(testing::Message()
<< "Token #" << i << ": " << token_fields.text);
// Next() should only return false when it hits the end token.
if (token_fields.type == kUpb_TokenType_End) {
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
} else {
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
}
// Check that the token matches the expected one.
EXPECT_EQ(upb_Tokenizer_Type(t), token_fields.type);
EXPECT_EQ(upb_Tokenizer_Line(t), token_fields.line);
EXPECT_EQ(upb_Tokenizer_Column(t), token_fields.column);
EXPECT_EQ(upb_Tokenizer_EndColumn(t), token_fields.end_column);
EXPECT_TRUE(
StringEquals(upb_Tokenizer_TextData(t), token_fields.text.data()));
} while (token_fields.type != kUpb_TokenType_End);
}
// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
// "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
// Test the "comment_style" option.
const char* text =
"foo # bar\n"
"baz // qux\n"
"corge /* grault */\n"
"garply";
const char* const kTokens[] = {"foo", // "# bar" is ignored
"baz", "/", "/", "qux", "corge", "/",
"*", "grault", "*", "/", "garply"};
// Set up the tokenizer.
upb::Arena arena;
auto input =
TestInputStream(text, strlen(text), kBlockSizes_case, arena.ptr());
const int options = kUpb_TokenizerOption_CommentStyleShell;
auto t = upb_Tokenizer_New(nullptr, 0, input, options, arena.ptr());
// Advance through tokens and check that they are parsed as expected.
for (size_t i = 0; i < arraysize(kTokens); i++) {
EXPECT_TRUE(upb_Tokenizer_Next(t, nullptr));
EXPECT_TRUE(StringEquals(upb_Tokenizer_TextData(t), kTokens[i]));
}
// There should be no more input and no errors.
upb_Status status;
upb_Status_Clear(&status);
EXPECT_FALSE(upb_Tokenizer_Next(t, &status));
EXPECT_TRUE(upb_Status_IsOk(&status));
}
#endif
// -------------------------------------------------------------------
#if 0 // TODO: Extended comments are currently unimplemented.
// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
std::string input;
const char* prev_trailing_comments;
const char* detached_comments[10];
const char* next_leading_comments;
};
inline std::ostream& operator<<(std::ostream& out,
const DocCommentCase& test_case) {
return out << absl::CEscape(test_case.input);
}
DocCommentCase kDocCommentCases[] = {
{"prev next",
"",
{},
""},
{"prev /* ignored */ next",
"",
{},
""},
{"prev // trailing comment\n"
"next",
" trailing comment\n",
{},
""},
{"prev\n"
"// leading comment\n"
"// line 2\n"
"next",
"",
{},
" leading comment\n"
" line 2\n"},
{"prev\n"
"// trailing comment\n"
"// line 2\n"
"\n"
"next",
" trailing comment\n"
" line 2\n",
{},
""},
{"prev // trailing comment\n"
"// leading comment\n"
"// line 2\n"
"next",
" trailing comment\n",
{},
" leading comment\n"
" line 2\n"},
{"prev /* trailing block comment */\n"
"/* leading block comment\n"
" * line 2\n"
" * line 3 */"
"next",
" trailing block comment ",
{},
" leading block comment\n"
" line 2\n"
" line 3 "},
{"prev\n"
"/* trailing block comment\n"
" * line 2\n"
" * line 3\n"
" */\n"
"/* leading block comment\n"
" * line 2\n"
" * line 3 */"
"next",
" trailing block comment\n"
" line 2\n"
" line 3\n",
{},
" leading block comment\n"
" line 2\n"
" line 3 "},
{"prev\n"
"// trailing comment\n"
"\n"
"// detached comment\n"
"// line 2\n"
"\n"
"// second detached comment\n"
"/* third detached comment\n"
" * line 2 */\n"
"// leading comment\n"
"next",
" trailing comment\n",
{" detached comment\n"
" line 2\n",
" second detached comment\n",
" third detached comment\n"
" line 2 "},
" leading comment\n"},
{"prev /**/\n"
"\n"
"// detached comment\n"
"\n"
"// leading comment\n"
"next",
"",
{" detached comment\n"},
" leading comment\n"},
{"prev /**/\n"
"// leading comment\n"
"next",
"",
{},
" leading comment\n"},
};
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
// Set up the tokenizer.
TestInputStream input(kDocCommentCases_case.input.data(),
kDocCommentCases_case.input.size(), kBlockSizes_case);
TestErrorCollector error_collector;
Tokenizer tokenizer(&input, &error_collector);
// Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
TestInputStream input2(kDocCommentCases_case.input.data(),
kDocCommentCases_case.input.size(), kBlockSizes_case);
Tokenizer tokenizer2(&input2, &error_collector);
tokenizer.Next();
tokenizer2.Next();
EXPECT_EQ("prev", tokenizer.current().text);
EXPECT_EQ("prev", tokenizer2.current().text);
std::string prev_trailing_comments;
std::vector<std::string> detached_comments;
std::string next_leading_comments;
tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
&next_leading_comments);
tokenizer2.NextWithComments(nullptr, nullptr, nullptr);
EXPECT_EQ("next", tokenizer.current().text);
EXPECT_EQ("next", tokenizer2.current().text);
EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
prev_trailing_comments);
for (size_t i = 0; i < detached_comments.size(); i++) {
EXPECT_LT(i, arraysize(kDocCommentCases_case.detached_comments));
EXPECT_TRUE(kDocCommentCases_case.detached_comments[i] != nullptr);
EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
}
// Verify that we matched all the detached comments.
EXPECT_EQ(nullptr,
kDocCommentCases_case.detached_comments[detached_comments.size()]);
EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
}
#endif // 0
// -------------------------------------------------------------------
// Test parse helpers.
// TODO: Add a fuzz test for this.
TEST_F(TokenizerTest, ParseInteger) {
EXPECT_EQ(0, ParseInteger("0"));
EXPECT_EQ(123, ParseInteger("123"));
EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
EXPECT_EQ(UINT64_MAX, ParseInteger("0xFFFFFFFFFFFFFFFF"));
EXPECT_EQ(01234567, ParseInteger("01234567"));
EXPECT_EQ(0X123, ParseInteger("0X123"));
// Test invalid integers that may still be tokenized as integers.
EXPECT_EQ(0, ParseInteger("0x"));
uint64_t i;
// Test invalid integers that will never be tokenized as integers.
EXPECT_FALSE(upb_Parse_Integer("zxy", UINT64_MAX, &i));
EXPECT_FALSE(upb_Parse_Integer("1.2", UINT64_MAX, &i));
EXPECT_FALSE(upb_Parse_Integer("08", UINT64_MAX, &i));
EXPECT_FALSE(upb_Parse_Integer("0xg", UINT64_MAX, &i));
EXPECT_FALSE(upb_Parse_Integer("-1", UINT64_MAX, &i));
// Test overflows.
EXPECT_TRUE(upb_Parse_Integer("0", 0, &i));
EXPECT_FALSE(upb_Parse_Integer("1", 0, &i));
EXPECT_TRUE(upb_Parse_Integer("1", 1, &i));
EXPECT_TRUE(upb_Parse_Integer("12345", 12345, &i));
EXPECT_FALSE(upb_Parse_Integer("12346", 12345, &i));
EXPECT_TRUE(upb_Parse_Integer("0xFFFFFFFFFFFFFFFF", UINT64_MAX, &i));
EXPECT_FALSE(upb_Parse_Integer("0x10000000000000000", UINT64_MAX, &i));
// Test near the limits of signed parsing (values within INT64_MAX +/- 1600).
for (int64_t offset = -1600; offset <= 1600; ++offset) {
// We make sure to perform an unsigned addition so that we avoid signed
// overflow, which would be undefined behavior.
uint64_t i = 0x7FFFFFFFFFFFFFFFu + static_cast<uint64_t>(offset);
char decimal[32];
snprintf(decimal, 32, "%llu", static_cast<unsigned long long>(i));
if (offset > 0) {
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
<< decimal << "=>" << parsed;
} else {
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(decimal, INT64_MAX, &parsed))
<< decimal << "=>" << parsed;
EXPECT_EQ(parsed, i);
}
char octal[32];
snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
if (offset > 0) {
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
<< octal << "=>" << parsed;
} else {
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(octal, INT64_MAX, &parsed))
<< octal << "=>" << parsed;
EXPECT_EQ(parsed, i);
}
char hex[32];
snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(i));
if (offset > 0) {
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(hex, INT64_MAX, &parsed))
<< hex << "=>" << parsed;
} else {
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(hex, INT64_MAX, &parsed)) << hex;
EXPECT_EQ(parsed, i);
}
}
// Test near the limits of unsigned parsing (values within UINT64_MAX +/- 1600).
// By definition, values greater than UINT64_MAX cannot be held in a uint64_t
// variable, so printing them is a little tricky; fortunately all but the
// last four digits are known, so we can hard-code them in the printf string,
// and we only need to format the last 4.
for (int64_t offset = -1600; offset <= 1600; ++offset) {
{
uint64_t i = 18446744073709551615u + offset;
char decimal[32];
snprintf(decimal, 32, "1844674407370955%04llu",
static_cast<unsigned long long>(1615 + offset));
if (offset > 0) {
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed))
<< decimal << "=>" << parsed;
} else {
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(decimal, UINT64_MAX, &parsed)) << decimal;
EXPECT_EQ(parsed, i);
}
}
{
uint64_t i = 01777777777777777777777u + offset;
if (offset > 0) {
char octal[32];
snprintf(octal, 32, "0200000000000000000%04llo",
static_cast<unsigned long long>(offset - 1));
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(octal, UINT64_MAX, &parsed))
<< octal << "=>" << parsed;
} else {
char octal[32];
snprintf(octal, 32, "0%llo", static_cast<unsigned long long>(i));
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(octal, UINT64_MAX, &parsed)) << octal;
EXPECT_EQ(parsed, i);
}
}
{
uint64_t ui = 0xffffffffffffffffu + offset;
char hex[32];
if (offset > 0) {
snprintf(hex, 32, "0x1000000000000%04llx",
static_cast<unsigned long long>(offset - 1));
uint64_t parsed = -1;
EXPECT_FALSE(upb_Parse_Integer(hex, UINT64_MAX, &parsed))
<< hex << "=>" << parsed;
} else {
snprintf(hex, 32, "0x%llx", static_cast<unsigned long long>(ui));
uint64_t parsed = -1;
EXPECT_TRUE(upb_Parse_Integer(hex, UINT64_MAX, &parsed)) << hex;
EXPECT_EQ(parsed, ui);
}
}
}
}
TEST_F(TokenizerTest, ParseFloat) {
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1."));
EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1e3"));
EXPECT_DOUBLE_EQ(1e3, upb_Parse_Float("1E3"));
EXPECT_DOUBLE_EQ(1.5e3, upb_Parse_Float("1.5e3"));
EXPECT_DOUBLE_EQ(.1, upb_Parse_Float(".1"));
EXPECT_DOUBLE_EQ(.25, upb_Parse_Float(".25"));
EXPECT_DOUBLE_EQ(.1e3, upb_Parse_Float(".1e3"));
EXPECT_DOUBLE_EQ(.25e3, upb_Parse_Float(".25e3"));
EXPECT_DOUBLE_EQ(.1e+3, upb_Parse_Float(".1e+3"));
EXPECT_DOUBLE_EQ(.1e-3, upb_Parse_Float(".1e-3"));
EXPECT_DOUBLE_EQ(5, upb_Parse_Float("5"));
EXPECT_DOUBLE_EQ(6e-12, upb_Parse_Float("6e-12"));
EXPECT_DOUBLE_EQ(1.2, upb_Parse_Float("1.2"));
EXPECT_DOUBLE_EQ(1.e2, upb_Parse_Float("1.e2"));
// Test invalid floats that may still be tokenized as floats.
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e"));
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1e-"));
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.e"));
// Test 'f' suffix.
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1f"));
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1.0f"));
EXPECT_DOUBLE_EQ(1, upb_Parse_Float("1F"));
// These should parse successfully even though they are out of range.
// Overflows become infinity and underflows become zero.
EXPECT_EQ(0.0, upb_Parse_Float("1e-9999999999999999999999999999"));
EXPECT_EQ(HUGE_VAL, upb_Parse_Float("1e+9999999999999999999999999999"));
#if GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
// Test invalid floats that will never be tokenized as floats.
EXPECT_DEBUG_DEATH(
upb_Parse_Float("zxy"),
"passed text that could not have been tokenized as a float");
EXPECT_DEBUG_DEATH(
upb_Parse_Float("1-e0"),
"passed text that could not have been tokenized as a float");
EXPECT_DEBUG_DEATH(
upb_Parse_Float("-1.0"),
"passed text that could not have been tokenized as a float");
#endif // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseString) {
const std::string inputs[] = {
"'hello'",
"\"blah\\nblah2\"",
"'\\1x\\1\\123\\739\\52\\334n\\3'",
"'\\x20\\x4'",
// Test invalid strings that may still be tokenized as strings.
"\"\\a\\l\\v\\t", // \l is invalid
"'",
"'\\",
// Experiment with Unicode escapes.
// Here are one-, two- and three-byte Unicode characters.
"'\\u0024\\u00a2\\u20ac\\U00024b62XX'",
"'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", // Same, encoded using UTF16.
// Here's some broken UTF16: a head surrogate with no tail surrogate.
// We just output this as if it were UTF8; it's not a defined code point,
// but it has a defined encoding.
"'\\ud852XX'",
// Malformed escape: Demons may fly out of the nose.
"'\\u0'",
// Beyond the range of valid UTF-32 code units.
"'\\U00110000\\U00200000\\UFFFFFFFF'",
};
const std::string outputs[] = {
"hello",
"blah\nblah2",
"\1x\1\123\739\52\334n\3",
"\x20\x4",
"\a?\v\t",
"",
"\\",
"$¢€𤭢XX",
"$¢€𤭢XX",
"\xed\xa1\x92XX",
"u0",
"\\U00110000\\U00200000\\Uffffffff",
};
upb::Arena arena;
for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
auto sv = upb_Parse_String(inputs[i].data(), arena.ptr());
EXPECT_TRUE(StringEquals(sv.data, outputs[i].data()));
}
// Test invalid strings that will never be tokenized as strings.
#if GTEST_HAS_DEATH_TEST // death tests do not work on Windows yet
EXPECT_DEBUG_DEATH(
upb_Parse_String("", arena.ptr()),
"passed text that could not have been tokenized as a string");
#endif // GTEST_HAS_DEATH_TEST
}
TEST_F(TokenizerTest, ParseStringAppend) {
upb::Arena arena;
upb_String output;
upb_String_Init(&output, arena.ptr());
upb_String_Assign(&output, "stuff+", 6);
auto sv = upb_Parse_String("'hello'", arena.ptr());
EXPECT_TRUE(StringEquals(sv.data, "hello"));
upb_String_Append(&output, sv.data, sv.size);
EXPECT_TRUE(StringEquals(upb_String_Data(&output), "stuff+hello"));
}
// -------------------------------------------------------------------
// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
std::string input;
const char* errors;
};
inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
return out << absl::CEscape(test_case.input);
}
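// Expected errors are formatted as "line:column: message" with zero-based
// line and column numbers.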
ErrorCase kErrorCases[] = {
// String errors.
{"'\\l'", "0:2: Invalid escape sequence in string literal."},
{"'\\X'", "0:2: Invalid escape sequence in string literal."},
{"'\\x'", "0:3: Expected hex digits for escape sequence."},
{"'foo", "0:4: Unexpected end of string."},
{"'bar\nfoo", "0:4: String literals cannot cross line boundaries."},
{"'\\u01'", "0:5: Expected four hex digits for \\u escape sequence."},
{"'\\uXYZ'", "0:3: Expected four hex digits for \\u escape sequence."},
// Integer errors.
{"123foo", "0:3: Need space between number and identifier."},
// Hex/octal errors.
{"0x foo", "0:2: \"0x\" must be followed by hex digits."},
{"0541823", "0:4: Numbers starting with leading zero must be in octal."},
{"0x123z", "0:5: Need space between number and identifier."},
{"0x123.4", "0:5: Hex and octal numbers must be integers."},
{"0123.4", "0:4: Hex and octal numbers must be integers."},
// Float errors.
{"1e foo", "0:2: \"e\" must be followed by exponent."},
{"1e- foo", "0:3: \"e\" must be followed by exponent."},
{"1.2.3",
"0:3: Already saw decimal point or exponent; can't have another one."},
{"1e2.3",
"0:3: Already saw decimal point or exponent; can't have another one."},
{"a.1", "0:1: Need space between identifier and decimal point."},
// allow_f_after_float not enabled, so this should be an error.
{"1.0f", "0:3: Need space between number and identifier."},
// Block comment errors.
{"/*",
"0:2: End-of-file inside block comment.\n0:0: Comment started here."},
{"/*/*/ foo",
"0:3: \"/*\" inside block comment. Block comments cannot be nested."},
// Control characters. Multiple consecutive control characters should only
// produce one error.
{"\b foo", "0:0: Invalid control characters encountered in text."},
{"\b\b foo", "0:0: Invalid control characters encountered in text."},
// Check that control characters at end of input don't result in an
// infinite loop.
{"\b", "0:0: Invalid control characters encountered in text."},
// Check recovery from '\0'. We have to explicitly specify the length of
// these strings because otherwise the string constructor will just call
// strlen() which will see the first '\0' and think that is the end of the
// string.
{std::string("\0foo", 4),
"0:0: Invalid control characters encountered in text."},
{std::string("\0\0foo", 5),
"0:0: Invalid control characters encountered in text."},
// Check error from high order bits set
{"\300", "0:0: Interpreting non ascii codepoint 192."},
};
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
// Set up the tokenizer.
upb::Arena arena;
auto input = TestInputStream(kErrorCases_case.input.data(),
kErrorCases_case.input.size(), kBlockSizes_case,
arena.ptr());
auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
upb_Status status;
upb_Status_Clear(&status);
while (upb_Tokenizer_Next(t, &status))
; // just keep looping
EXPECT_TRUE(
StringEquals(upb_Status_ErrorMessage(&status), kErrorCases_case.errors));
}
// -------------------------------------------------------------------
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
const std::string text = "foo bar";
upb::Arena arena;
auto input =
TestInputStream(text.data(), text.size(), kBlockSizes_case, arena.ptr());
// Create a tokenizer, read one token, then destroy it.
auto t = upb_Tokenizer_New(nullptr, 0, input, 0, arena.ptr());
upb_Tokenizer_Next(t, nullptr);
upb_Tokenizer_Fini(t);
// Only "foo" should have been read.
EXPECT_EQ(strlen("foo"), upb_ZeroCopyInputStream_ByteCount(input));
}
static const char* kParseBenchmark[] = {
"\"partner-google-mobile-modes-print\"",
"\"partner-google-mobile-modes-products\"",
"\"partner-google-mobile-modes-realtime\"",
"\"partner-google-mobile-modes-video\"",
"\"partner-google-modes-news\"",
"\"partner-google-modes-places\"",
"\"partner-google-news\"",
"\"partner-google-print\"",
"\"partner-google-products\"",
"\"partner-google-realtime\"",
"\"partner-google-video\"",
"\"true\"",
"\"BigImagesHover__js_list\"",
"\"XFEExternJsVersionParameters\"",
"\"Available versions of the big images hover javascript\"",
"\"Version: {\n\"",
"\" script_name: \"extern_js/dummy_file_compiled_post20070813.js\"\n\"",
"\" version_number: 0\n\"",
"\"}\"",
"\"BigImagesHover__js_selection\"",
"\"XFEExternJsVersionParameters\"",
"\"Versioning info for the big images hover javascript.\"",
"\"current_version: 0\"",
"\"BigImagesHover__js_suppressed\"",
"\"Indicates if the client-side javascript associated with big images.\"",
"\"true\"",
"\"BrowserAnyOf\"",
"\"IsChrome5OrAbove\"",
"\"IsFirefox3OrAbove\"",
"IsIE8OrAboveBinary",
"\"Abe \"Sausage King\" Froman\"",
"\"Frank \"Meatball\" Febbraro\"",
};
TEST(Benchmark, ParseStringAppendAccumulate) {
upb::Arena arena;
size_t outsize = 0;
int benchmark_len = arraysize(kParseBenchmark);
for (int i = 0; i < benchmark_len; i++) {
auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
outsize += sv.size;
}
EXPECT_NE(0, outsize);
}
TEST(Benchmark, ParseStringAppend) {
upb::Arena arena;
upb_String output;
upb_String_Init(&output, arena.ptr());
int benchmark_len = arraysize(kParseBenchmark);
for (int i = 0; i < benchmark_len; i++) {
auto sv = upb_Parse_String(kParseBenchmark[i], arena.ptr());
upb_String_Append(&output, sv.data, sv.size);
}
EXPECT_NE(0, upb_String_Size(&output));
}
// These tests validate the Tokenizer's handling of Unicode escapes.
// Encode a single code point as UTF8.
static std::string StandardUTF8(uint32_t code_point) {
char buffer[4];
int count = upb_Unicode_ToUTF8(code_point, &buffer[0]);
EXPECT_NE(count, 0) << "Failed to encode point " << std::hex << code_point;
return std::string(reinterpret_cast<const char*>(buffer), count);
}
static std::string DisplayHex(const std::string& data) {
std::string output;
for (size_t i = 0; i < data.size(); ++i) {
absl::StrAppendFormat(&output, "%02x ", data[i]);
}
return output;
}
static void ExpectFormat(const std::string& expectation,
const std::string& formatted) {
upb::Arena arena;
auto sv = upb_Parse_String(formatted.data(), arena.ptr());
EXPECT_EQ(strcmp(sv.data, expectation.data()), 0)
<< ": Incorrectly parsed " << formatted << ":\nGot "
<< DisplayHex(sv.data) << "\nExpected " << DisplayHex(expectation);
}
TEST(TokenizerHandlesUnicode, BMPCodes) {
for (uint32_t code_point = 0; code_point < 0x10000; ++code_point) {
// The UTF8 encoding of surrogates as single entities is not defined.
if (upb_Unicode_IsHigh(code_point)) continue;
if (upb_Unicode_IsLow(code_point)) continue;
const std::string expectation = StandardUTF8(code_point);
// Points in the BMP pages can be encoded using either \u with four hex
// digits, or \U with eight hex digits.
ExpectFormat(expectation, absl::StrFormat("'\\u%04x'", code_point));
ExpectFormat(expectation, absl::StrFormat("'\\u%04X'", code_point));
ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
}
}
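// For reference, the UTF-16 surrogate pair for a code point cp >= 0x10000 is
// high = 0xd800 + ((cp - 0x10000) >> 10) and low = 0xdc00 + ((cp - 0x10000) & 0x3ff);
// upb_Unicode_ToHigh() and upb_Unicode_ToLow() are assumed to implement this
// mapping.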
TEST(TokenizerHandlesUnicode, NonBMPCodes) {
for (uint32_t code_point = 0x10000; code_point < 0x110000; ++code_point) {
const std::string expectation = StandardUTF8(code_point);
// Points in the non-BMP pages can be encoded using either \U with eight hex
// digits, or using UTF-16 surrogate pairs.
ExpectFormat(expectation, absl::StrFormat("'\\U%08x'", code_point));
ExpectFormat(expectation, absl::StrFormat("'\\U%08X'", code_point));
ExpectFormat(expectation, absl::StrFormat("'\\u%04x\\u%04x'",
upb_Unicode_ToHigh(code_point),
upb_Unicode_ToLow(code_point)));
}
}
} // namespace
} // namespace io
} // namespace protobuf
} // namespace google