Adam Cozzette | 501ecec | 2023-09-26 14:36:20 -0700 | [diff] [blame] | 1 | // Protocol Buffers - Google's data interchange format |
| 2 | // Copyright 2023 Google LLC. All rights reserved. |
| 3 | // |
| 4 | // Use of this source code is governed by a BSD-style |
| 5 | // license that can be found in the LICENSE file or at |
| 6 | // https://developers.google.com/open-source/licenses/bsd |
| 7 | |
| 8 | // Class for parsing tokenized text from a ZeroCopyInputStream. |
| 9 | |
| 10 | #ifndef UPB_IO_TOKENIZER_H_ |
| 11 | #define UPB_IO_TOKENIZER_H_ |
| 12 | |
| 13 | #include "upb/base/status.h" |
| 14 | #include "upb/base/string_view.h" |
| 15 | #include "upb/io/zero_copy_input_stream.h" |
| 16 | #include "upb/mem/arena.h" |
| 17 | |
| 18 | // Must be included last. |
| 19 | #include "upb/port/def.inc" |
| 20 | |
| 21 | #ifdef __cplusplus |
| 22 | extern "C" { |
| 23 | #endif |
| 24 | |
// The kinds of token that the tokenizer can report.
typedef enum {
  kUpb_TokenType_Start,  // Next() has not yet been called.
  kUpb_TokenType_End,    // End of input reached. "text" is empty.

  // A sequence of letters, digits, and underscores, not starting with a digit.
  // It is an error for a number to be followed by an identifier with no space
  // in between.
  kUpb_TokenType_Identifier,

  // A sequence of digits representing an integer. Normally the digits are
  // decimal, but a prefix of "0x" indicates a hex number and a leading zero
  // indicates octal, just like with C numeric literals. A leading negative
  // sign is NOT included in the token; it's up to the parser to interpret the
  // unary minus operator on its own.
  kUpb_TokenType_Integer,

  // A floating point literal, with a fractional part and/or an exponent.
  // Always in decimal. Again, never negative.
  kUpb_TokenType_Float,

  // A quoted sequence of escaped characters.
  // Either single or double quotes can be used, but they must match.
  // A string literal cannot cross a line break.
  kUpb_TokenType_String,

  // Any other printable character, like '!' or '+'.
  // Symbols are always a single character, so "!+$%" is four tokens.
  kUpb_TokenType_Symbol,

  // A sequence of whitespace.
  // This token type is only produced if whitespace reporting is enabled
  // (kUpb_TokenizerOption_ReportWhitespace).
  // It is not reported for whitespace within comments or strings.
  kUpb_TokenType_Whitespace,

  // A newline ('\n'). This token type is only produced if whitespace reporting
  // and newline reporting are both in effect (see
  // kUpb_TokenizerOption_ReportWhitespace and
  // kUpb_TokenizerOption_ReportNewlines).
  // It is not reported for newlines in comments or strings.
  kUpb_TokenType_Newline,
} upb_TokenType;
| 64 | |
// Option flags for the tokenizer. Each enumerator occupies a distinct bit.
// NOTE(review): presumably these are OR-ed together and passed as the
// `options` argument of upb_Tokenizer_New() - confirm against the
// implementation.
typedef enum {
  // Set to allow floats to be suffixed with the letter 'f'. Tokens which would
  // otherwise be integers but which have the 'f' suffix will be forced to be
  // interpreted as floats. For all other purposes, the 'f' is ignored.
  kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,

  // If set, whitespace tokens are reported by Next().
  kUpb_TokenizerOption_ReportWhitespace = 1 << 1,

  // If set, newline tokens are reported by Next().
  // This is a superset of ReportWhitespace.
  kUpb_TokenizerOption_ReportNewlines = 1 << 2,

  // By default the tokenizer expects C-style (/* */) comments.
  // If set, it expects shell-style (#) comments instead.
  kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
} upb_Tokenizer_Option;
| 82 | |
| 83 | typedef struct upb_Tokenizer upb_Tokenizer; |
| 84 | |
| 85 | // Can be passed a flat array and/or a ZCIS as input. |
| 86 | // The array will be read first (if non-NULL), then the stream (if non-NULL). |
| 87 | upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size, |
| 88 | upb_ZeroCopyInputStream* input, int options, |
| 89 | upb_Arena* arena); |
| 90 | |
| 91 | void upb_Tokenizer_Fini(upb_Tokenizer* t); |
| 92 | |
| 93 | // Advance the tokenizer to the next input token. Returns True on success. |
| 94 | // Returns False and (clears *status on EOF, sets *status on error). |
| 95 | bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status); |
| 96 | |
| 97 | // Accessors for inspecting current/previous parse tokens, |
| 98 | // which are opaque to the tokenizer (to reduce copying). |
| 99 | |
| 100 | upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t); |
| 101 | int upb_Tokenizer_Column(const upb_Tokenizer* t); |
| 102 | int upb_Tokenizer_EndColumn(const upb_Tokenizer* t); |
| 103 | int upb_Tokenizer_Line(const upb_Tokenizer* t); |
| 104 | int upb_Tokenizer_TextSize(const upb_Tokenizer* t); |
| 105 | const char* upb_Tokenizer_TextData(const upb_Tokenizer* t); |
| 106 | |
// External helper: validates an identifier given as `size` bytes starting at
// `data`.
// NOTE(review): presumably returns true when the text satisfies the rules
// described for kUpb_TokenType_Identifier - confirm against the
// implementation.
bool upb_Tokenizer_IsIdentifier(const char* data, int size);
| 109 | |
// Parses a kUpb_TokenType_Integer token.
//
// Returns false if the result would be greater than max_value. Otherwise,
// returns true and sets *output to the result.
//
// If the text is not from a token of type kUpb_TokenType_Integer originally
// parsed by a tokenizer, the result is undefined (possibly an assert
// failure).
bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
| 116 | |
// Parses a kUpb_TokenType_Float token.
//
// This never fails, so long as the text actually comes from a
// kUpb_TokenType_Float token parsed by the tokenizer. If it doesn't, the
// result is undefined (possibly an assert failure).
double upb_Parse_Float(const char* text);
| 121 | |
| 122 | // Parses a TYPE_STRING token. This never fails, so long as the text actually |
| 123 | // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the |
| 124 | // result is undefined (possibly an assert failure). |
| 125 | upb_StringView upb_Parse_String(const char* text, upb_Arena* arena); |
| 126 | |
| 127 | #ifdef __cplusplus |
| 128 | } /* extern "C" */ |
| 129 | #endif |
| 130 | |
| 131 | #include "upb/port/undef.inc" |
| 132 | |
| 133 | #endif // UPB_IO_TOKENIZER_H_ |