blob: 3ef7f30a6e8a04ef53d3c29d1f0b85d6d078629b [file] [log] [blame]
Adam Cozzette501ecec2023-09-26 14:36:20 -07001// Protocol Buffers - Google's data interchange format
2// Copyright 2023 Google LLC. All rights reserved.
3//
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file or at
6// https://developers.google.com/open-source/licenses/bsd
7
8// Tokenizer that splits text from a flat array and/or a ZeroCopyInputStream
// into tokens.
9
10#ifndef UPB_IO_TOKENIZER_H_
11#define UPB_IO_TOKENIZER_H_
12
13#include "upb/base/status.h"
14#include "upb/base/string_view.h"
15#include "upb/io/zero_copy_input_stream.h"
16#include "upb/mem/arena.h"
17
18// Must be included last.
19#include "upb/port/def.inc"
20
21#ifdef __cplusplus
22extern "C" {
23#endif
24
// Classifies a token; this is the type returned by upb_Tokenizer_Type().
25typedef enum {
26 kUpb_TokenType_Start, // upb_Tokenizer_Next() has not yet been called.
27 kUpb_TokenType_End, // End of input reached. The token text is empty.
28
29 // A sequence of letters, digits, and underscores, not starting with a digit.
30 // It is an error for a number to be followed by an identifier with no space
31 // in between.
32 kUpb_TokenType_Identifier,
33
34 // A sequence of digits representing an integer. Normally the digits are
35 // decimal, but a prefix of "0x" indicates a hex number and a leading zero
36 // indicates octal, just like with C numeric literals. A leading negative
37 // sign is NOT included in the token; it's up to the parser to interpret the
38 // unary minus operator on its own.
39 kUpb_TokenType_Integer,
40
41 // A floating point literal, with a fractional part and/or an exponent.
42 // Always in decimal. Again, never negative.
43 kUpb_TokenType_Float,
44
45 // A quoted sequence of escaped characters.
46 // Either single or double quotes can be used, but they must match.
47 // A string literal cannot cross a line break.
48 kUpb_TokenType_String,
49
50 // Any other printable character, like '!' or '+'.
51 // Symbols are always a single character, so "!+$%" is four tokens.
52 kUpb_TokenType_Symbol,
53
54 // A sequence of whitespace.
55 // Only produced if kUpb_TokenizerOption_ReportWhitespace is in effect.
56 // It is not reported for whitespace within comments or strings.
57 kUpb_TokenType_Whitespace,
58
59 // A newline ('\n'). Only produced if kUpb_TokenizerOption_ReportWhitespace
60 // and kUpb_TokenizerOption_ReportNewlines are both in effect.
61 // It is not reported for newlines in comments or strings.
62 kUpb_TokenType_Newline,
63} upb_TokenType;
64
// Tokenizer option flags. Each enumerator is a distinct single bit.
// NOTE(review): presumably these are OR'ed together and passed as the
// `options` argument of upb_Tokenizer_New() - confirm.
65typedef enum {
66 // Set to allow floats to be suffixed with the letter 'f'. Tokens which would
67 // otherwise be integers but which have the 'f' suffix will be forced to be
68 // interpreted as floats. For all other purposes, the 'f' is ignored.
69 kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
70
71 // If set, whitespace tokens are reported by upb_Tokenizer_Next().
72 kUpb_TokenizerOption_ReportWhitespace = 1 << 1,
73
74 // If set, newline tokens are reported by upb_Tokenizer_Next().
75 // This is a superset of kUpb_TokenizerOption_ReportWhitespace.
76 kUpb_TokenizerOption_ReportNewlines = 1 << 2,
77
78 // By default the tokenizer expects C-style (/* */) comments.
79 // If set, it expects shell-style (#) comments instead.
80 kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
81} upb_Tokenizer_Option;
82
// Tokenizer state. The struct is only forward-declared in this header, so
// callers handle it by pointer through the upb_Tokenizer_*() functions.
83typedef struct upb_Tokenizer upb_Tokenizer;
84
85// Can be passed a flat array (data, size) and/or a upb_ZeroCopyInputStream as
86// input. The array will be read first (if non-NULL), then the stream (if
// non-NULL).
// NOTE(review): `options` is presumably a bitwise OR of upb_Tokenizer_Option
// values - confirm.
// NOTE(review): the role of `arena` (likely the allocator backing the
// tokenizer) and the value returned on failure are not visible in this
// header - confirm against the implementation.
87upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
88 upb_ZeroCopyInputStream* input, int options,
89 upb_Arena* arena);
90
// Finalizes the tokenizer `t`.
// NOTE(review): presumably the counterpart of upb_Tokenizer_New(), to be
// called once the caller is done with `t`; the cleanup it performs is not
// visible in this header - confirm.
91void upb_Tokenizer_Fini(upb_Tokenizer* t);
92
93// Advances the tokenizer to the next input token. Returns true on success.
94// Returns false otherwise: *status is cleared on EOF and set on error.
95bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);
96
97// Accessors for inspecting current/previous parse tokens,
98// which are opaque to the tokenizer (to reduce copying).
// Each accessor takes a const tokenizer and returns one property of the
// token. Judging by the names these are: its type, column, end column, line,
// text length, and a pointer to its text.
// NOTE(review): whether line/column values are 0- or 1-based, and whether the
// text is NUL-terminated, is not stated here - confirm before relying on it.
99
100upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
101int upb_Tokenizer_Column(const upb_Tokenizer* t);
102int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
103int upb_Tokenizer_Line(const upb_Tokenizer* t);
104int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
105const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);
106
107// External helper: validate an identifier.
// The candidate text is passed as a pointer plus an explicit length
// (data, size); the result is a bool.
// NOTE(review): presumably true iff the text follows the
// kUpb_TokenType_Identifier rules (letters, digits, and underscores, not
// starting with a digit) - confirm.
108bool upb_Tokenizer_IsIdentifier(const char* data, int size);
109
110// Parses a kUpb_TokenType_Integer token. Returns false if the result would be
111// greater than max_value. Otherwise, returns true and sets *output to the
112// result. If the text is not from a token of type kUpb_TokenType_Integer
113// originally parsed by a tokenizer, the result is undefined (possibly an
114// assert failure).
// NOTE(review): `text` has no length parameter, so it is presumably
// NUL-terminated - confirm.
115bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
116
117// Parses a kUpb_TokenType_Float token. This never fails, so long as the text
118// actually comes from a kUpb_TokenType_Float token parsed by a tokenizer. If
119// it doesn't, the result is undefined (possibly an assert failure).
// NOTE(review): `text` has no length parameter, so it is presumably
// NUL-terminated - confirm.
120double upb_Parse_Float(const char* text);
121
122// Parses a kUpb_TokenType_String token. This never fails, so long as the text
123// actually comes from a kUpb_TokenType_String token parsed by a tokenizer. If
124// it doesn't, the result is undefined (possibly an assert failure).
// NOTE(review): the returned upb_StringView presumably refers to memory
// obtained from `arena` - confirm ownership and lifetime against the
// implementation.
125upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);
126
127#ifdef __cplusplus
128} /* extern "C" */
129#endif
130
131#include "upb/port/undef.inc"
132
133#endif // UPB_IO_TOKENIZER_H_