Blame - upb/io/tokenizer.h - third_party/protobuf

blob: 3ef7f30a6e8a04ef53d3c29d1f0b85d6d078629b [file] [log] [blame]

Adam Cozzette	501ecec	2023-09-26 14:36:20 -0700	[diff] [blame]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2023 Google LLC. All rights reserved.
				3	//
				4	// Use of this source code is governed by a BSD-style
				5	// license that can be found in the LICENSE file or at
				6	// https://developers.google.com/open-source/licenses/bsd
				7
				8	// Tokenizer for splitting text from a flat array and/or a upb_ZeroCopyInputStream into tokens.
				9
				10	#ifndef UPB_IO_TOKENIZER_H_
				11	#define UPB_IO_TOKENIZER_H_
				12
				13	#include "upb/base/status.h"
				14	#include "upb/base/string_view.h"
				15	#include "upb/io/zero_copy_input_stream.h"
				16	#include "upb/mem/arena.h"
				17
				18	// Must be included last.
				19	#include "upb/port/def.inc"
				20
				21	#ifdef __cplusplus
				22	extern "C" {
				23	#endif
				24
				25	typedef enum {  // Token kinds, as returned by upb_Tokenizer_Type().
				26	kUpb_TokenType_Start, // upb_Tokenizer_Next() has not yet been called.
				27	kUpb_TokenType_End, // End of input reached. The token text is empty.
				28
				29	// A sequence of letters, digits, and underscores, not starting with a digit.
				30	// It is an error for a number to be followed by an identifier with no space
				31	// in between.
				32	kUpb_TokenType_Identifier,
				33
				34	// A sequence of digits representing an integer. Normally the digits are
				35	// decimal, but a prefix of "0x" indicates a hex number and a leading zero
				36	// indicates octal, just like with C numeric literals. A leading negative
				37	// sign is NOT included in the token; it's up to the parser to interpret the
				38	// unary minus operator on its own.
				39	kUpb_TokenType_Integer,
				40
				41	// A floating point literal, with a fractional part and/or an exponent.
				42	// Always in decimal. Again, never negative.
				43	kUpb_TokenType_Float,
				44
				45	// A quoted sequence of escaped characters.
				46	// Either single or double quotes can be used, but they must match.
				47	// A string literal cannot cross a line break.
				48	kUpb_TokenType_String,
				49
				50	// Any other printable character, like '!' or '+'.
				51	// Symbols are always a single character, so "!+$%" is four tokens.
				52	kUpb_TokenType_Symbol,
				53
				54	// A sequence of whitespace.
				55	// This token type is only produced if whitespace reporting is enabled (see kUpb_TokenizerOption_ReportWhitespace).
				56	// It is not reported for whitespace within comments or strings.
				57	kUpb_TokenType_Whitespace,
				58
				59	// A newline ('\n'). This token type is only produced if whitespace reporting
				60	// is enabled and newline reporting (see kUpb_TokenizerOption_ReportNewlines) is also enabled.
				61	// It is not reported for newlines in comments or strings.
				62	kUpb_TokenType_Newline,
				63	} upb_TokenType;
				64
				65	typedef enum {  // Option flags, one distinct bit each; presumably OR-ed into the `options` argument of upb_Tokenizer_New() (TODO confirm).
				66	// Set to allow floats to be suffixed with the letter 'f'. Tokens which would
				67	// otherwise be integers but which have the 'f' suffix will be forced to be
				68	// interpreted as floats. For all other purposes, the 'f' is ignored.
				69	kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
				70
				71	// If set, whitespace tokens are reported by upb_Tokenizer_Next().
				72	kUpb_TokenizerOption_ReportWhitespace = 1 << 1,
				73
				74	// If set, newline tokens are reported by upb_Tokenizer_Next().
				75	// This is a superset of kUpb_TokenizerOption_ReportWhitespace.
				76	kUpb_TokenizerOption_ReportNewlines = 1 << 2,
				77
				78	// By default the tokenizer expects C-style (/* */) comments.
				79	// If set, it expects shell-style (#) comments instead.
				80	kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
				81	} upb_Tokenizer_Option;
				82
				83	typedef struct upb_Tokenizer upb_Tokenizer;  // Opaque type: only forward-declared in this header.
				84
				85	// Can be passed a flat array (`data`, `size`) and/or a ZCIS (upb_ZeroCopyInputStream, `input`) as input.
				86	// The array will be read first (if non-NULL), then the stream (if non-NULL).
				87	upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
				88	upb_ZeroCopyInputStream* input, int options,
				89	upb_Arena* arena);
				90
				91	void upb_Tokenizer_Fini(upb_Tokenizer* t);  // NOTE(review): undocumented here; presumably finalizes a tokenizer from upb_Tokenizer_New() (TODO confirm).
				92
				93	// Advance the tokenizer to the next input token. Returns true on success.
				94	// Returns false at EOF (status is cleared) or on error (status is set).
				95	bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);
				96
				97	// Accessors for inspecting current/previous parse tokens,
				98	// which are opaque to the tokenizer (to reduce copying).
				99	// NOTE(review): presumably these describe the token most recently produced by upb_Tokenizer_Next() (TODO confirm).
				100	upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
				101	int upb_Tokenizer_Column(const upb_Tokenizer* t);
				102	int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
				103	int upb_Tokenizer_Line(const upb_Tokenizer* t);
				104	int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
				105	const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);
				106
				107	// External helper: validate an identifier given as `size` bytes at `data` (presumably per the kUpb_TokenType_Identifier rules; TODO confirm).
				108	bool upb_Tokenizer_IsIdentifier(const char* data, int size);
				109
				110	// Parses a kUpb_TokenType_Integer token. Returns false if the result would be
				111	// greater than max_value. Otherwise, returns true and sets *output to the
				112	// result. If the text is not from a token of type kUpb_TokenType_Integer originally
				113	// parsed by a upb_Tokenizer, the result is undefined (possibly an assert
				114	// failure).
				115	bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
				116
				117	// Parses a kUpb_TokenType_Float token. This never fails, so long as the text actually
				118	// comes from a kUpb_TokenType_Float token parsed by a upb_Tokenizer. If it doesn't, the
				119	// result is undefined (possibly an assert failure).
				120	double upb_Parse_Float(const char* text);
				121
				122	// Parses a kUpb_TokenType_String token. This never fails, so long as the text actually
				123	// comes from a kUpb_TokenType_String token parsed by a upb_Tokenizer. If it doesn't, the
				124	// result is undefined (possibly an assert failure).
				125	upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);
				126
				127	#ifdef __cplusplus
				128	} /* extern "C" */
				129	#endif
				130
				131	#include "upb/port/undef.inc"
				132
				133	#endif // UPB_IO_TOKENIZER_H_