// upb/io/tokenizer.h - third_party/protobuf - Git at Google

 // Protocol Buffers - Google's data interchange format
 // Copyright 2023 Google LLC.  All rights reserved.
 //
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file or at
 // https://developers.google.com/open-source/licenses/bsd

 // Tokenizer that splits text read from a flat buffer and/or a
 // ZeroCopyInputStream into tokens.

 #ifndef UPB_IO_TOKENIZER_H_
 #define UPB_IO_TOKENIZER_H_

 #include "upb/base/status.h"
 #include "upb/base/string_view.h"
 #include "upb/io/zero_copy_input_stream.h"
 #include "upb/mem/arena.h"

 // Must be included last.
 #include "upb/port/def.inc"

 #ifdef __cplusplus
 extern "C" {
 #endif

 // Kinds of token the tokenizer can be positioned on. The numeric values are
 // spelled out explicitly; they match the implicit numbering of the
 // declaration order.
 typedef enum {
   // Initial state: upb_Tokenizer_Next() has not been called yet.
   kUpb_TokenType_Start = 0,

   // The input is exhausted. The token text is empty.
   kUpb_TokenType_End = 1,

   // Letters, digits, and underscores, where the first character is not a
   // digit. A number immediately followed by an identifier (with no space
   // separating them) is an error.
   kUpb_TokenType_Identifier = 2,

   // A run of digits forming an integer. Decimal by default; a "0x" prefix
   // means hexadecimal and a leading zero means octal, as in C numeric
   // literals. A leading minus sign is never part of the token: the parser
   // has to interpret unary minus itself.
   kUpb_TokenType_Integer = 3,

   // A floating-point literal that has a fractional part and/or an exponent.
   // Always decimal and, like integers, never negative.
   kUpb_TokenType_Float = 4,

   // A quoted sequence of escaped characters. Single and double quotes are
   // both accepted as long as the opening and closing quotes match. A string
   // literal may not span a line break.
   kUpb_TokenType_String = 5,

   // Any other printable character, such as '!' or '+'. A symbol is always
   // exactly one character, so "!+$%" yields four tokens.
   kUpb_TokenType_Symbol = 6,

   // A run of whitespace. Only produced when whitespace reporting is enabled
   // (kUpb_TokenizerOption_ReportWhitespace). Whitespace inside comments or
   // strings is not reported.
   kUpb_TokenType_Whitespace = 7,

   // A newline ('\n'). Only produced when newline reporting is enabled
   // (kUpb_TokenizerOption_ReportNewlines), which that option documents as a
   // superset of whitespace reporting. Newlines inside comments or strings
   // are not reported.
   kUpb_TokenType_Newline = 8,
 } upb_TokenType;

 // Option flags for the tokenizer. Each flag occupies its own bit, so flags
 // can be combined with bitwise OR.
 typedef enum {
   // Permit floats to carry a trailing 'f'. A token that would otherwise be
   // an integer but ends in 'f' is then forced to be interpreted as a float.
   // For every other purpose the 'f' is ignored.
   kUpb_TokenizerOption_AllowFAfterFloat = 0x1,

   // When set, Next() reports whitespace tokens.
   kUpb_TokenizerOption_ReportWhitespace = 0x2,

   // When set, Next() reports newline tokens.
   // This is a superset of ReportWhitespace.
   kUpb_TokenizerOption_ReportNewlines = 0x4,

   // The tokenizer expects C-style (/* */) comments by default.
   // When set, it expects shell-style (#) comments instead.
   kUpb_TokenizerOption_CommentStyleShell = 0x8,
 } upb_Tokenizer_Option;

 // Opaque tokenizer state. Only this forward declaration is visible in the
 // header, so callers hold upb_Tokenizer pointers and go through the
 // functions declared in this file.
 typedef struct upb_Tokenizer upb_Tokenizer;

 // Creates a tokenizer. The input may be a flat array (`data`, `size`), a
 // ZCIS (upb_ZeroCopyInputStream, `input`), or both: the array will be read
 // first (if non-NULL), then the stream (if non-NULL).
 //
 // NOTE(review): `options` is presumably a bitwise OR of upb_Tokenizer_Option
 // flags, and `arena` presumably provides the tokenizer's memory; neither is
 // stated in this header -- confirm against the implementation.
 upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
                                  upb_ZeroCopyInputStream* input, int options,
                                  upb_Arena* arena);

 // Finalizes tokenizer `t`.
 // NOTE(review): presumably the counterpart of upb_Tokenizer_New(); what
 // cleanup it performs is not visible from this header -- confirm.
 void upb_Tokenizer_Fini(upb_Tokenizer* t);

 // Advances the tokenizer to the next input token.
 // Returns true on success. Returns false both at end of input, in which
 // case *status is cleared, and on error, in which case *status is set.
 bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);

 // Accessors for inspecting current/previous parse tokens,
 // which are opaque to the tokenizer (to reduce copying).
 // All of them take the tokenizer by const pointer.
 //
 // The token text is exposed as a pointer (TextData) plus a length
 // (TextSize).
 // NOTE(review): whether the text is NUL-terminated, and whether Line/Column
 // are 0- or 1-based, is not stated in this header -- confirm.

 upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
 int upb_Tokenizer_Column(const upb_Tokenizer* t);
 int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
 int upb_Tokenizer_Line(const upb_Tokenizer* t);
 int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);

 // External helper: validate an identifier given as `data` plus `size`.
 // It takes no tokenizer argument.
 // NOTE(review): presumably returns true when the text has the form described
 // for kUpb_TokenType_Identifier -- confirm against the implementation.
 bool upb_Tokenizer_IsIdentifier(const char* data, int size);

 // Parses the text of a kUpb_TokenType_Integer token. Returns false if the
 // result would be greater than max_value. Otherwise, returns true and sets
 // *output to the result. If the text is not from a token of type
 // kUpb_TokenType_Integer originally parsed by a tokenizer, the result is
 // undefined (possibly an assert failure).
 // NOTE(review): `text` is passed without a length, so it is presumably
 // NUL-terminated -- confirm.
 bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);

 // Parses the text of a kUpb_TokenType_Float token. This never fails, so long
 // as the text actually comes from a kUpb_TokenType_Float token parsed by a
 // tokenizer. If it doesn't, the result is undefined (possibly an assert
 // failure).
 // NOTE(review): `text` is passed without a length, so it is presumably
 // NUL-terminated -- confirm.
 double upb_Parse_Float(const char* text);

 // Parses the text of a kUpb_TokenType_String token. This never fails, so
 // long as the text actually comes from a kUpb_TokenType_String token parsed
 // by a tokenizer. If it doesn't, the result is undefined (possibly an assert
 // failure).
 // NOTE(review): the returned upb_StringView presumably refers to memory
 // taken from `arena`; this header does not say so -- confirm.
 upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

 #include "upb/port/undef.inc"

 #endif  // UPB_IO_TOKENIZER_H_
	// Protocol Buffers - Google's data interchange format
	// Copyright 2023 Google LLC. All rights reserved.
	//
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file or at
	// https://developers.google.com/open-source/licenses/bsd

	// Tokenizer that splits text read from a flat buffer and/or a
	// ZeroCopyInputStream into tokens.

	#ifndef UPB_IO_TOKENIZER_H_
	#define UPB_IO_TOKENIZER_H_

	#include "upb/base/status.h"
	#include "upb/base/string_view.h"
	#include "upb/io/zero_copy_input_stream.h"
	#include "upb/mem/arena.h"

	// Must be included last.
	#include "upb/port/def.inc"

	#ifdef __cplusplus
	extern "C" {
	#endif

	// Kinds of token the tokenizer can be positioned on. The numeric values are
	// spelled out explicitly; they match the implicit numbering of the
	// declaration order.
	typedef enum {
	  // Initial state: upb_Tokenizer_Next() has not been called yet.
	  kUpb_TokenType_Start = 0,

	  // The input is exhausted. The token text is empty.
	  kUpb_TokenType_End = 1,

	  // Letters, digits, and underscores, where the first character is not a
	  // digit. A number immediately followed by an identifier (with no space
	  // separating them) is an error.
	  kUpb_TokenType_Identifier = 2,

	  // A run of digits forming an integer. Decimal by default; a "0x" prefix
	  // means hexadecimal and a leading zero means octal, as in C numeric
	  // literals. A leading minus sign is never part of the token: the parser
	  // has to interpret unary minus itself.
	  kUpb_TokenType_Integer = 3,

	  // A floating-point literal that has a fractional part and/or an exponent.
	  // Always decimal and, like integers, never negative.
	  kUpb_TokenType_Float = 4,

	  // A quoted sequence of escaped characters. Single and double quotes are
	  // both accepted as long as the opening and closing quotes match. A string
	  // literal may not span a line break.
	  kUpb_TokenType_String = 5,

	  // Any other printable character, such as '!' or '+'. A symbol is always
	  // exactly one character, so "!+$%" yields four tokens.
	  kUpb_TokenType_Symbol = 6,

	  // A run of whitespace. Only produced when whitespace reporting is enabled
	  // (kUpb_TokenizerOption_ReportWhitespace). Whitespace inside comments or
	  // strings is not reported.
	  kUpb_TokenType_Whitespace = 7,

	  // A newline ('\n'). Only produced when newline reporting is enabled
	  // (kUpb_TokenizerOption_ReportNewlines), which that option documents as a
	  // superset of whitespace reporting. Newlines inside comments or strings
	  // are not reported.
	  kUpb_TokenType_Newline = 8,
	} upb_TokenType;

	// Option flags for the tokenizer. Each flag occupies its own bit, so flags
	// can be combined with bitwise OR.
	typedef enum {
	  // Permit floats to carry a trailing 'f'. A token that would otherwise be
	  // an integer but ends in 'f' is then forced to be interpreted as a float.
	  // For every other purpose the 'f' is ignored.
	  kUpb_TokenizerOption_AllowFAfterFloat = 0x1,

	  // When set, Next() reports whitespace tokens.
	  kUpb_TokenizerOption_ReportWhitespace = 0x2,

	  // When set, Next() reports newline tokens.
	  // This is a superset of ReportWhitespace.
	  kUpb_TokenizerOption_ReportNewlines = 0x4,

	  // The tokenizer expects C-style (/* */) comments by default.
	  // When set, it expects shell-style (#) comments instead.
	  kUpb_TokenizerOption_CommentStyleShell = 0x8,
	} upb_Tokenizer_Option;

	// Opaque tokenizer state. Only this forward declaration is visible in the
	// header, so callers hold upb_Tokenizer pointers and go through the
	// functions declared in this file.
	typedef struct upb_Tokenizer upb_Tokenizer;

	// Creates a tokenizer. The input may be a flat array (`data`, `size`), a
	// ZCIS (upb_ZeroCopyInputStream, `input`), or both: the array will be read
	// first (if non-NULL), then the stream (if non-NULL).
	//
	// NOTE(review): `options` is presumably a bitwise OR of upb_Tokenizer_Option
	// flags, and `arena` presumably provides the tokenizer's memory; neither is
	// stated in this header -- confirm against the implementation.
	upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
	upb_ZeroCopyInputStream* input, int options,
	upb_Arena* arena);

	// Finalizes tokenizer `t`.
	// NOTE(review): presumably the counterpart of upb_Tokenizer_New(); what
	// cleanup it performs is not visible from this header -- confirm.
	void upb_Tokenizer_Fini(upb_Tokenizer* t);

	// Advances the tokenizer to the next input token.
	// Returns true on success. Returns false both at end of input, in which
	// case *status is cleared, and on error, in which case *status is set.
	bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);

	// Accessors for inspecting current/previous parse tokens,
	// which are opaque to the tokenizer (to reduce copying).
	// All of them take the tokenizer by const pointer.
	//
	// The token text is exposed as a pointer (TextData) plus a length
	// (TextSize).
	// NOTE(review): whether the text is NUL-terminated, and whether Line/Column
	// are 0- or 1-based, is not stated in this header -- confirm.

	upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
	int upb_Tokenizer_Column(const upb_Tokenizer* t);
	int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
	int upb_Tokenizer_Line(const upb_Tokenizer* t);
	int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
	const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);

	// External helper: validate an identifier given as `data` plus `size`.
	// It takes no tokenizer argument.
	// NOTE(review): presumably returns true when the text has the form described
	// for kUpb_TokenType_Identifier -- confirm against the implementation.
	bool upb_Tokenizer_IsIdentifier(const char* data, int size);

	// Parses the text of a kUpb_TokenType_Integer token. Returns false if the
	// result would be greater than max_value. Otherwise, returns true and sets
	// *output to the result. If the text is not from a token of type
	// kUpb_TokenType_Integer originally parsed by a tokenizer, the result is
	// undefined (possibly an assert failure).
	// NOTE(review): `text` is passed without a length, so it is presumably
	// NUL-terminated -- confirm.
	bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);

	// Parses the text of a kUpb_TokenType_Float token. This never fails, so long
	// as the text actually comes from a kUpb_TokenType_Float token parsed by a
	// tokenizer. If it doesn't, the result is undefined (possibly an assert
	// failure).
	// NOTE(review): `text` is passed without a length, so it is presumably
	// NUL-terminated -- confirm.
	double upb_Parse_Float(const char* text);

	// Parses the text of a kUpb_TokenType_String token. This never fails, so
	// long as the text actually comes from a kUpb_TokenType_String token parsed
	// by a tokenizer. If it doesn't, the result is undefined (possibly an assert
	// failure).
	// NOTE(review): the returned upb_StringView presumably refers to memory
	// taken from `arena`; this header does not say so -- confirm.
	upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);

	#ifdef __cplusplus
	} /* extern "C" */
	#endif

	#include "upb/port/undef.inc"

	#endif // UPB_IO_TOKENIZER_H_