Blame - upb/io/tokenizer.c - third_party/protobuf

blob: cfc1d400f778b9bd2345476fa355a8027a46720d [file] [log] [blame]

Adam Cozzette	501ecec	2023-09-26 14:36:20 -0700	[diff] [blame]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2023 Google LLC. All rights reserved.
				3	//
				4	// Use of this source code is governed by a BSD-style
				5	// license that can be found in the LICENSE file or at
				6	// https://developers.google.com/open-source/licenses/bsd
				7
				8	#include "upb/io/tokenizer.h"
				9
				10	#include "upb/io/string.h"
				11	#include "upb/lex/strtod.h"
				12	#include "upb/lex/unicode.h"
				13
				14	// Must be included last.
				15	#include "upb/port/def.inc"
				16
				17	typedef enum {
				18	// Started a line comment.
				19	kUpb_CommentType_Line,
				20
				21	// Started a block comment.
				22	kUpb_CommentType_Block,
				23
				24	// Consumed a slash, then realized it wasn't a comment. current_ has
				25	// been filled in with a slash token. The caller should return it.
				26	kUpb_CommentType_SlashNot,
				27
				28	// We do not appear to be starting a comment here.
				29	kUpb_CommentType_None,
				30	} upb_CommentType;
				31
				32	static bool upb_Tokenizer_IsUnprintable(char c) { return '\0' < c && c < ' '; }
				33
				34	// Since we count columns we need to interpret tabs somehow. We'll take
				35	// the standard 8-character definition for lack of any way to do better.
				36	static const int kUpb_Tokenizer_TabWidth = 8;
				37
				38	// Given a char, interpret it as a numeric digit and return its value.
				39	// This supports any number base up to 36.
				40	// Represents integer values of digits.
				41	// Uses 36 to indicate an invalid character since we support
				42	// bases up to 36.
				43	static const int8_t kUpb_Tokenizer_AsciiToInt[256] = {
				44	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 00-0F
				45	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 10-1F
				46	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // ' '-'/'
				47	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // '0'-'9'
				48	36, 36, 36, 36, 36, 36, 36, // ':'-'@'
				49	10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'P'
				50	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, // 'Q'-'Z'
				51	36, 36, 36, 36, 36, 36, // '['-'`'
				52	10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'a'-'p'
				53	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, // 'q'-'z'
				54	36, 36, 36, 36, 36, // '{'-DEL
				55	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 80-8F
				56	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 90-9F
				57	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // A0-AF
				58	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // B0-BF
				59	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // C0-CF
				60	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // D0-DF
				61	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // E0-EF
				62	36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // F0-FF
				63	};
				64
				65	static int DigitValue(char digit) {
				66	return kUpb_Tokenizer_AsciiToInt[digit & 0xFF];
				67	}
				68
				69	static bool upb_Tokenizer_IsLetter(char c) {
				70	return ('a' <= c && c <= 'z') \|\| ('A' <= c && c <= 'Z') \|\| (c == '_');
				71	}
				72
				73	static bool upb_Tokenizer_IsDigit(char c) { return '0' <= c && c <= '9'; }
				74
				75	static bool upb_Tokenizer_IsOctalDigit(char c) { return '0' <= c && c <= '7'; }
				76
				77	static bool upb_Tokenizer_IsHexDigit(char c) {
				78	return ('0' <= c && c <= '9') \|\| ('a' <= c && c <= 'f') \|\|
				79	('A' <= c && c <= 'F');
				80	}
				81
				82	static bool upb_Tokenizer_IsAlphanumeric(char c) {
				83	return upb_Tokenizer_IsLetter(c) \|\| upb_Tokenizer_IsDigit(c);
				84	}
				85
				86	static bool upb_Tokenizer_IsWhitespaceNoNewline(char c) {
				87	return c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\v' \|\| c == '\f';
				88	}
				89
				90	static bool upb_Tokenizer_IsWhitespace(char c) {
				91	return c == '\n' \|\| upb_Tokenizer_IsWhitespaceNoNewline(c);
				92	}
				93
				94	static bool upb_Tokenizer_IsEscape(char c) {
				95	return c == 'a' \|\| c == 'b' \|\| c == 'f' \|\| c == 'n' \|\| c == 'r' \|\| c == 't' \|\|
				96	c == 'v' \|\| c == '\\' \|\| c == '?' \|\| c == '\'' \|\| c == '\"';
				97	}
				98
				99	static char TranslateEscape(char c) {
				100	switch (c) {
				101	case 'a':
				102	return '\a';
				103	case 'b':
				104	return '\b';
				105	case 'f':
				106	return '\f';
				107	case 'n':
				108	return '\n';
				109	case 'r':
				110	return '\r';
				111	case 't':
				112	return '\t';
				113	case 'v':
				114	return '\v';
				115	case '\\':
				116	return '\\';
				117	case '?':
				118	return '\?'; // Trigraphs = :(
				119	case '\'':
				120	return '\'';
				121	case '"':
				122	return '\"';
				123
				124	// We expect escape sequences to have been validated separately.
				125	default:
				126	return '?';
				127	}
				128	}
				129
				130	// ===================================================================
				131
				132	struct upb_Tokenizer {
				133	upb_TokenType token_type; // The type of the current token.
				134
				135	// The exact text of the current token as it appeared in the input.
				136	// e.g. tokens of TYPE_STRING will still be escaped and in quotes.
				137	upb_String token_text;
				138
				139	// "line" and "column" specify the position of the first character of
				140	// the token within the input stream. They are zero-based.
				141	int token_line;
				142	int token_column;
				143	int token_end_column;
				144
				145	upb_ZeroCopyInputStream* input;
				146	upb_Arena* arena;
				147	upb_Status* status;
				148
				149	char current_char; // == buffer_[buffer_pos_], updated by NextChar().
				150	const char* buffer; // Current buffer returned from input_.
				151	size_t buffer_size; // Size of buffer_.
				152	size_t buffer_pos; // Current position within the buffer.
				153	bool read_error; // Did we previously encounter a read error?
				154
				155	// Line and column number of current_char_ within the whole input stream.
				156	int line;
				157
				158	// By "column number", the proto compiler refers to a count of the number
				159	// of bytes before a given byte, except that a tab character advances to
				160	// the next multiple of 8 bytes. Note in particular that column numbers
				161	// are zero-based, while many user interfaces use one-based column numbers.
				162	int column;
				163
				164	// Cached values from before the most recent call to Next()
				165	upb_TokenType previous_type;
				166	int previous_line;
				167	int previous_column;
				168	int previous_end_column;
				169
				170	// String to which text should be appended as we advance through it.
				171	// Call RecordTo(&str) to start recording and StopRecording() to stop.
				172	// E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
				173	// position within the current buffer where recording started.
				174	upb_String* record_target;
				175	int record_start;
				176	int options;
				177	jmp_buf err;
				178	};
				179
				180	// Convenience methods to return an error at the current line and column.
				181
				182	UPB_NORETURN static void ReportError(upb_Tokenizer* t, const char* msg) {
				183	upb_Status_SetErrorFormat(t->status, "%d:%d: %s", t->line, t->column, msg);
				184	UPB_LONGJMP(t->err, 1);
				185	}
				186
				187	UPB_NORETURN UPB_PRINTF(2, 3) static void ReportErrorFormat(upb_Tokenizer* t,
				188	const char* fmt,
				189	...) {
				190	va_list args;
				191	va_start(args, fmt);
				192	char msg[128];
				193	vsnprintf(msg, sizeof(msg), fmt, args);
				194	ReportError(t, msg);
				195	}
				196
				197	// Read a new buffer from the input.
				198	static void Refresh(upb_Tokenizer* t) {
				199	if (t->read_error) {
				200	t->current_char = '\0';
				201	return;
				202	}
				203
				204	// If we're in a token, append the rest of the buffer to it.
				205	if (t->record_target != NULL && t->record_start < t->buffer_size) {
				206	upb_String_Append(t->record_target, t->buffer + t->record_start,
				207	t->buffer_size - t->record_start);
				208	t->record_start = 0;
				209	}
				210
				211	t->buffer = NULL;
				212	t->buffer_pos = 0;
				213
				214	upb_Status status;
				215	const void* data =
				216	upb_ZeroCopyInputStream_Next(t->input, &t->buffer_size, &status);
				217
				218	if (t->buffer_size > 0) {
				219	t->buffer = data;
				220	t->current_char = t->buffer[0];
				221	} else {
				222	// end of stream (or read error)
				223	t->buffer_size = 0;
				224	t->read_error = true;
				225	t->current_char = '\0';
				226	}
				227	}
				228
				229	// Consume this character and advance to the next one.
				230	static void NextChar(upb_Tokenizer* t) {
				231	// Update our line and column counters based on the character being
				232	// consumed.
				233	if (t->current_char == '\n') {
				234	t->line++;
				235	t->column = 0;
				236	} else if (t->current_char == '\t') {
				237	t->column += kUpb_Tokenizer_TabWidth - t->column % kUpb_Tokenizer_TabWidth;
				238	} else {
				239	t->column++;
				240	}
				241
				242	// Advance to the next character.
				243	t->buffer_pos++;
				244	if (t->buffer_pos < t->buffer_size) {
				245	t->current_char = t->buffer[t->buffer_pos];
				246	} else {
				247	Refresh(t);
				248	}
				249	}
				250
				251	static void RecordTo(upb_Tokenizer* t, upb_String* target) {
				252	t->record_target = target;
				253	t->record_start = t->buffer_pos;
				254	}
				255
				256	static void StopRecording(upb_Tokenizer* t) {
				257	if (t->buffer_pos > t->record_start) {
				258	upb_String_Append(t->record_target, t->buffer + t->record_start,
				259	t->buffer_pos - t->record_start);
				260	}
				261	t->record_target = NULL;
				262	t->record_start = -1;
				263	}
				264
				265	// Called when the current character is the first character of a new
				266	// token (not including whitespace or comments).
				267	static void StartToken(upb_Tokenizer* t) {
				268	t->token_type = kUpb_TokenType_Start;
				269	upb_String_Clear(&t->token_text);
				270	t->token_line = t->line;
				271	t->token_column = t->column;
				272	RecordTo(t, &t->token_text);
				273	}
				274
				275	// Called when the current character is the first character after the
				276	// end of the last token. After this returns, current_.text will
				277	// contain all text consumed since StartToken() was called.
				278	static void EndToken(upb_Tokenizer* t) {
				279	StopRecording(t);
				280	t->token_end_column = t->column;
				281	}
				282
				283	// -----------------------------------------------------------------
				284	// These helper methods make the parsing code more readable.
				285	// The "character classes" referred to are defined at the top of the file.
				286	// The method returns true if c is a member of this "class", like "Letter"
				287	// or "Digit".
				288
				289	// Returns true if the current character is of the given character
				290	// class, but does not consume anything.
				291	static bool LookingAt(const upb_Tokenizer* t, bool (*f)(char)) {
				292	return f(t->current_char);
				293	}
				294
				295	// If the current character is in the given class, consume it and return true.
				296	// Otherwise return false.
				297	static bool TryConsumeOne(upb_Tokenizer* t, bool (*f)(char)) {
				298	if (f(t->current_char)) {
				299	NextChar(t);
				300	return true;
				301	} else {
				302	return false;
				303	}
				304	}
				305
				306	// Like above, but try to consume the specific character indicated.
				307	static bool TryConsume(upb_Tokenizer* t, char c) {
				308	if (t->current_char == c) {
				309	NextChar(t);
				310	return true;
				311	} else {
				312	return false;
				313	}
				314	}
				315
				316	// Consume zero or more of the given character class.
				317	static void ConsumeZeroOrMore(upb_Tokenizer* t, bool (*f)(char)) {
				318	while (f(t->current_char)) {
				319	NextChar(t);
				320	}
				321	}
				322
				323	// Consume one or more of the given character class or log the given
				324	// error message.
				325	static void ConsumeOneOrMore(upb_Tokenizer* t, bool (*f)(char),
				326	const char* err_msg) {
				327	if (!f(t->current_char)) {
				328	ReportError(t, err_msg);
				329	}
				330
				331	do {
				332	NextChar(t);
				333	} while (f(t->current_char));
				334	}
				335
				336	// -----------------------------------------------------------------
				337	// The following four methods are used to consume tokens of specific
				338	// types. They are actually used to consume all characters after
				339	// the first, since the calling function consumes the first character
				340	// in order to decide what kind of token is being read.
				341
				342	// Read and consume a string, ending when the given delimiter is consumed.
				343	static void ConsumeString(upb_Tokenizer* t, char delimiter) {
				344	while (true) {
				345	switch (t->current_char) {
				346	case '\0':
				347	ReportError(t, "Unexpected end of string.");
				348
				349	case '\n':
				350	ReportError(t, "String literals cannot cross line boundaries.");
				351
				352	case '\\': {
				353	// An escape sequence.
				354	NextChar(t);
				355	if (TryConsumeOne(t, upb_Tokenizer_IsEscape)) {
				356	// Valid escape sequence.
				357	} else if (TryConsumeOne(t, upb_Tokenizer_IsOctalDigit)) {
				358	// Possibly followed by two more octal digits, but these will
				359	// just be consumed by the main loop anyway so we don't need
				360	// to do so explicitly here.
				361	} else if (TryConsume(t, 'x')) {
				362	if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
				363	ReportError(t, "Expected hex digits for escape sequence.");
				364	}
				365	// Possibly followed by another hex digit, but again we don't care.
				366	} else if (TryConsume(t, 'u')) {
				367	if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				368	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				369	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				370	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
				371	ReportError(t, "Expected four hex digits for \\u escape sequence.");
				372	}
				373	} else if (TryConsume(t, 'U')) {
				374	// We expect 8 hex digits; but only the range up to 0x10ffff is
				375	// legal.
				376	if (!TryConsume(t, '0') \|\| !TryConsume(t, '0') \|\|
				377	!(TryConsume(t, '0') \|\| TryConsume(t, '1')) \|\|
				378	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				379	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				380	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				381	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) \|\|
				382	!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
				383	ReportError(t,
				384	"Expected eight hex digits up to 10ffff for \\U escape "
				385	"sequence");
				386	}
				387	} else {
				388	ReportError(t, "Invalid escape sequence in string literal.");
				389	}
				390	break;
				391	}
				392
				393	default: {
				394	if (t->current_char == delimiter) {
				395	NextChar(t);
				396	return;
				397	}
				398	NextChar(t);
				399	break;
				400	}
				401	}
				402	}
				403	}
				404
				405	// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER depending
				406	// on what was read. This needs to know if the first characer was a zero in
				407	// order to correctly recognize hex and octal numbers. It also needs to know
				408	// whether the first character was a '.' to parse floating point correctly.
				409	static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
				410	bool started_with_dot) {
				411	bool is_float = false;
				412
				413	if (started_with_zero && (TryConsume(t, 'x') \|\| TryConsume(t, 'X'))) {
				414	// A hex number (started with "0x").
				415	ConsumeOneOrMore(t, upb_Tokenizer_IsHexDigit,
				416	"\"0x\" must be followed by hex digits.");
				417
				418	} else if (started_with_zero && LookingAt(t, upb_Tokenizer_IsDigit)) {
				419	// An octal number (had a leading zero).
				420	ConsumeZeroOrMore(t, upb_Tokenizer_IsOctalDigit);
				421	if (LookingAt(t, upb_Tokenizer_IsDigit)) {
				422	ReportError(t, "Numbers starting with leading zero must be in octal.");
				423	}
				424
				425	} else {
				426	// A decimal number.
				427	if (started_with_dot) {
				428	is_float = true;
				429	ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
				430	} else {
				431	ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
				432
				433	if (TryConsume(t, '.')) {
				434	is_float = true;
				435	ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
				436	}
				437	}
				438
				439	if (TryConsume(t, 'e') \|\| TryConsume(t, 'E')) {
				440	is_float = true;
				441	if (!TryConsume(t, '-')) TryConsume(t, '+');
				442	ConsumeOneOrMore(t, upb_Tokenizer_IsDigit,
				443	"\"e\" must be followed by exponent.");
				444	}
				445
				446	if (t->options & kUpb_TokenizerOption_AllowFAfterFloat) {
				447	if (TryConsume(t, 'f') \|\| TryConsume(t, 'F')) is_float = true;
				448	}
				449	}
				450
				451	if (LookingAt(t, upb_Tokenizer_IsLetter)) {
				452	ReportError(t, "Need space between number and identifier.");
				453	}
				454
				455	if (t->current_char == '.') {
				456	if (is_float) {
				457	ReportError(
				458	t, "Already saw decimal point or exponent; can't have another one.");
				459	} else {
				460	ReportError(t, "Hex and octal numbers must be integers.");
				461	}
				462	}
				463
				464	return is_float ? kUpb_TokenType_Float : kUpb_TokenType_Integer;
				465	}
				466
				467	// Consume the rest of a line.
				468	static void ConsumeLineComment(upb_Tokenizer* t, upb_String* content) {
				469	if (content != NULL) RecordTo(t, content);
				470
				471	while (t->current_char != '\0' && t->current_char != '\n') {
				472	NextChar(t);
				473	}
				474	TryConsume(t, '\n');
				475
				476	if (content != NULL) StopRecording(t);
				477	}
				478
				479	static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
				480	const int start_line = t->line;
				481	const int start_column = t->column - 2;
				482
				483	if (content != NULL) RecordTo(t, content);
				484
				485	while (true) {
				486	while (t->current_char != '\0' && t->current_char != '*' &&
				487	t->current_char != '/' && t->current_char != '\n') {
				488	NextChar(t);
				489	}
				490
				491	if (TryConsume(t, '\n')) {
				492	if (content != NULL) StopRecording(t);
				493
				494	// Consume leading whitespace and asterisk;
				495	ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
				496	if (TryConsume(t, '*')) {
				497	if (TryConsume(t, '/')) {
				498	// End of comment.
				499	break;
				500	}
				501	}
				502
				503	if (content != NULL) RecordTo(t, content);
				504	} else if (TryConsume(t, '*') && TryConsume(t, '/')) {
				505	// End of comment.
				506	if (content != NULL) {
				507	StopRecording(t);
				508	// Strip trailing "*/".
				509	upb_String_Erase(content, upb_String_Size(content) - 2, 2);
				510	}
				511	break;
				512	} else if (TryConsume(t, '/') && t->current_char == '*') {
				513	// Note: We didn't consume the '*' because if there is a '/' after it
				514	// we want to interpret that as the end of the comment.
				515	ReportError(
				516	t, "\"/*\" inside block comment. Block comments cannot be nested.");
				517	} else if (t->current_char == '\0') {
				518	ReportErrorFormat(
				519	t, "End-of-file inside block comment.\n%d:%d: Comment started here.",
				520	start_line, start_column);
				521	}
				522	}
				523	}
				524
				525	// If we're at the start of a new comment, consume it and return what kind
				526	// of comment it is.
				527	static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
				528	const bool style_sh = t->options & kUpb_TokenizerOption_CommentStyleShell;
				529	const bool style_cpp = !style_sh;
				530
				531	if (style_cpp && TryConsume(t, '/')) {
				532	if (TryConsume(t, '/')) {
				533	return kUpb_CommentType_Line;
				534	} else if (TryConsume(t, '*')) {
				535	return kUpb_CommentType_Block;
				536	} else {
				537	// Oops, it was just a slash. Return it.
				538	t->token_type = kUpb_TokenType_Symbol;
				539	upb_String_Assign(&t->token_text, "/", 1);
				540	t->token_line = t->line;
				541	t->token_column = t->column - 1;
				542	t->token_end_column = t->column;
				543	return kUpb_CommentType_SlashNot;
				544	}
				545	} else if (style_sh && TryConsume(t, '#')) {
				546	return kUpb_CommentType_Line;
				547	} else {
				548	return kUpb_CommentType_None;
				549	}
				550	}
				551
				552	// If we're looking at a TYPE_WHITESPACE token and `report_whitespace` is true,
				553	// consume it and return true.
				554	static bool TryConsumeWhitespace(upb_Tokenizer* t) {
				555	if (t->options & kUpb_TokenizerOption_ReportNewlines) {
				556	if (TryConsumeOne(t, upb_Tokenizer_IsWhitespaceNoNewline)) {
				557	ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
				558	t->token_type = kUpb_TokenType_Whitespace;
				559	return true;
				560	}
				561	return false;
				562	}
				563	if (TryConsumeOne(t, upb_Tokenizer_IsWhitespace)) {
				564	ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespace);
				565	t->token_type = kUpb_TokenType_Whitespace;
				566	return (t->options & kUpb_TokenizerOption_ReportWhitespace) != 0;
				567	}
				568	return false;
				569	}
				570
				571	// If we're looking at a TYPE_NEWLINE token and `report_newlines` is true,
				572	// consume it and return true.
				573	static bool TryConsumeNewline(upb_Tokenizer* t) {
				574	if (t->options & kUpb_TokenizerOption_ReportNewlines) {
				575	if (TryConsume(t, '\n')) {
				576	t->token_type = kUpb_TokenType_Newline;
				577	return true;
				578	}
				579	}
				580	return false;
				581	}
				582
				583	// -------------------------------------------------------------------
				584
				585	int upb_Tokenizer_Column(const upb_Tokenizer* t) { return t->token_column; }
				586
				587	int upb_Tokenizer_EndColumn(const upb_Tokenizer* t) {
				588	return t->token_end_column;
				589	}
				590
				591	int upb_Tokenizer_Line(const upb_Tokenizer* t) { return t->token_line; }
				592
				593	int upb_Tokenizer_TextSize(const upb_Tokenizer* t) {
				594	return t->token_text.size_;
				595	}
				596
				597	const char* upb_Tokenizer_TextData(const upb_Tokenizer* t) {
				598	return t->token_text.data_;
				599	}
				600
				601	upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t) {
				602	return t->token_type;
				603	}
				604
				605	bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status) {
				606	t->status = status;
				607	t->previous_type = t->token_type;
				608	t->previous_line = t->token_line;
				609	t->previous_column = t->token_column;
				610	t->previous_end_column = t->token_end_column;
				611
				612	if (UPB_SETJMP(t->err)) return false;
				613
				614	while (!t->read_error) {
				615	StartToken(t);
				616	bool report_token = TryConsumeWhitespace(t) \|\| TryConsumeNewline(t);
				617	EndToken(t);
				618	if (report_token) return true;
				619
				620	switch (TryConsumeCommentStart(t)) {
				621	case kUpb_CommentType_Line:
				622	ConsumeLineComment(t, NULL);
				623	continue;
				624	case kUpb_CommentType_Block:
				625	ConsumeBlockComment(t, NULL);
				626	continue;
				627	case kUpb_CommentType_SlashNot:
				628	return true;
				629	case kUpb_CommentType_None:
				630	break;
				631	}
				632
				633	// Check for EOF before continuing.
				634	if (t->read_error) break;
				635
				636	if (LookingAt(t, upb_Tokenizer_IsUnprintable) \|\| t->current_char == '\0') {
				637	ReportError(t, "Invalid control characters encountered in text.");
				638	}
				639
				640	// Reading some sort of token.
				641	StartToken(t);
				642
				643	if (TryConsumeOne(t, upb_Tokenizer_IsLetter)) {
				644	ConsumeZeroOrMore(t, upb_Tokenizer_IsAlphanumeric);
				645	t->token_type = kUpb_TokenType_Identifier;
				646	} else if (TryConsume(t, '0')) {
				647	t->token_type = ConsumeNumber(t, true, false);
				648	} else if (TryConsume(t, '.')) {
				649	// This could be the beginning of a floating-point number, or it could
				650	// just be a '.' symbol.
				651
				652	if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
				653	// It's a floating-point number.
				654	if (t->previous_type == kUpb_TokenType_Identifier &&
				655	t->token_line == t->previous_line &&
				656	t->token_column == t->previous_end_column) {
				657	// We don't accept syntax like "blah.123".
				658	t->column -= 2;
				659	ReportError(t, "Need space between identifier and decimal point.");
				660	}
				661	t->token_type = ConsumeNumber(t, false, true);
				662	} else {
				663	t->token_type = kUpb_TokenType_Symbol;
				664	}
				665	} else if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
				666	t->token_type = ConsumeNumber(t, false, false);
				667	} else if (TryConsume(t, '\"')) {
				668	ConsumeString(t, '\"');
				669	t->token_type = kUpb_TokenType_String;
				670	} else if (TryConsume(t, '\'')) {
				671	ConsumeString(t, '\'');
				672	t->token_type = kUpb_TokenType_String;
				673	} else {
				674	// Check if the high order bit is set.
				675	if (t->current_char & 0x80) {
				676	ReportErrorFormat(t, "Interpreting non ascii codepoint %d.",
				677	(uint8_t)t->current_char);
				678	}
				679	NextChar(t);
				680	t->token_type = kUpb_TokenType_Symbol;
				681	}
				682
				683	EndToken(t);
				684	return true;
				685	}
				686
				687	// EOF
				688	t->token_type = kUpb_TokenType_End;
				689	upb_String_Clear(&t->token_text);
				690	t->token_line = t->line;
				691	t->token_column = t->column;
				692	t->token_end_column = t->column;
				693	upb_Status_Clear(status);
				694	return false;
				695	}
				696
				697	// -------------------------------------------------------------------
				698	// Token-parsing helpers. Remember that these don't need to report
				699	// errors since any errors should already have been reported while
				700	// tokenizing. Also, these can assume that whatever text they
				701	// are given is text that the tokenizer actually parsed as a token
				702	// of the given type.
				703
				704	bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output) {
				705	// We can't just use strtoull() because (a) it accepts negative numbers,
				706	// (b) We want additional range checks, (c) it reports overflows via errno.
				707
				708	const char* ptr = text;
				709	int base = 10;
				710	uint64_t overflow_if_mul_base = (UINT64_MAX / 10) + 1;
				711	if (ptr[0] == '0') {
				712	if (ptr[1] == 'x' \|\| ptr[1] == 'X') {
				713	// This is hex.
				714	base = 16;
				715	overflow_if_mul_base = (UINT64_MAX / 16) + 1;
				716	ptr += 2;
				717	} else {
				718	// This is octal.
				719	base = 8;
				720	overflow_if_mul_base = (UINT64_MAX / 8) + 1;
				721	}
				722	}
				723
				724	uint64_t result = 0;
				725	// For all the leading '0's, and also the first non-zero character, we
				726	// don't need to multiply.
				727	while (*ptr != '\0') {
				728	int digit = DigitValue(*ptr++);
				729	if (digit >= base) {
				730	// The token provided by Tokenizer is invalid. i.e., 099 is an invalid
				731	// token, but Tokenizer still think it's integer.
				732	return false;
				733	}
				734	if (digit != 0) {
				735	result = digit;
				736	break;
				737	}
				738	}
				739	for (; *ptr != '\0'; ptr++) {
				740	int digit = DigitValue(*ptr);
				741	if (digit < 0 \|\| digit >= base) {
				742	// The token provided by Tokenizer is invalid. i.e., 099 is an invalid
				743	// token, but Tokenizer still think it's integer.
				744	return false;
				745	}
				746	if (result >= overflow_if_mul_base) {
				747	// We know the multiply we're about to do will overflow, so exit now.
				748	return false;
				749	}
				750	// We know that result * base won't overflow, but adding digit might...
				751	result = result * base + digit;
				752	// C++ guarantees defined "wrap" semantics when unsigned integer
				753	// operations overflow, making this a fast way to check if adding
				754	// digit made result overflow, and thus, wrap around.
				755	if (result < (uint64_t)base) return false;
				756	}
				757	if (result > max_value) return false;
				758
				759	*output = result;
				760	return true;
				761	}
				762
				763	double upb_Parse_Float(const char* text) {
				764	char* end;
				765	double result = _upb_NoLocaleStrtod(text, &end);
				766
				767	// "1e" is not a valid float, but if the tokenizer reads it, it will
				768	// report an error but still return it as a valid token. We need to
				769	// accept anything the tokenizer could possibly return, error or not.
				770	if (end == 'e' \|\| end == 'E') {
				771	++end;
				772	if (end == '-' \|\| end == '+') ++end;
				773	}
				774
				775	// If the Tokenizer had allow_f_after_float_ enabled, the float may be
				776	// suffixed with the letter 'f'.
				777	if (end == 'f' \|\| end == 'F') {
				778	++end;
				779	}
				780
				781	if ((end - text) != strlen(text) \|\| *text == '-') {
				782	fprintf(stderr,
				783	"upb_Parse_Float() passed text that could not have"
				784	" been tokenized as a float: %s\n",
				785	text);
				786	UPB_ASSERT(0);
				787	}
				788	return result;
				789	}
				790
				791	// Append a Unicode code point to a string as UTF8.
				792	static void AppendUTF8(uint32_t code_point, upb_String* output) {
				793	char temp[24];
				794	int len = upb_Unicode_ToUTF8(code_point, temp);
				795	if (len == 0) {
				796	// ConsumeString permits hex values up to 0x1FFFFF,
				797	// and FetchUnicodePoint doesn't perform a range check.
				798	// Unicode code points end at 0x10FFFF, so this is out-of-range.
				799	len = snprintf(temp, sizeof temp, "\\U%08x", code_point);
				800	}
				801	upb_String_Append(output, temp, len);
				802	}
				803
				804	// Try to read <len> hex digits from ptr, and stuff the numeric result into
				805	// *result. Returns true if that many digits were successfully consumed.
				806	static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
				807	*result = 0;
				808	if (len == 0) return false;
				809	for (const char* end = ptr + len; ptr < end; ++ptr) {
				810	if (*ptr == '\0') return false;
				811	result = (result << 4) + DigitValue(*ptr);
				812	}
				813	return true;
				814	}
				815
				816	// Convert the escape sequence parameter to a number of expected hex digits.
				817	static int UnicodeLength(char key) {
				818	if (key == 'u') return 4;
				819	if (key == 'U') return 8;
				820	return 0;
				821	}
				822
				823	// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
				824	// to parse that sequence. On success, returns a pointer to the first char
				825	// beyond that sequence, and fills in *code_point. On failure, returns ptr
				826	// itself.
				827	static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
				828	const char* p = ptr;
				829	// Fetch the code point.
				830	const int len = UnicodeLength(*p++);
				831	if (!ReadHexDigits(p, len, code_point)) return ptr;
				832	p += len;
				833
				834	// Check if the code point we read is a "head surrogate." If so, then we
				835	// expect it to be immediately followed by another code point which is a valid
				836	// "trail surrogate," and together they form a UTF-16 pair which decodes into
				837	// a single Unicode point. Trail surrogates may only use \u, not \U.
				838	if (upb_Unicode_IsHigh(code_point) && p == '\\' && *(p + 1) == 'u') {
				839	uint32_t trail_surrogate;
				840	if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
				841	upb_Unicode_IsLow(trail_surrogate)) {
				842	code_point = upb_Unicode_FromPair(code_point, trail_surrogate);
				843	p += 6;
				844	}
				845	// If this failed, then we just emit the head surrogate as a code point.
				846	// It's bogus, but so is the string.
				847	}
				848
				849	return p;
				850	}
				851
				852	// The text string must begin and end with single or double quote characters.
				853	upb_StringView upb_Parse_String(const char* text, upb_Arena* arena) {
				854	const size_t size = strlen(text);
				855
				856	upb_String output;
				857	upb_String_Init(&output, arena);
				858
				859	// Reminder: text[0] is always a quote character.
				860	// (If text is empty, it's invalid, so we'll just return).
				861	if (size == 0) {
				862	fprintf(stderr,
				863	"Tokenizer::ParseStringAppend() passed text that could not"
				864	" have been tokenized as a string: %s",
				865	text);
				866	UPB_ASSERT(0);
				867	return upb_StringView_FromDataAndSize(NULL, 0);
				868	}
				869
				870	// Reserve room for new string.
				871	const size_t new_len = size + upb_String_Size(&output);
				872	upb_String_Reserve(&output, new_len);
				873
				874	// Loop through the string copying characters to "output" and
				875	// interpreting escape sequences. Note that any invalid escape
				876	// sequences or other errors were already reported while tokenizing.
				877	// In this case we do not need to produce valid results.
				878	for (const char* ptr = text + 1; *ptr != '\0'; ptr++) {
				879	if (*ptr == '\\' && ptr[1] != '\0') {
				880	// An escape sequence.
				881	++ptr;
				882
				883	if (upb_Tokenizer_IsOctalDigit(*ptr)) {
				884	// An octal escape. May one, two, or three digits.
				885	int code = DigitValue(*ptr);
				886	if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
				887	++ptr;
				888	code = code * 8 + DigitValue(*ptr);
				889	}
				890	if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
				891	++ptr;
				892	code = code * 8 + DigitValue(*ptr);
				893	}
				894	upb_String_PushBack(&output, (char)code);
				895
				896	} else if (*ptr == 'x') {
				897	// A hex escape. May zero, one, or two digits. (The zero case
				898	// will have been caught as an error earlier.)
				899	int code = 0;
				900	if (upb_Tokenizer_IsHexDigit(ptr[1])) {
				901	++ptr;
				902	code = DigitValue(*ptr);
				903	}
				904	if (upb_Tokenizer_IsHexDigit(ptr[1])) {
				905	++ptr;
				906	code = code * 16 + DigitValue(*ptr);
				907	}
				908	upb_String_PushBack(&output, (char)code);
				909
				910	} else if (ptr == 'u' \|\| ptr == 'U') {
				911	uint32_t unicode;
				912	const char* end = FetchUnicodePoint(ptr, &unicode);
				913	if (end == ptr) {
				914	// Failure: Just dump out what we saw, don't try to parse it.
				915	upb_String_PushBack(&output, *ptr);
				916	} else {
				917	AppendUTF8(unicode, &output);
				918	ptr = end - 1; // Because we're about to ++ptr.
				919	}
				920	} else {
				921	// Some other escape code.
				922	upb_String_PushBack(&output, TranslateEscape(*ptr));
				923	}
				924
				925	} else if (*ptr == text[0] && ptr[1] == '\0') {
				926	// Ignore final quote matching the starting quote.
				927	} else {
				928	upb_String_PushBack(&output, *ptr);
				929	}
				930	}
				931
				932	return upb_StringView_FromDataAndSize(upb_String_Data(&output),
				933	upb_String_Size(&output));
				934	}
				935
				936	static bool AllInClass(bool (f)(char), const char text, int size) {
				937	for (int i = 0; i < size; i++) {
				938	if (!f(text[i])) return false;
				939	}
				940	return true;
				941	}
				942
				943	bool upb_Tokenizer_IsIdentifier(const char* data, int size) {
				944	// Mirrors IDENTIFIER definition in Tokenizer::Next() above.
				945	if (size == 0) return false;
				946	if (!upb_Tokenizer_IsLetter(data[0])) return false;
				947	if (!AllInClass(upb_Tokenizer_IsAlphanumeric, data + 1, size - 1))
				948	return false;
				949	return true;
				950	}
				951
				952	upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
				953	upb_ZeroCopyInputStream* input, int options,
				954	upb_Arena* arena) {
				955	upb_Tokenizer* t = upb_Arena_Malloc(arena, sizeof(upb_Tokenizer));
				956	if (!t) return NULL;
				957
				958	t->input = input;
				959	t->arena = arena;
				960	t->buffer = data;
				961	t->buffer_size = size;
				962	t->buffer_pos = 0;
				963	t->read_error = false;
				964	t->line = 0;
				965	t->column = 0;
				966	t->record_target = NULL;
				967	t->record_start = -1;
				968
				969	// ReportNewlines implies ReportWhitespace.
				970	if (options & kUpb_TokenizerOption_ReportNewlines) {
				971	options \|= kUpb_TokenizerOption_ReportWhitespace;
				972	}
				973	t->options = options;
				974
				975	upb_String_Init(&t->token_text, arena);
				976	t->token_type = kUpb_TokenType_Start;
				977	t->token_line = 0;
				978	t->token_column = 0;
				979	t->token_end_column = 0;
				980
				981	t->previous_type = kUpb_TokenType_Start;
				982	t->previous_line = 0;
				983	t->previous_column = 0;
				984	t->previous_end_column = 0;
				985
				986	if (size) {
				987	t->current_char = t->buffer[0];
				988	} else {
				989	Refresh(t);
				990	}
				991	return t;
				992	}
				993
				994	void upb_Tokenizer_Fini(upb_Tokenizer* t) {
				995	// If we had any buffer left unread, return it to the underlying stream
				996	// so that someone else can read it.
				997	if (t->buffer_size > t->buffer_pos) {
				998	upb_ZeroCopyInputStream_BackUp(t->input, t->buffer_size - t->buffer_pos);
				999	}
				1000	}