blob: cfc1d400f778b9bd2345476fa355a8027a46720d [file] [log] [blame]
Adam Cozzette501ecec2023-09-26 14:36:20 -07001// Protocol Buffers - Google's data interchange format
2// Copyright 2023 Google LLC. All rights reserved.
3//
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file or at
6// https://developers.google.com/open-source/licenses/bsd
7
8#include "upb/io/tokenizer.h"
9
10#include "upb/io/string.h"
11#include "upb/lex/strtod.h"
12#include "upb/lex/unicode.h"
13
14// Must be included last.
15#include "upb/port/def.inc"
16
// Outcome of probing the input for the beginning of a comment.
typedef enum {
  // A line comment has just been opened.
  kUpb_CommentType_Line = 0,

  // A block comment has just been opened.
  kUpb_CommentType_Block = 1,

  // A slash was consumed but no comment followed it. The tokenizer's current
  // token has been set to that slash; the caller should return it.
  kUpb_CommentType_SlashNot = 2,

  // Nothing that looks like a comment starts at this position.
  kUpb_CommentType_None = 3,
} upb_CommentType;
31
// Reports whether `c` is a control character that sorts strictly between NUL
// and the space character. NUL itself does not count.
static bool upb_Tokenizer_IsUnprintable(char c) {
  if (c <= '\0') return false;
  return c < ' ';
}
33
// Column counting has to assign some width to a tab. Lacking anything better,
// a tab advances to the next multiple of the conventional 8 columns.
enum { kUpb_Tokenizer_TabWidth = 8 };
37
// Maps every byte value to the digit it denotes: '0'-'9' give 0-9 and letters
// of either case give 10-35. Every other byte maps to 36, a value that is
// >= any supported base (bases go up to 36) and therefore marks "not a digit".
static const int8_t kUpb_Tokenizer_AsciiToInt[256] = {
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 00-0F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 10-1F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 20-2F
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  36, 36, 36, 36, 36, 36,  // 30-3F
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  // 40-4F
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,  // 50-5F
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  // 60-6F
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,  // 70-7F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 80-8F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 90-9F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // A0-AF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // B0-BF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // C0-CF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // D0-DF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // E0-EF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // F0-FF
};

// Returns the numeric value of `digit` (0-35), or 36 when the character is
// not a digit in any supported base.
static int DigitValue(char digit) {
  // Index by the unsigned byte value so that a negative `char` stays in range.
  const unsigned char index = (unsigned char)digit;
  return kUpb_Tokenizer_AsciiToInt[index];
}
68
// True for ASCII letters of either case and for the underscore.
static bool upb_Tokenizer_IsLetter(char c) {
  if (c == '_') return true;
  if (c >= 'a' && c <= 'z') return true;
  return c >= 'A' && c <= 'Z';
}
72
// True for the ASCII decimal digits '0' through '9'.
static bool upb_Tokenizer_IsDigit(char c) {
  return !(c < '0' || c > '9');
}
74
// True for the ASCII octal digits '0' through '7'.
static bool upb_Tokenizer_IsOctalDigit(char c) {
  return !(c < '0' || c > '7');
}
76
// True for ASCII hexadecimal digits: '0'-'9', 'a'-'f' and 'A'-'F'.
static bool upb_Tokenizer_IsHexDigit(char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
81
// True when `c` is either a letter (underscore included) or a decimal digit.
static bool upb_Tokenizer_IsAlphanumeric(char c) {
  if (upb_Tokenizer_IsLetter(c)) return true;
  return upb_Tokenizer_IsDigit(c);
}
85
// True for every whitespace character except the newline: space, tab,
// carriage return, vertical tab and form feed.
static bool upb_Tokenizer_IsWhitespaceNoNewline(char c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\v':
    case '\f':
      return true;
    default:
      return false;
  }
}
89
// True for a newline or for any other whitespace character.
static bool upb_Tokenizer_IsWhitespace(char c) {
  if (c == '\n') return true;
  return upb_Tokenizer_IsWhitespaceNoNewline(c);
}
93
// True when `c` is a character that may directly follow a backslash to form a
// simple (single-character) escape sequence.
static bool upb_Tokenizer_IsEscape(char c) {
  switch (c) {
    case 'a':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
    case 'v':
    case '\\':
    case '?':
    case '\'':
    case '\"':
      return true;
    default:
      return false;
  }
}
98
// Returns the character denoted by the simple escape sequence "\<c>".
// Escape sequences are expected to have been validated separately; an
// unrecognized `c` yields '?'.
static char TranslateEscape(char c) {
  // kEscapeKeys[i] is the letter after the backslash and kEscapeValues[i] is
  // the character it stands for.
  static const char kEscapeKeys[] = "abfnrtv\\?'\"";
  static const char kEscapeValues[] = "\a\b\f\n\r\t\v\\?'\"";

  // The terminating NUL of the key string is excluded from the search.
  const int count = (int)sizeof(kEscapeKeys) - 1;
  for (int i = 0; i < count; ++i) {
    if (kEscapeKeys[i] == c) return kEscapeValues[i];
  }
  return '?';
}
129
130// ===================================================================
131
132struct upb_Tokenizer {
133 upb_TokenType token_type; // The type of the current token.
134
135 // The exact text of the current token as it appeared in the input.
136 // e.g. tokens of TYPE_STRING will still be escaped and in quotes.
137 upb_String token_text;
138
139 // "line" and "column" specify the position of the first character of
140 // the token within the input stream. They are zero-based.
141 int token_line;
142 int token_column;
143 int token_end_column;
144
145 upb_ZeroCopyInputStream* input;
146 upb_Arena* arena;
147 upb_Status* status;
148
149 char current_char; // == buffer_[buffer_pos_], updated by NextChar().
150 const char* buffer; // Current buffer returned from input_.
151 size_t buffer_size; // Size of buffer_.
152 size_t buffer_pos; // Current position within the buffer.
153 bool read_error; // Did we previously encounter a read error?
154
155 // Line and column number of current_char_ within the whole input stream.
156 int line;
157
158 // By "column number", the proto compiler refers to a count of the number
159 // of bytes before a given byte, except that a tab character advances to
160 // the next multiple of 8 bytes. Note in particular that column numbers
161 // are zero-based, while many user interfaces use one-based column numbers.
162 int column;
163
164 // Cached values from before the most recent call to Next()
165 upb_TokenType previous_type;
166 int previous_line;
167 int previous_column;
168 int previous_end_column;
169
170 // String to which text should be appended as we advance through it.
171 // Call RecordTo(&str) to start recording and StopRecording() to stop.
172 // E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
173 // position within the current buffer where recording started.
174 upb_String* record_target;
175 int record_start;
176 int options;
177 jmp_buf err;
178};
179
180// Convenience methods to return an error at the current line and column.
181
182UPB_NORETURN static void ReportError(upb_Tokenizer* t, const char* msg) {
183 upb_Status_SetErrorFormat(t->status, "%d:%d: %s", t->line, t->column, msg);
184 UPB_LONGJMP(t->err, 1);
185}
186
187UPB_NORETURN UPB_PRINTF(2, 3) static void ReportErrorFormat(upb_Tokenizer* t,
188 const char* fmt,
189 ...) {
190 va_list args;
191 va_start(args, fmt);
192 char msg[128];
193 vsnprintf(msg, sizeof(msg), fmt, args);
194 ReportError(t, msg);
195}
196
197// Read a new buffer from the input.
198static void Refresh(upb_Tokenizer* t) {
199 if (t->read_error) {
200 t->current_char = '\0';
201 return;
202 }
203
204 // If we're in a token, append the rest of the buffer to it.
205 if (t->record_target != NULL && t->record_start < t->buffer_size) {
206 upb_String_Append(t->record_target, t->buffer + t->record_start,
207 t->buffer_size - t->record_start);
208 t->record_start = 0;
209 }
210
211 t->buffer = NULL;
212 t->buffer_pos = 0;
213
214 upb_Status status;
215 const void* data =
216 upb_ZeroCopyInputStream_Next(t->input, &t->buffer_size, &status);
217
218 if (t->buffer_size > 0) {
219 t->buffer = data;
220 t->current_char = t->buffer[0];
221 } else {
222 // end of stream (or read error)
223 t->buffer_size = 0;
224 t->read_error = true;
225 t->current_char = '\0';
226 }
227}
228
229// Consume this character and advance to the next one.
230static void NextChar(upb_Tokenizer* t) {
231 // Update our line and column counters based on the character being
232 // consumed.
233 if (t->current_char == '\n') {
234 t->line++;
235 t->column = 0;
236 } else if (t->current_char == '\t') {
237 t->column += kUpb_Tokenizer_TabWidth - t->column % kUpb_Tokenizer_TabWidth;
238 } else {
239 t->column++;
240 }
241
242 // Advance to the next character.
243 t->buffer_pos++;
244 if (t->buffer_pos < t->buffer_size) {
245 t->current_char = t->buffer[t->buffer_pos];
246 } else {
247 Refresh(t);
248 }
249}
250
251static void RecordTo(upb_Tokenizer* t, upb_String* target) {
252 t->record_target = target;
253 t->record_start = t->buffer_pos;
254}
255
256static void StopRecording(upb_Tokenizer* t) {
257 if (t->buffer_pos > t->record_start) {
258 upb_String_Append(t->record_target, t->buffer + t->record_start,
259 t->buffer_pos - t->record_start);
260 }
261 t->record_target = NULL;
262 t->record_start = -1;
263}
264
265// Called when the current character is the first character of a new
266// token (not including whitespace or comments).
267static void StartToken(upb_Tokenizer* t) {
268 t->token_type = kUpb_TokenType_Start;
269 upb_String_Clear(&t->token_text);
270 t->token_line = t->line;
271 t->token_column = t->column;
272 RecordTo(t, &t->token_text);
273}
274
275// Called when the current character is the first character after the
276// end of the last token. After this returns, current_.text will
277// contain all text consumed since StartToken() was called.
278static void EndToken(upb_Tokenizer* t) {
279 StopRecording(t);
280 t->token_end_column = t->column;
281}
282
283// -----------------------------------------------------------------
284// These helper methods make the parsing code more readable.
285// The "character classes" referred to are defined at the top of the file.
286// The method returns true if c is a member of this "class", like "Letter"
287// or "Digit".
288
289// Returns true if the current character is of the given character
290// class, but does not consume anything.
291static bool LookingAt(const upb_Tokenizer* t, bool (*f)(char)) {
292 return f(t->current_char);
293}
294
295// If the current character is in the given class, consume it and return true.
296// Otherwise return false.
297static bool TryConsumeOne(upb_Tokenizer* t, bool (*f)(char)) {
298 if (f(t->current_char)) {
299 NextChar(t);
300 return true;
301 } else {
302 return false;
303 }
304}
305
306// Like above, but try to consume the specific character indicated.
307static bool TryConsume(upb_Tokenizer* t, char c) {
308 if (t->current_char == c) {
309 NextChar(t);
310 return true;
311 } else {
312 return false;
313 }
314}
315
316// Consume zero or more of the given character class.
317static void ConsumeZeroOrMore(upb_Tokenizer* t, bool (*f)(char)) {
318 while (f(t->current_char)) {
319 NextChar(t);
320 }
321}
322
323// Consume one or more of the given character class or log the given
324// error message.
325static void ConsumeOneOrMore(upb_Tokenizer* t, bool (*f)(char),
326 const char* err_msg) {
327 if (!f(t->current_char)) {
328 ReportError(t, err_msg);
329 }
330
331 do {
332 NextChar(t);
333 } while (f(t->current_char));
334}
335
336// -----------------------------------------------------------------
337// The following four methods are used to consume tokens of specific
338// types. They are actually used to consume all characters *after*
339// the first, since the calling function consumes the first character
340// in order to decide what kind of token is being read.
341
342// Read and consume a string, ending when the given delimiter is consumed.
343static void ConsumeString(upb_Tokenizer* t, char delimiter) {
344 while (true) {
345 switch (t->current_char) {
346 case '\0':
347 ReportError(t, "Unexpected end of string.");
348
349 case '\n':
350 ReportError(t, "String literals cannot cross line boundaries.");
351
352 case '\\': {
353 // An escape sequence.
354 NextChar(t);
355 if (TryConsumeOne(t, upb_Tokenizer_IsEscape)) {
356 // Valid escape sequence.
357 } else if (TryConsumeOne(t, upb_Tokenizer_IsOctalDigit)) {
358 // Possibly followed by two more octal digits, but these will
359 // just be consumed by the main loop anyway so we don't need
360 // to do so explicitly here.
361 } else if (TryConsume(t, 'x')) {
362 if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
363 ReportError(t, "Expected hex digits for escape sequence.");
364 }
365 // Possibly followed by another hex digit, but again we don't care.
366 } else if (TryConsume(t, 'u')) {
367 if (!TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
368 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
369 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
370 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
371 ReportError(t, "Expected four hex digits for \\u escape sequence.");
372 }
373 } else if (TryConsume(t, 'U')) {
374 // We expect 8 hex digits; but only the range up to 0x10ffff is
375 // legal.
376 if (!TryConsume(t, '0') || !TryConsume(t, '0') ||
377 !(TryConsume(t, '0') || TryConsume(t, '1')) ||
378 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
379 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
380 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
381 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit) ||
382 !TryConsumeOne(t, upb_Tokenizer_IsHexDigit)) {
383 ReportError(t,
384 "Expected eight hex digits up to 10ffff for \\U escape "
385 "sequence");
386 }
387 } else {
388 ReportError(t, "Invalid escape sequence in string literal.");
389 }
390 break;
391 }
392
393 default: {
394 if (t->current_char == delimiter) {
395 NextChar(t);
396 return;
397 }
398 NextChar(t);
399 break;
400 }
401 }
402 }
403}
404
405// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER depending
406// on what was read. This needs to know if the first characer was a zero in
407// order to correctly recognize hex and octal numbers. It also needs to know
408// whether the first character was a '.' to parse floating point correctly.
409static upb_TokenType ConsumeNumber(upb_Tokenizer* t, bool started_with_zero,
410 bool started_with_dot) {
411 bool is_float = false;
412
413 if (started_with_zero && (TryConsume(t, 'x') || TryConsume(t, 'X'))) {
414 // A hex number (started with "0x").
415 ConsumeOneOrMore(t, upb_Tokenizer_IsHexDigit,
416 "\"0x\" must be followed by hex digits.");
417
418 } else if (started_with_zero && LookingAt(t, upb_Tokenizer_IsDigit)) {
419 // An octal number (had a leading zero).
420 ConsumeZeroOrMore(t, upb_Tokenizer_IsOctalDigit);
421 if (LookingAt(t, upb_Tokenizer_IsDigit)) {
422 ReportError(t, "Numbers starting with leading zero must be in octal.");
423 }
424
425 } else {
426 // A decimal number.
427 if (started_with_dot) {
428 is_float = true;
429 ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
430 } else {
431 ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
432
433 if (TryConsume(t, '.')) {
434 is_float = true;
435 ConsumeZeroOrMore(t, upb_Tokenizer_IsDigit);
436 }
437 }
438
439 if (TryConsume(t, 'e') || TryConsume(t, 'E')) {
440 is_float = true;
441 if (!TryConsume(t, '-')) TryConsume(t, '+');
442 ConsumeOneOrMore(t, upb_Tokenizer_IsDigit,
443 "\"e\" must be followed by exponent.");
444 }
445
446 if (t->options & kUpb_TokenizerOption_AllowFAfterFloat) {
447 if (TryConsume(t, 'f') || TryConsume(t, 'F')) is_float = true;
448 }
449 }
450
451 if (LookingAt(t, upb_Tokenizer_IsLetter)) {
452 ReportError(t, "Need space between number and identifier.");
453 }
454
455 if (t->current_char == '.') {
456 if (is_float) {
457 ReportError(
458 t, "Already saw decimal point or exponent; can't have another one.");
459 } else {
460 ReportError(t, "Hex and octal numbers must be integers.");
461 }
462 }
463
464 return is_float ? kUpb_TokenType_Float : kUpb_TokenType_Integer;
465}
466
467// Consume the rest of a line.
468static void ConsumeLineComment(upb_Tokenizer* t, upb_String* content) {
469 if (content != NULL) RecordTo(t, content);
470
471 while (t->current_char != '\0' && t->current_char != '\n') {
472 NextChar(t);
473 }
474 TryConsume(t, '\n');
475
476 if (content != NULL) StopRecording(t);
477}
478
479static void ConsumeBlockComment(upb_Tokenizer* t, upb_String* content) {
480 const int start_line = t->line;
481 const int start_column = t->column - 2;
482
483 if (content != NULL) RecordTo(t, content);
484
485 while (true) {
486 while (t->current_char != '\0' && t->current_char != '*' &&
487 t->current_char != '/' && t->current_char != '\n') {
488 NextChar(t);
489 }
490
491 if (TryConsume(t, '\n')) {
492 if (content != NULL) StopRecording(t);
493
494 // Consume leading whitespace and asterisk;
495 ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
496 if (TryConsume(t, '*')) {
497 if (TryConsume(t, '/')) {
498 // End of comment.
499 break;
500 }
501 }
502
503 if (content != NULL) RecordTo(t, content);
504 } else if (TryConsume(t, '*') && TryConsume(t, '/')) {
505 // End of comment.
506 if (content != NULL) {
507 StopRecording(t);
508 // Strip trailing "*/".
509 upb_String_Erase(content, upb_String_Size(content) - 2, 2);
510 }
511 break;
512 } else if (TryConsume(t, '/') && t->current_char == '*') {
513 // Note: We didn't consume the '*' because if there is a '/' after it
514 // we want to interpret that as the end of the comment.
515 ReportError(
516 t, "\"/*\" inside block comment. Block comments cannot be nested.");
517 } else if (t->current_char == '\0') {
518 ReportErrorFormat(
519 t, "End-of-file inside block comment.\n%d:%d: Comment started here.",
520 start_line, start_column);
521 }
522 }
523}
524
525// If we're at the start of a new comment, consume it and return what kind
526// of comment it is.
527static upb_CommentType TryConsumeCommentStart(upb_Tokenizer* t) {
528 const bool style_sh = t->options & kUpb_TokenizerOption_CommentStyleShell;
529 const bool style_cpp = !style_sh;
530
531 if (style_cpp && TryConsume(t, '/')) {
532 if (TryConsume(t, '/')) {
533 return kUpb_CommentType_Line;
534 } else if (TryConsume(t, '*')) {
535 return kUpb_CommentType_Block;
536 } else {
537 // Oops, it was just a slash. Return it.
538 t->token_type = kUpb_TokenType_Symbol;
539 upb_String_Assign(&t->token_text, "/", 1);
540 t->token_line = t->line;
541 t->token_column = t->column - 1;
542 t->token_end_column = t->column;
543 return kUpb_CommentType_SlashNot;
544 }
545 } else if (style_sh && TryConsume(t, '#')) {
546 return kUpb_CommentType_Line;
547 } else {
548 return kUpb_CommentType_None;
549 }
550}
551
552// If we're looking at a TYPE_WHITESPACE token and `report_whitespace` is true,
553// consume it and return true.
554static bool TryConsumeWhitespace(upb_Tokenizer* t) {
555 if (t->options & kUpb_TokenizerOption_ReportNewlines) {
556 if (TryConsumeOne(t, upb_Tokenizer_IsWhitespaceNoNewline)) {
557 ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespaceNoNewline);
558 t->token_type = kUpb_TokenType_Whitespace;
559 return true;
560 }
561 return false;
562 }
563 if (TryConsumeOne(t, upb_Tokenizer_IsWhitespace)) {
564 ConsumeZeroOrMore(t, upb_Tokenizer_IsWhitespace);
565 t->token_type = kUpb_TokenType_Whitespace;
566 return (t->options & kUpb_TokenizerOption_ReportWhitespace) != 0;
567 }
568 return false;
569}
570
571// If we're looking at a TYPE_NEWLINE token and `report_newlines` is true,
572// consume it and return true.
573static bool TryConsumeNewline(upb_Tokenizer* t) {
574 if (t->options & kUpb_TokenizerOption_ReportNewlines) {
575 if (TryConsume(t, '\n')) {
576 t->token_type = kUpb_TokenType_Newline;
577 return true;
578 }
579 }
580 return false;
581}
582
583// -------------------------------------------------------------------
584
585int upb_Tokenizer_Column(const upb_Tokenizer* t) { return t->token_column; }
586
587int upb_Tokenizer_EndColumn(const upb_Tokenizer* t) {
588 return t->token_end_column;
589}
590
591int upb_Tokenizer_Line(const upb_Tokenizer* t) { return t->token_line; }
592
593int upb_Tokenizer_TextSize(const upb_Tokenizer* t) {
594 return t->token_text.size_;
595}
596
597const char* upb_Tokenizer_TextData(const upb_Tokenizer* t) {
598 return t->token_text.data_;
599}
600
601upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t) {
602 return t->token_type;
603}
604
605bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status) {
606 t->status = status;
607 t->previous_type = t->token_type;
608 t->previous_line = t->token_line;
609 t->previous_column = t->token_column;
610 t->previous_end_column = t->token_end_column;
611
612 if (UPB_SETJMP(t->err)) return false;
613
614 while (!t->read_error) {
615 StartToken(t);
616 bool report_token = TryConsumeWhitespace(t) || TryConsumeNewline(t);
617 EndToken(t);
618 if (report_token) return true;
619
620 switch (TryConsumeCommentStart(t)) {
621 case kUpb_CommentType_Line:
622 ConsumeLineComment(t, NULL);
623 continue;
624 case kUpb_CommentType_Block:
625 ConsumeBlockComment(t, NULL);
626 continue;
627 case kUpb_CommentType_SlashNot:
628 return true;
629 case kUpb_CommentType_None:
630 break;
631 }
632
633 // Check for EOF before continuing.
634 if (t->read_error) break;
635
636 if (LookingAt(t, upb_Tokenizer_IsUnprintable) || t->current_char == '\0') {
637 ReportError(t, "Invalid control characters encountered in text.");
638 }
639
640 // Reading some sort of token.
641 StartToken(t);
642
643 if (TryConsumeOne(t, upb_Tokenizer_IsLetter)) {
644 ConsumeZeroOrMore(t, upb_Tokenizer_IsAlphanumeric);
645 t->token_type = kUpb_TokenType_Identifier;
646 } else if (TryConsume(t, '0')) {
647 t->token_type = ConsumeNumber(t, true, false);
648 } else if (TryConsume(t, '.')) {
649 // This could be the beginning of a floating-point number, or it could
650 // just be a '.' symbol.
651
652 if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
653 // It's a floating-point number.
654 if (t->previous_type == kUpb_TokenType_Identifier &&
655 t->token_line == t->previous_line &&
656 t->token_column == t->previous_end_column) {
657 // We don't accept syntax like "blah.123".
658 t->column -= 2;
659 ReportError(t, "Need space between identifier and decimal point.");
660 }
661 t->token_type = ConsumeNumber(t, false, true);
662 } else {
663 t->token_type = kUpb_TokenType_Symbol;
664 }
665 } else if (TryConsumeOne(t, upb_Tokenizer_IsDigit)) {
666 t->token_type = ConsumeNumber(t, false, false);
667 } else if (TryConsume(t, '\"')) {
668 ConsumeString(t, '\"');
669 t->token_type = kUpb_TokenType_String;
670 } else if (TryConsume(t, '\'')) {
671 ConsumeString(t, '\'');
672 t->token_type = kUpb_TokenType_String;
673 } else {
674 // Check if the high order bit is set.
675 if (t->current_char & 0x80) {
676 ReportErrorFormat(t, "Interpreting non ascii codepoint %d.",
677 (uint8_t)t->current_char);
678 }
679 NextChar(t);
680 t->token_type = kUpb_TokenType_Symbol;
681 }
682
683 EndToken(t);
684 return true;
685 }
686
687 // EOF
688 t->token_type = kUpb_TokenType_End;
689 upb_String_Clear(&t->token_text);
690 t->token_line = t->line;
691 t->token_column = t->column;
692 t->token_end_column = t->column;
693 upb_Status_Clear(status);
694 return false;
695}
696
697// -------------------------------------------------------------------
698// Token-parsing helpers. Remember that these don't need to report
699// errors since any errors should already have been reported while
700// tokenizing. Also, these can assume that whatever text they
701// are given is text that the tokenizer actually parsed as a token
702// of the given type.
703
// Parses `text` as an unsigned integer literal: "0x"/"0X" selects hex, any
// other leading '0' selects octal, everything else is decimal. On success
// stores the value in `*output` and returns true. Returns false if a
// character is not a digit of the selected base, if the value does not fit in
// 64 bits, or if it exceeds `max_value`.
//
// strtoull() is not used because it accepts negative numbers, because extra
// range checks are wanted, and because it reports overflow via errno.
bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output) {
  const char* ptr = text;
  int base = 10;
  if (ptr[0] == '0') {
    const bool is_hex = ptr[1] == 'x' || ptr[1] == 'X';
    base = is_hex ? 16 : 8;
    if (is_hex) ptr += 2;
  }

  // Smallest value for which multiplying by `base` overflows 64 bits.
  const uint64_t overflow_if_mul_base = UINT64_MAX / (uint64_t)base + 1;

  // Leading zeros, and the first non-zero digit, need no multiplication.
  uint64_t result = 0;
  while (*ptr != '\0') {
    const int digit = DigitValue(*ptr);
    ++ptr;
    if (digit >= base) {
      // Not a valid token, e.g. "099", even though the tokenizer may have
      // classified it as an integer.
      return false;
    }
    if (digit != 0) {
      result = (uint64_t)digit;
      break;
    }
  }

  while (*ptr != '\0') {
    const int digit = DigitValue(*ptr);
    if (digit < 0 || digit >= base) {
      // Same situation as above: an invalid digit for this base.
      return false;
    }
    if (result >= overflow_if_mul_base) {
      // The multiplication below would overflow.
      return false;
    }
    // result * base cannot overflow here, but adding the digit still might.
    result = result * (uint64_t)base + (uint64_t)digit;
    // Unsigned arithmetic wraps on overflow, and a wrapped sum is necessarily
    // smaller than `base`, whereas a correct one is at least `base`.
    if (result < (uint64_t)base) return false;
    ++ptr;
  }

  if (result > max_value) return false;

  *output = result;
  return true;
}
762
// Parses `text`, which should be a float token produced by the tokenizer, and
// returns its value. Text that could not have been tokenized as a float
// triggers a diagnostic on stderr and an assertion failure.
double upb_Parse_Float(const char* text) {
  char* end;
  const double result = _upb_NoLocaleStrtod(text, &end);

  // "1e" is not a valid float, but the tokenizer, after reporting an error,
  // still hands it out as a token. Anything the tokenizer could return must
  // be accepted, so step over a dangling exponent marker and sign.
  switch (*end) {
    case 'e':
    case 'E':
      ++end;
      if (*end == '-' || *end == '+') ++end;
      break;
    default:
      break;
  }

  // With the "f after float" option enabled the token may carry an 'f'
  // suffix.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  const bool consumed_all = (size_t)(end - text) == strlen(text);
  if (!consumed_all || *text == '-') {
    fprintf(stderr,
            "upb_Parse_Float() passed text that could not have"
            " been tokenized as a float: %s\n",
            text);
    UPB_ASSERT(0);
  }
  return result;
}
790
791// Append a Unicode code point to a string as UTF8.
792static void AppendUTF8(uint32_t code_point, upb_String* output) {
793 char temp[24];
794 int len = upb_Unicode_ToUTF8(code_point, temp);
795 if (len == 0) {
796 // ConsumeString permits hex values up to 0x1FFFFF,
797 // and FetchUnicodePoint doesn't perform a range check.
798 // Unicode code points end at 0x10FFFF, so this is out-of-range.
799 len = snprintf(temp, sizeof temp, "\\U%08x", code_point);
800 }
801 upb_String_Append(output, temp, len);
802}
803
// Reads exactly `len` hex digits starting at `ptr` and stores their numeric
// value in `*result`. Returns true only if `len` is non-zero and all `len`
// characters are hex digits; reaching the end of the string or any other
// non-hex character first yields false (with `*result` holding the value of
// the digits read so far).
static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
  *result = 0;
  if (len == 0) return false;

  // Walk by index rather than precomputing `ptr + len`: the string may be
  // shorter than `len`, and a pointer past its end must not be formed.
  for (int i = 0; i < len; ++i) {
    const char c = ptr[i];
    // Rejects the terminating NUL as well as any non-hex character, which
    // would otherwise be folded into the value as the out-of-range "digit" 36.
    if (!upb_Tokenizer_IsHexDigit(c)) return false;
    *result = (*result << 4) + (uint32_t)DigitValue(c);
  }
  return true;
}
815
// Returns how many hex digits follow the escape letter `key`: 4 for 'u',
// 8 for 'U', and 0 for anything else.
static int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
822
// Parses a Unicode escape sequence, where `ptr` points at the 'u' or 'U' that
// follows the backslash. On success stores the code point in `*code_point`
// and returns a pointer to the first character after the sequence. On failure
// returns `ptr` itself.
static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
  // Fetch the code point: the escape letter fixes the number of hex digits.
  const int len = UnicodeLength(ptr[0]);
  const char* p = ptr + 1;
  if (!ReadHexDigits(p, len, code_point)) return ptr;
  p += len;

  // A "head surrogate" is expected to be followed immediately by a code point
  // that is a valid "trail surrogate"; together they form a UTF-16 pair that
  // decodes into a single Unicode point. Trail surrogates may only use \u,
  // not \U.
  const bool pair_may_follow =
      upb_Unicode_IsHigh(*code_point) && p[0] == '\\' && p[1] == 'u';
  if (pair_may_follow) {
    uint32_t trail_surrogate;
    const bool have_trail = ReadHexDigits(p + 2, 4, &trail_surrogate) &&
                            upb_Unicode_IsLow(trail_surrogate);
    if (have_trail) {
      *code_point = upb_Unicode_FromPair(*code_point, trail_surrogate);
      p += 6;  // Backslash, 'u' and four hex digits.
    }
    // Otherwise the head surrogate is emitted on its own as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}
851
852// The text string must begin and end with single or double quote characters.
853upb_StringView upb_Parse_String(const char* text, upb_Arena* arena) {
854 const size_t size = strlen(text);
855
856 upb_String output;
857 upb_String_Init(&output, arena);
858
859 // Reminder: text[0] is always a quote character.
860 // (If text is empty, it's invalid, so we'll just return).
861 if (size == 0) {
862 fprintf(stderr,
863 "Tokenizer::ParseStringAppend() passed text that could not"
864 " have been tokenized as a string: %s",
865 text);
866 UPB_ASSERT(0);
867 return upb_StringView_FromDataAndSize(NULL, 0);
868 }
869
870 // Reserve room for new string.
871 const size_t new_len = size + upb_String_Size(&output);
872 upb_String_Reserve(&output, new_len);
873
874 // Loop through the string copying characters to "output" and
875 // interpreting escape sequences. Note that any invalid escape
876 // sequences or other errors were already reported while tokenizing.
877 // In this case we do not need to produce valid results.
878 for (const char* ptr = text + 1; *ptr != '\0'; ptr++) {
879 if (*ptr == '\\' && ptr[1] != '\0') {
880 // An escape sequence.
881 ++ptr;
882
883 if (upb_Tokenizer_IsOctalDigit(*ptr)) {
884 // An octal escape. May one, two, or three digits.
885 int code = DigitValue(*ptr);
886 if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
887 ++ptr;
888 code = code * 8 + DigitValue(*ptr);
889 }
890 if (upb_Tokenizer_IsOctalDigit(ptr[1])) {
891 ++ptr;
892 code = code * 8 + DigitValue(*ptr);
893 }
894 upb_String_PushBack(&output, (char)code);
895
896 } else if (*ptr == 'x') {
897 // A hex escape. May zero, one, or two digits. (The zero case
898 // will have been caught as an error earlier.)
899 int code = 0;
900 if (upb_Tokenizer_IsHexDigit(ptr[1])) {
901 ++ptr;
902 code = DigitValue(*ptr);
903 }
904 if (upb_Tokenizer_IsHexDigit(ptr[1])) {
905 ++ptr;
906 code = code * 16 + DigitValue(*ptr);
907 }
908 upb_String_PushBack(&output, (char)code);
909
910 } else if (*ptr == 'u' || *ptr == 'U') {
911 uint32_t unicode;
912 const char* end = FetchUnicodePoint(ptr, &unicode);
913 if (end == ptr) {
914 // Failure: Just dump out what we saw, don't try to parse it.
915 upb_String_PushBack(&output, *ptr);
916 } else {
917 AppendUTF8(unicode, &output);
918 ptr = end - 1; // Because we're about to ++ptr.
919 }
920 } else {
921 // Some other escape code.
922 upb_String_PushBack(&output, TranslateEscape(*ptr));
923 }
924
925 } else if (*ptr == text[0] && ptr[1] == '\0') {
926 // Ignore final quote matching the starting quote.
927 } else {
928 upb_String_PushBack(&output, *ptr);
929 }
930 }
931
932 return upb_StringView_FromDataAndSize(upb_String_Data(&output),
933 upb_String_Size(&output));
934}
935
// Returns true when each of the first `size` characters of `text` belongs to
// the character class `f`. An empty range trivially qualifies.
static bool AllInClass(bool (*f)(char), const char* text, int size) {
  int i = 0;
  while (i < size) {
    if (!f(text[i])) return false;
    ++i;
  }
  return true;
}
942
943bool upb_Tokenizer_IsIdentifier(const char* data, int size) {
944 // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
945 if (size == 0) return false;
946 if (!upb_Tokenizer_IsLetter(data[0])) return false;
947 if (!AllInClass(upb_Tokenizer_IsAlphanumeric, data + 1, size - 1))
948 return false;
949 return true;
950}
951
952upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
953 upb_ZeroCopyInputStream* input, int options,
954 upb_Arena* arena) {
955 upb_Tokenizer* t = upb_Arena_Malloc(arena, sizeof(upb_Tokenizer));
956 if (!t) return NULL;
957
958 t->input = input;
959 t->arena = arena;
960 t->buffer = data;
961 t->buffer_size = size;
962 t->buffer_pos = 0;
963 t->read_error = false;
964 t->line = 0;
965 t->column = 0;
966 t->record_target = NULL;
967 t->record_start = -1;
968
969 // ReportNewlines implies ReportWhitespace.
970 if (options & kUpb_TokenizerOption_ReportNewlines) {
971 options |= kUpb_TokenizerOption_ReportWhitespace;
972 }
973 t->options = options;
974
975 upb_String_Init(&t->token_text, arena);
976 t->token_type = kUpb_TokenType_Start;
977 t->token_line = 0;
978 t->token_column = 0;
979 t->token_end_column = 0;
980
981 t->previous_type = kUpb_TokenType_Start;
982 t->previous_line = 0;
983 t->previous_column = 0;
984 t->previous_end_column = 0;
985
986 if (size) {
987 t->current_char = t->buffer[0];
988 } else {
989 Refresh(t);
990 }
991 return t;
992}
993
994void upb_Tokenizer_Fini(upb_Tokenizer* t) {
995 // If we had any buffer left unread, return it to the underlying stream
996 // so that someone else can read it.
997 if (t->buffer_size > t->buffer_pos) {
998 upb_ZeroCopyInputStream_BackUp(t->input, t->buffer_size - t->buffer_pos);
999 }
1000}