| /* |
| * Copyright (C) 2023 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "src/trace_processor/sqlite/sqlite_tokenizer.h" |
| |
| #include <ctype.h> |
| #include <sqlite3.h> |
| #include <cstdint> |
| #include <optional> |
| #include <string_view> |
| |
| #include "perfetto/base/compiler.h" |
| #include "perfetto/base/logging.h" |
| |
| namespace perfetto { |
| namespace trace_processor { |
| |
| // The contents of this file are ~copied from SQLite with some modifications to |
| // minimize the amount copied: i.e. if we can call a libc function/public SQLite |
| // API instead of a private one. |
| // |
| // The changes are as follows: |
| // 1. Remove all ifdefs to only keep branches we actually use |
| // 2. Change handling of |CC_KYWD0| to remove distinction between different |
| // SQLite kewords, reducing how many things we need to copy over. |
| // 3. Constants are changed from be macro defines to be values in |
| // |SqliteTokenType|. |
| |
| namespace { |
| |
| const unsigned char sqlite3CtypeMap[256] = { |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */ |
| 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */ |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */ |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */ |
| 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */ |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */ |
| 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */ |
| 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */ |
| |
| 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */ |
| 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */ |
| 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */ |
| 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */ |
| 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */ |
| 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */ |
| 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */ |
| 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */ |
| |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */ |
| |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */ |
| 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */ |
| }; |
| |
| #define CC_X 0 /* The letter 'x', or start of BLOB literal */ |
| #define CC_KYWD0 1 /* First letter of a keyword */ |
| #define CC_KYWD 2 /* Alphabetics or '_'. Usable in a keyword */ |
| #define CC_DIGIT 3 /* Digits */ |
| #define CC_DOLLAR 4 /* '$' */ |
| #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */ |
| #define CC_VARNUM 6 /* '?'. Numeric SQL variables */ |
| #define CC_SPACE 7 /* Space characters */ |
| #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */ |
| #define CC_QUOTE2 9 /* '['. [...] style quoted ids */ |
| #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */ |
| #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */ |
| #define CC_LT 12 /* '<'. Part of < or <= or <> */ |
| #define CC_GT 13 /* '>'. Part of > or >= */ |
| #define CC_EQ 14 /* '='. Part of = or == */ |
| #define CC_BANG 15 /* '!'. Part of != */ |
| #define CC_SLASH 16 /* '/'. / or c-style comment */ |
| #define CC_LP 17 /* '(' */ |
| #define CC_RP 18 /* ')' */ |
| #define CC_SEMI 19 /* ';' */ |
| #define CC_PLUS 20 /* '+' */ |
| #define CC_STAR 21 /* '*' */ |
| #define CC_PERCENT 22 /* '%' */ |
| #define CC_COMMA 23 /* ',' */ |
| #define CC_AND 24 /* '&' */ |
| #define CC_TILDA 25 /* '~' */ |
| #define CC_DOT 26 /* '.' */ |
| #define CC_ID 27 /* unicode characters usable in IDs */ |
| #define CC_NUL 29 /* 0x00 */ |
| #define CC_BOM 30 /* First byte of UTF8 BOM: 0xEF 0xBB 0xBF */ |
| |
| // clang-format off |
| static const unsigned char aiClass[] = { |
| /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ |
| /* 0x */ 29, 28, 28, 28, 28, 28, 28, 28, 28, 7, 7, 28, 7, 7, 28, 28, |
| /* 1x */ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, |
| /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, |
| /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 9, 28, 28, 28, 2, |
| /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 28, 10, 28, 25, 28, |
| /* 8x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* 9x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* Ax */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* Cx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* Dx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| /* Ex */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 30, |
| /* Fx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27 |
| }; |
| // clang-format on |
| |
| #define IdChar(C) ((sqlite3CtypeMap[static_cast<unsigned char>(C)] & 0x46) != 0) |
| |
| // Copy of |sqlite3GetToken| for use by the PerfettoSql transpiler. |
| // |
| // We copy this function because |sqlite3GetToken| is static to sqlite3.c |
| // in most distributions of SQLite so we cannot call it from our code. |
| // |
| // While we could redefine SQLITE_PRIVATE, pragmatically that will not fly in |
| // all the places we build trace processor so we need to resort to making a |
| // copy. |
| int GetSqliteToken(const unsigned char* z, SqliteTokenType* tokenType) { |
| int i, c; |
| switch (aiClass[*z]) { /* Switch on the character-class of the first byte |
| ** of the token. See the comment on the CC_ defines |
| ** above. */ |
| case CC_SPACE: { |
| for (i = 1; isspace(z[i]); i++) { |
| } |
| *tokenType = SqliteTokenType::TK_SPACE; |
| return i; |
| } |
| case CC_MINUS: { |
| if (z[1] == '-') { |
| for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) { |
| } |
| *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */ |
| return i; |
| } else if (z[1] == '>') { |
| *tokenType = SqliteTokenType::TK_PTR; |
| return 2 + (z[2] == '>'); |
| } |
| *tokenType = SqliteTokenType::TK_MINUS; |
| return 1; |
| } |
| case CC_LP: { |
| *tokenType = SqliteTokenType::TK_LP; |
| return 1; |
| } |
| case CC_RP: { |
| *tokenType = SqliteTokenType::TK_RP; |
| return 1; |
| } |
| case CC_SEMI: { |
| *tokenType = SqliteTokenType::TK_SEMI; |
| return 1; |
| } |
| case CC_PLUS: { |
| *tokenType = SqliteTokenType::TK_PLUS; |
| return 1; |
| } |
| case CC_STAR: { |
| *tokenType = SqliteTokenType::TK_STAR; |
| return 1; |
| } |
| case CC_SLASH: { |
| if (z[1] != '*' || z[2] == 0) { |
| *tokenType = SqliteTokenType::TK_SLASH; |
| return 1; |
| } |
| for (i = 3, c = z[2]; (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++) { |
| } |
| if (c) |
| i++; |
| *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */ |
| return i; |
| } |
| case CC_PERCENT: { |
| *tokenType = SqliteTokenType::TK_REM; |
| return 1; |
| } |
| case CC_EQ: { |
| *tokenType = SqliteTokenType::TK_EQ; |
| return 1 + (z[1] == '='); |
| } |
| case CC_LT: { |
| if ((c = z[1]) == '=') { |
| *tokenType = SqliteTokenType::TK_LE; |
| return 2; |
| } else if (c == '>') { |
| *tokenType = SqliteTokenType::TK_NE; |
| return 2; |
| } else if (c == '<') { |
| *tokenType = SqliteTokenType::TK_LSHIFT; |
| return 2; |
| } else { |
| *tokenType = SqliteTokenType::TK_LT; |
| return 1; |
| } |
| } |
| case CC_GT: { |
| if ((c = z[1]) == '=') { |
| *tokenType = SqliteTokenType::TK_GE; |
| return 2; |
| } else if (c == '>') { |
| *tokenType = SqliteTokenType::TK_RSHIFT; |
| return 2; |
| } else { |
| *tokenType = SqliteTokenType::TK_GT; |
| return 1; |
| } |
| } |
| case CC_BANG: { |
| if (z[1] != '=') { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| return 1; |
| } else { |
| *tokenType = SqliteTokenType::TK_NE; |
| return 2; |
| } |
| } |
| case CC_PIPE: { |
| if (z[1] != '|') { |
| *tokenType = SqliteTokenType::TK_BITOR; |
| return 1; |
| } else { |
| *tokenType = SqliteTokenType::TK_CONCAT; |
| return 2; |
| } |
| } |
| case CC_COMMA: { |
| *tokenType = SqliteTokenType::TK_COMMA; |
| return 1; |
| } |
| case CC_AND: { |
| *tokenType = SqliteTokenType::TK_BITAND; |
| return 1; |
| } |
| case CC_TILDA: { |
| *tokenType = SqliteTokenType::TK_BITNOT; |
| return 1; |
| } |
| case CC_QUOTE: { |
| int delim = z[0]; |
| for (i = 1; (c = z[i]) != 0; i++) { |
| if (c == delim) { |
| if (z[i + 1] == delim) { |
| i++; |
| } else { |
| break; |
| } |
| } |
| } |
| if (c == '\'') { |
| *tokenType = SqliteTokenType::TK_STRING; |
| return i + 1; |
| } else if (c != 0) { |
| *tokenType = SqliteTokenType::TK_ID; |
| return i + 1; |
| } else { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| return i; |
| } |
| } |
| case CC_DOT: { |
| if (!isdigit(z[1])) { |
| *tokenType = SqliteTokenType::TK_DOT; |
| return 1; |
| } |
| [[fallthrough]]; |
| } |
| case CC_DIGIT: { |
| *tokenType = SqliteTokenType::TK_INTEGER; |
| if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') && isxdigit(z[2])) { |
| for (i = 3; isxdigit(z[i]); i++) { |
| } |
| return i; |
| } |
| for (i = 0; isxdigit(z[i]); i++) { |
| } |
| if (z[i] == '.') { |
| i++; |
| while (isxdigit(z[i])) { |
| i++; |
| } |
| *tokenType = SqliteTokenType::TK_FLOAT; |
| } |
| if ((z[i] == 'e' || z[i] == 'E') && |
| (isdigit(z[i + 1]) || |
| ((z[i + 1] == '+' || z[i + 1] == '-') && isdigit(z[i + 2])))) { |
| i += 2; |
| while (isdigit(z[i])) { |
| i++; |
| } |
| *tokenType = SqliteTokenType::TK_FLOAT; |
| } |
| while (IdChar(z[i])) { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| i++; |
| } |
| return i; |
| } |
| case CC_QUOTE2: { |
| for (i = 1, c = z[0]; c != ']' && (c = z[i]) != 0; i++) { |
| } |
| *tokenType = |
| c == ']' ? SqliteTokenType::TK_ID : SqliteTokenType::TK_ILLEGAL; |
| return i; |
| } |
| case CC_VARNUM: { |
| *tokenType = SqliteTokenType::TK_VARIABLE; |
| for (i = 1; isdigit(z[i]); i++) { |
| } |
| return i; |
| } |
| case CC_DOLLAR: |
| case CC_VARALPHA: { |
| int n = 0; |
| *tokenType = SqliteTokenType::TK_VARIABLE; |
| for (i = 1; (c = z[i]) != 0; i++) { |
| if (IdChar(c)) { |
| n++; |
| } else if (c == '(' && n > 0) { |
| do { |
| i++; |
| } while ((c = z[i]) != 0 && !isspace(c) && c != ')'); |
| if (c == ')') { |
| i++; |
| } else { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| } |
| break; |
| } else if (c == ':' && z[i + 1] == ':') { |
| i++; |
| } else { |
| break; |
| } |
| } |
| if (n == 0) |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| return i; |
| } |
| case CC_KYWD0: { |
| for (i = 1; aiClass[z[i]] <= CC_KYWD; i++) { |
| } |
| if (IdChar(z[i])) { |
| /* This token started out using characters that can appear in keywords, |
| ** but z[i] is a character not allowed within keywords, so this must |
| ** be an identifier instead */ |
| i++; |
| break; |
| } |
| if (sqlite3_keyword_check(reinterpret_cast<const char*>(z), i)) { |
| *tokenType = SqliteTokenType::TK_GENERIC_KEYWORD; |
| } else { |
| *tokenType = SqliteTokenType::TK_ID; |
| } |
| return i; |
| } |
| case CC_X: { |
| if (z[1] == '\'') { |
| *tokenType = SqliteTokenType::TK_BLOB; |
| for (i = 2; isdigit(z[i]); i++) { |
| } |
| if (z[i] != '\'' || i % 2) { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| while (z[i] && z[i] != '\'') { |
| i++; |
| } |
| } |
| if (z[i]) |
| i++; |
| return i; |
| } |
| [[fallthrough]]; |
| } |
| case CC_KYWD: |
| case CC_ID: { |
| i = 1; |
| break; |
| } |
| case CC_BOM: { |
| if (z[1] == 0xbb && z[2] == 0xbf) { |
| *tokenType = SqliteTokenType::TK_SPACE; |
| return 3; |
| } |
| i = 1; |
| break; |
| } |
| case CC_NUL: { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| return 0; |
| } |
| default: { |
| *tokenType = SqliteTokenType::TK_ILLEGAL; |
| return 1; |
| } |
| } |
| while (IdChar(z[i])) { |
| i++; |
| } |
| *tokenType = SqliteTokenType::TK_ID; |
| return i; |
| } |
| |
| } // namespace |
| |
| SqliteTokenizer::SqliteTokenizer(SqlSource sql) : source_(std::move(sql)) {} |
| |
| SqliteTokenizer::Token SqliteTokenizer::Next() { |
| Token token; |
| const char* start = source_.sql().data() + offset_; |
| int n = GetSqliteToken(reinterpret_cast<const unsigned char*>(start), |
| &token.token_type); |
| offset_ += static_cast<uint32_t>(n); |
| token.str = std::string_view(start, static_cast<uint32_t>(n)); |
| return token; |
| } |
| |
| SqliteTokenizer::Token SqliteTokenizer::NextNonWhitespace() { |
| Token t; |
| for (t = Next(); t.token_type == SqliteTokenType::TK_SPACE; t = Next()) { |
| } |
| return t; |
| } |
| |
| SqliteTokenizer::Token SqliteTokenizer::NextTerminal() { |
| Token tok = Next(); |
| while (!tok.IsTerminal()) { |
| tok = Next(); |
| } |
| return tok; |
| } |
| |
| SqlSource SqliteTokenizer::Substr(const Token& start, const Token& end) const { |
| uint32_t offset = |
| static_cast<uint32_t>(start.str.data() - source_.sql().c_str()); |
| uint32_t len = static_cast<uint32_t>(end.str.data() - start.str.data()); |
| return source_.Substr(offset, len); |
| } |
| |
| SqlSource SqliteTokenizer::SubstrToken(const Token& token) const { |
| uint32_t offset = |
| static_cast<uint32_t>(token.str.data() - source_.sql().c_str()); |
| uint32_t len = static_cast<uint32_t>(token.str.size()); |
| return source_.Substr(offset, len); |
| } |
| |
| std::string SqliteTokenizer::AsTraceback(const Token& token) const { |
| PERFETTO_CHECK(source_.sql().c_str() <= token.str.data()); |
| PERFETTO_CHECK(token.str.data() <= |
| source_.sql().c_str() + source_.sql().size()); |
| uint32_t offset = |
| static_cast<uint32_t>(token.str.data() - source_.sql().c_str()); |
| return source_.AsTraceback(offset); |
| } |
| |
| void SqliteTokenizer::Rewrite(SqlSource::Rewriter& rewriter, |
| const Token& start, |
| const Token& end, |
| SqlSource rewrite, |
| EndToken end_token) const { |
| uint32_t s_off = |
| static_cast<uint32_t>(start.str.data() - source_.sql().c_str()); |
| uint32_t e_off = |
| static_cast<uint32_t>(end.str.data() - source_.sql().c_str()); |
| uint32_t e_diff = end_token == EndToken::kInclusive |
| ? static_cast<uint32_t>(end.str.size()) |
| : 0; |
| rewriter.Rewrite(s_off, e_off + e_diff, std::move(rewrite)); |
| } |
| |
| void SqliteTokenizer::RewriteToken(SqlSource::Rewriter& rewriter, |
| const Token& token, |
| SqlSource rewrite) const { |
| uint32_t s_off = |
| static_cast<uint32_t>(token.str.data() - source_.sql().c_str()); |
| uint32_t e_off = static_cast<uint32_t>(token.str.data() + token.str.size() - |
| source_.sql().c_str()); |
| rewriter.Rewrite(s_off, e_off, std::move(rewrite)); |
| } |
| |
| } // namespace trace_processor |
| } // namespace perfetto |