blob: 972857b7a974e66dba50983a735ec034cb12b68b [file] [log] [blame]
/*
* Copyright (C) 2023 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/trace_processor/sqlite/sqlite_tokenizer.h"
#include <ctype.h>
#include <sqlite3.h>
#include <cstdint>
#include <optional>
#include <string_view>
#include "perfetto/base/compiler.h"
#include "perfetto/base/logging.h"
namespace perfetto {
namespace trace_processor {
// The contents of this file are ~copied from SQLite with some modifications to
// minimize the amount copied: i.e. if we can call a libc function/public SQLite
// API instead of a private one.
//
// The changes are as follows:
// 1. Remove all ifdefs to only keep branches we actually use
// 2. Change handling of |CC_KYWD0| to remove the distinction between different
// SQLite keywords, reducing how many things we need to copy over.
// 3. Constants are changed from being macro defines to being values in
// |SqliteTokenType|.
namespace {
// Per-byte character flags, indexed by the (unsigned) byte value.
//
// Reading the table, the individual bits are set as follows:
//   0x01  tab..carriage-return (0x09-0x0d) and space
//   0x02  ASCII letters
//   0x04  decimal digits
//   0x08  hexadecimal digits (0-9, a-f, A-F)
//   0x20  lower-case ASCII letters
//   0x40  '$', '_' and every byte >= 0x80
//   0x80  the quote characters '"', '\'', '[' and '`'
//
// In this file the table is only consulted through |IdChar|, which tests the
// 0x46 mask (letters, digits, and the 0x40 group).
const unsigned char sqlite3CtypeMap[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */
0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */
0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */
0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */
0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */
0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */
0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */
};
// Character classes. |aiClass| maps every input byte to one of these values
// and |GetSqliteToken| switches on the class of the first byte of a token.
//
// The three smallest values (CC_X, CC_KYWD0, CC_KYWD) must stay the smallest:
// the keyword scan in |GetSqliteToken| continues while
// `aiClass[z[i]] <= CC_KYWD`.
//
// The value 28 is used in |aiClass| but has no named constant here; bytes of
// that class reach the `default` case of the switch and yield TK_ILLEGAL.
#define CC_X 0 /* The letter 'x', or start of BLOB literal */
#define CC_KYWD0 1 /* First letter of a keyword */
#define CC_KYWD 2 /* Alphabetics or '_'. Usable in a keyword */
#define CC_DIGIT 3 /* Digits */
#define CC_DOLLAR 4 /* '$' */
#define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */
#define CC_VARNUM 6 /* '?'. Numeric SQL variables */
#define CC_SPACE 7 /* Space characters */
#define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */
#define CC_QUOTE2 9 /* '['. [...] style quoted ids */
#define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */
#define CC_MINUS 11 /* '-'. Minus or SQL-style comment */
#define CC_LT 12 /* '<'. Part of < or <= or <> */
#define CC_GT 13 /* '>'. Part of > or >= */
#define CC_EQ 14 /* '='. Part of = or == */
#define CC_BANG 15 /* '!'. Part of != */
#define CC_SLASH 16 /* '/'. / or c-style comment */
#define CC_LP 17 /* '(' */
#define CC_RP 18 /* ')' */
#define CC_SEMI 19 /* ';' */
#define CC_PLUS 20 /* '+' */
#define CC_STAR 21 /* '*' */
#define CC_PERCENT 22 /* '%' */
#define CC_COMMA 23 /* ',' */
#define CC_AND 24 /* '&' */
#define CC_TILDA 25 /* '~' */
#define CC_DOT 26 /* '.' */
#define CC_ID 27 /* unicode characters usable in IDs */
#define CC_NUL 29 /* 0x00 */
#define CC_BOM 30 /* First byte of UTF8 BOM: 0xEF 0xBB 0xBF */
// clang-format off
// Maps each possible byte value to its CC_* character class. Rows are the high
// nibble of the byte and columns the low nibble, so the class of byte 0xRC is
// found in row "Rx", column "xC". Entries are written as raw numbers; see the
// CC_* defines for their meaning (28 has no named define and is handled by the
// `default` case in |GetSqliteToken|).
static const unsigned char aiClass[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
/* 0x */ 29, 28, 28, 28, 28, 28, 28, 28, 28, 7, 7, 28, 7, 7, 28, 28,
/* 1x */ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
/* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
/* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
/* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 9, 28, 28, 28, 2,
/* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 28, 10, 28, 25, 28,
/* 8x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* 9x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ax */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Cx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Dx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
/* Ex */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 30,
/* Fx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27
};
// clang-format on
// Evaluates to true when byte |C| can be part of an identifier: per
// |sqlite3CtypeMap| that is ASCII letters (0x02), decimal digits (0x04), and
// '$', '_' or any byte >= 0x80 (0x40). The argument is cast to unsigned char
// before indexing, so both `int` and `unsigned char` values are accepted.
#define IdChar(C) ((sqlite3CtypeMap[static_cast<unsigned char>(C)] & 0x46) != 0)
// Copy of |sqlite3GetToken| for use by the PerfettoSql transpiler.
//
// We copy this function because |sqlite3GetToken| is static to sqlite3.c
// in most distributions of SQLite so we cannot call it from our code.
//
// While we could redefine SQLITE_PRIVATE, pragmatically that will not fly in
// all the places we build trace processor so we need to resort to making a
// copy.
//
// Scans the single token which starts at |z|.
//
// |z| must point at a NUL-terminated byte sequence: every scanning loop below
// relies on hitting the terminating 0 byte to stop.
//
// On return |*tokenType| holds the type of the token and the return value is
// the length of the token in bytes. The only case which returns 0 is when |z|
// points directly at the NUL terminator (reported as TK_ILLEGAL).
int GetSqliteToken(const unsigned char* z, SqliteTokenType* tokenType) {
  int i, c;
  switch (aiClass[*z]) { /* Switch on the character-class of the first byte
                         ** of the token. See the comment on the CC_ defines
                         ** above. */
    case CC_SPACE: {
      // A run of whitespace is a single TK_SPACE token.
      for (i = 1; isspace(z[i]); i++) {
      }
      *tokenType = SqliteTokenType::TK_SPACE;
      return i;
    }
    case CC_MINUS: {
      if (z[1] == '-') {
        // "--" comment: runs up to (but not including) the newline or NUL.
        for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) {
        }
        *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
        return i;
      } else if (z[1] == '>') {
        // "->" or "->>".
        *tokenType = SqliteTokenType::TK_PTR;
        return 2 + (z[2] == '>');
      }
      *tokenType = SqliteTokenType::TK_MINUS;
      return 1;
    }
    case CC_LP: {
      *tokenType = SqliteTokenType::TK_LP;
      return 1;
    }
    case CC_RP: {
      *tokenType = SqliteTokenType::TK_RP;
      return 1;
    }
    case CC_SEMI: {
      *tokenType = SqliteTokenType::TK_SEMI;
      return 1;
    }
    case CC_PLUS: {
      *tokenType = SqliteTokenType::TK_PLUS;
      return 1;
    }
    case CC_STAR: {
      *tokenType = SqliteTokenType::TK_STAR;
      return 1;
    }
    case CC_SLASH: {
      if (z[1] != '*' || z[2] == 0) {
        *tokenType = SqliteTokenType::TK_SLASH;
        return 1;
      }
      // C-style comment: |c| trails one byte behind z[i] so the loop stops
      // when the pair (c, z[i]) is "*/" or when the input ends.
      for (i = 3, c = z[2]; (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++) {
      }
      // Only step over the closing '/' if the comment was terminated.
      if (c)
        i++;
      *tokenType = SqliteTokenType::TK_SPACE; /* IMP: R-22934-25134 */
      return i;
    }
    case CC_PERCENT: {
      *tokenType = SqliteTokenType::TK_REM;
      return 1;
    }
    case CC_EQ: {
      // "=" and "==" are the same token.
      *tokenType = SqliteTokenType::TK_EQ;
      return 1 + (z[1] == '=');
    }
    case CC_LT: {
      if ((c = z[1]) == '=') {
        *tokenType = SqliteTokenType::TK_LE;
        return 2;
      } else if (c == '>') {
        *tokenType = SqliteTokenType::TK_NE;
        return 2;
      } else if (c == '<') {
        *tokenType = SqliteTokenType::TK_LSHIFT;
        return 2;
      } else {
        *tokenType = SqliteTokenType::TK_LT;
        return 1;
      }
    }
    case CC_GT: {
      if ((c = z[1]) == '=') {
        *tokenType = SqliteTokenType::TK_GE;
        return 2;
      } else if (c == '>') {
        *tokenType = SqliteTokenType::TK_RSHIFT;
        return 2;
      } else {
        *tokenType = SqliteTokenType::TK_GT;
        return 1;
      }
    }
    case CC_BANG: {
      // A lone '!' is not valid; only "!=" is.
      if (z[1] != '=') {
        *tokenType = SqliteTokenType::TK_ILLEGAL;
        return 1;
      } else {
        *tokenType = SqliteTokenType::TK_NE;
        return 2;
      }
    }
    case CC_PIPE: {
      if (z[1] != '|') {
        *tokenType = SqliteTokenType::TK_BITOR;
        return 1;
      } else {
        *tokenType = SqliteTokenType::TK_CONCAT;
        return 2;
      }
    }
    case CC_COMMA: {
      *tokenType = SqliteTokenType::TK_COMMA;
      return 1;
    }
    case CC_AND: {
      *tokenType = SqliteTokenType::TK_BITAND;
      return 1;
    }
    case CC_TILDA: {
      *tokenType = SqliteTokenType::TK_BITNOT;
      return 1;
    }
    case CC_QUOTE: {
      // Quoted string or identifier. A doubled delimiter inside the quotes is
      // an escaped delimiter and does not end the token.
      int delim = z[0];
      for (i = 1; (c = z[i]) != 0; i++) {
        if (c == delim) {
          if (z[i + 1] == delim) {
            i++;
          } else {
            break;
          }
        }
      }
      if (c == '\'') {
        *tokenType = SqliteTokenType::TK_STRING;
        return i + 1;
      } else if (c != 0) {
        *tokenType = SqliteTokenType::TK_ID;
        return i + 1;
      } else {
        // Hit the end of input before the closing delimiter.
        *tokenType = SqliteTokenType::TK_ILLEGAL;
        return i;
      }
    }
    case CC_DOT: {
      if (!isdigit(z[1])) {
        *tokenType = SqliteTokenType::TK_DOT;
        return 1;
      }
      // ".<digit>" is the start of a floating point literal: handle it as a
      // number.
      [[fallthrough]];
    }
    case CC_DIGIT: {
      *tokenType = SqliteTokenType::TK_INTEGER;
      // Hexadecimal integer literal: "0x" followed by at least one hex digit.
      if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') && isxdigit(z[2])) {
        for (i = 3; isxdigit(z[i]); i++) {
        }
        return i;
      }
      // Decimal literal. This must use isdigit(), as |sqlite3GetToken| does:
      // scanning with isxdigit() would swallow the letters a-f/A-F, so "1e5"
      // would be a TK_INTEGER (the exponent check below would never see the
      // 'e') and "12abc" would be accepted as a number instead of being
      // flagged TK_ILLEGAL by the trailing IdChar loop.
      for (i = 0; isdigit(z[i]); i++) {
      }
      if (z[i] == '.') {
        i++;
        while (isdigit(z[i])) {
          i++;
        }
        *tokenType = SqliteTokenType::TK_FLOAT;
      }
      // Optional exponent: 'e'/'E', an optional sign, and at least one digit.
      if ((z[i] == 'e' || z[i] == 'E') &&
          (isdigit(z[i + 1]) ||
           ((z[i + 1] == '+' || z[i + 1] == '-') && isdigit(z[i + 2])))) {
        i += 2;
        while (isdigit(z[i])) {
          i++;
        }
        *tokenType = SqliteTokenType::TK_FLOAT;
      }
      // Identifier characters glued onto a number make the whole run illegal.
      while (IdChar(z[i])) {
        *tokenType = SqliteTokenType::TK_ILLEGAL;
        i++;
      }
      return i;
    }
    case CC_QUOTE2: {
      // [bracketed identifier]: scan through the closing ']' (inclusive).
      for (i = 1, c = z[0]; c != ']' && (c = z[i]) != 0; i++) {
      }
      *tokenType =
          c == ']' ? SqliteTokenType::TK_ID : SqliteTokenType::TK_ILLEGAL;
      return i;
    }
    case CC_VARNUM: {
      // '?' followed by optional digits.
      *tokenType = SqliteTokenType::TK_VARIABLE;
      for (i = 1; isdigit(z[i]); i++) {
      }
      return i;
    }
    case CC_DOLLAR:
    case CC_VARALPHA: {
      // Named variable. |n| counts the identifier characters seen; a variable
      // with no name characters is illegal.
      int n = 0;
      *tokenType = SqliteTokenType::TK_VARIABLE;
      for (i = 1; (c = z[i]) != 0; i++) {
        if (IdChar(c)) {
          n++;
        } else if (c == '(' && n > 0) {
          // Parenthesised suffix: must be closed by ')' before any whitespace
          // or the end of input.
          do {
            i++;
          } while ((c = z[i]) != 0 && !isspace(c) && c != ')');
          if (c == ')') {
            i++;
          } else {
            *tokenType = SqliteTokenType::TK_ILLEGAL;
          }
          break;
        } else if (c == ':' && z[i + 1] == ':') {
          // "::" is allowed inside the name; skip both colons.
          i++;
        } else {
          break;
        }
      }
      if (n == 0)
        *tokenType = SqliteTokenType::TK_ILLEGAL;
      return i;
    }
    case CC_KYWD0: {
      for (i = 1; aiClass[z[i]] <= CC_KYWD; i++) {
      }
      if (IdChar(z[i])) {
        /* This token started out using characters that can appear in keywords,
        ** but z[i] is a character not allowed within keywords, so this must
        ** be an identifier instead */
        i++;
        break;
      }
      // All keywords are collapsed into a single generic token type.
      if (sqlite3_keyword_check(reinterpret_cast<const char*>(z), i)) {
        *tokenType = SqliteTokenType::TK_GENERIC_KEYWORD;
      } else {
        *tokenType = SqliteTokenType::TK_ID;
      }
      return i;
    }
    case CC_X: {
      if (z[1] == '\'') {
        // Blob literal x'...': the body must be an even number of hex digits
        // (so the index of the closing quote must be even).
        *tokenType = SqliteTokenType::TK_BLOB;
        for (i = 2; isxdigit(z[i]); i++) {
        }
        if (z[i] != '\'' || i % 2) {
          *tokenType = SqliteTokenType::TK_ILLEGAL;
          while (z[i] && z[i] != '\'') {
            i++;
          }
        }
        if (z[i])
          i++;
        return i;
      }
      // Not a blob literal, so treat it as the start of an identifier.
      [[fallthrough]];
    }
    case CC_KYWD:
    case CC_ID: {
      i = 1;
      break;
    }
    case CC_BOM: {
      // A full UTF-8 byte order mark is treated as whitespace; otherwise the
      // byte is just the first byte of an identifier.
      if (z[1] == 0xbb && z[2] == 0xbf) {
        *tokenType = SqliteTokenType::TK_SPACE;
        return 3;
      }
      i = 1;
      break;
    }
    case CC_NUL: {
      *tokenType = SqliteTokenType::TK_ILLEGAL;
      return 0;
    }
    default: {
      *tokenType = SqliteTokenType::TK_ILLEGAL;
      return 1;
    }
  }
  // Reached via `break` from the cases above: the token is an identifier whose
  // first |i| bytes are already consumed. Consume the rest of it.
  while (IdChar(z[i])) {
    i++;
  }
  *tokenType = SqliteTokenType::TK_ID;
  return i;
}
} // namespace
// Creates a tokenizer over |sql|, which is moved into |source_|.
SqliteTokenizer::SqliteTokenizer(SqlSource sql) : source_(std::move(sql)) {}
// Scans the token located at the current offset of the source and advances the
// offset past it. The returned token's |str| is a view into the source text.
SqliteTokenizer::Token SqliteTokenizer::Next() {
  const char* const begin = source_.sql().data() + offset_;
  Token result;
  const uint32_t length = static_cast<uint32_t>(GetSqliteToken(
      reinterpret_cast<const unsigned char*>(begin), &result.token_type));
  result.str = std::string_view(begin, length);
  offset_ += length;
  return result;
}
// Returns the next token whose type is not TK_SPACE, discarding any TK_SPACE
// tokens that precede it.
SqliteTokenizer::Token SqliteTokenizer::NextNonWhitespace() {
  Token candidate = Next();
  while (candidate.token_type == SqliteTokenType::TK_SPACE) {
    candidate = Next();
  }
  return candidate;
}
// Consumes tokens until one reports IsTerminal() and returns that token.
SqliteTokenizer::Token SqliteTokenizer::NextTerminal() {
  for (Token current = Next();; current = Next()) {
    if (current.IsTerminal()) {
      return current;
    }
  }
}
// Returns the part of the source which begins at the first byte of |start| and
// runs up to |end|; the text of |end| itself is included only when |end_token|
// is kInclusive.
SqlSource SqliteTokenizer::Substr(const Token& start,
                                  const Token& end,
                                  EndToken end_token) const {
  const char* const first = start.str.data();
  const char* last = end.str.data();
  if (end_token == SqliteTokenizer::EndToken::kInclusive) {
    last += end.str.size();
  }
  const uint32_t begin_offset =
      static_cast<uint32_t>(first - source_.sql().c_str());
  const uint32_t length = static_cast<uint32_t>(last - first);
  return source_.Substr(begin_offset, length);
}
// Returns the part of the source covered by exactly |token|.
SqlSource SqliteTokenizer::SubstrToken(const Token& token) const {
  const char* const base = source_.sql().c_str();
  const uint32_t begin_offset = static_cast<uint32_t>(token.str.data() - base);
  const uint32_t length = static_cast<uint32_t>(token.str.size());
  return source_.Substr(begin_offset, length);
}
// Produces the source's traceback for the position at which |token| begins.
// |token| must point into this tokenizer's source text (the one-past-the-end
// position is accepted); this is enforced with PERFETTO_CHECK.
std::string SqliteTokenizer::AsTraceback(const Token& token) const {
  PERFETTO_CHECK(source_.sql().c_str() <= token.str.data());
  PERFETTO_CHECK(token.str.data() <=
                 source_.sql().c_str() + source_.sql().size());
  const char* const base = source_.sql().c_str();
  return source_.AsTraceback(static_cast<uint32_t>(token.str.data() - base));
}
// Registers with |rewriter| a replacement of the source range which begins at
// the first byte of |start| and runs up to |end| with |rewrite|. The text of
// |end| itself is part of the replaced range only when |end_token| is
// kInclusive.
void SqliteTokenizer::Rewrite(SqlSource::Rewriter& rewriter,
                              const Token& start,
                              const Token& end,
                              SqlSource rewrite,
                              EndToken end_token) const {
  const char* const base = source_.sql().c_str();
  const uint32_t range_begin = static_cast<uint32_t>(start.str.data() - base);
  uint32_t range_end = static_cast<uint32_t>(end.str.data() - base);
  if (end_token == EndToken::kInclusive) {
    range_end += static_cast<uint32_t>(end.str.size());
  }
  rewriter.Rewrite(range_begin, range_end, std::move(rewrite));
}
// Registers with |rewriter| a replacement of exactly the source range covered
// by |token| with |rewrite|.
void SqliteTokenizer::RewriteToken(SqlSource::Rewriter& rewriter,
                                   const Token& token,
                                   SqlSource rewrite) const {
  const char* const base = source_.sql().c_str();
  const char* const token_begin = token.str.data();
  const char* const token_end = token_begin + token.str.size();
  rewriter.Rewrite(static_cast<uint32_t>(token_begin - base),
                   static_cast<uint32_t>(token_end - base),
                   std::move(rewrite));
}
} // namespace trace_processor
} // namespace perfetto