// blob: e31c3d60593ba089b678b013e14cd7f340fc31e5 [file] [log] [blame]
// Copyright (c) 2020 Pantor. All rights reserved.
#ifndef INCLUDE_INJA_LEXER_HPP_
#define INCLUDE_INJA_LEXER_HPP_
#include <cctype>
#include <locale>
#include "config.hpp"
#include "token.hpp"
#include "utils.hpp"
namespace inja {
/*!
* \brief Class for lexing an inja Template.
*/
class Lexer {
// States of the lexer's state machine; scan() dispatches on the current value.
// Each "*Start" state means an opening sequence has been recognised in the
// text but its open token has not been emitted yet; each "*Body" state means
// the lexer is inside the corresponding construct.
enum class State {
// Plain template text outside of any expression, statement or comment.
Text,
// config.expression_open was matched; next token is ExpressionOpen.
ExpressionStart,
// config.expression_open_force_lstrip was matched; next token is ExpressionOpen.
ExpressionStartForceLstrip,
// Inside an expression; tokens are produced by scan_body().
ExpressionBody,
// config.line_statement was matched at the start of a line; next token is LineStatementOpen.
LineStart,
// Inside a line statement, which is closed by "\n".
LineBody,
// config.statement_open was matched; next token is StatementOpen.
StatementStart,
// config.statement_open_no_lstrip was matched; next token is StatementOpen.
StatementStartNoLstrip,
// config.statement_open_force_lstrip was matched; next token is StatementOpen.
StatementStartForceLstrip,
// Inside a statement; tokens are produced by scan_body().
StatementBody,
// config.comment_open was matched; next token is CommentOpen.
CommentStart,
// Inside a comment; scan() searches for config.comment_close.
CommentBody,
};
// Decides how scan_body() interprets a '-' character.
enum class MinusState {
// A '-' is emitted as a Minus operator token.
Operator,
// A '-' starts a number literal (handed to scan_number()).
Number,
};
// Lexer configuration (opening/closing sequences and strip options); held by
// reference, so the referenced object must outlive the lexer.
const LexerConfig &config;
// Current state of the scan() state machine.
State state;
// How the next '-' inside a body is to be interpreted.
MinusState minus_state;
// Input being lexed, as set by start() (a non-owning view).
nonstd::string_view m_in;
// Offset into m_in at which the token currently being scanned begins.
size_t tok_start;
// Offset into m_in one past the last character consumed so far.
size_t pos;
/*!
 * \brief Scans one token inside an expression, statement or line-statement body.
 *
 * Leading spaces, tabs, carriage returns and (unless it is the closing
 * sequence itself) newlines are skipped before the token starts.
 *
 * \param close      closing sequence of the current construct
 * \param closeKind  token kind emitted when a closing sequence is matched
 * \param close_trim optional alternative closing sequence; when it matches, all
 *                   whitespace and newlines following it are skipped as well
 * \param trim       when true, whitespace up to and including the first newline
 *                   is skipped after a match of \p close
 * \return the next token; Eof when the input ends inside the body
 */
Token scan_body(nonstd::string_view close, Token::Kind closeKind, nonstd::string_view close_trim = nonstd::string_view(), bool trim = false) {
  char ch = '\0';
  for (;;) {
    if (tok_start >= m_in.size()) {
      return make_token(Token::Kind::Eof);
    }
    ch = m_in[tok_start];

    // skip blanks; '\n' is handled further down because it may itself be the close
    if (ch == ' ' || ch == '\t' || ch == '\r') {
      tok_start += 1;
      continue;
    }

    const nonstd::string_view rest = m_in.substr(tok_start);

    // the trimming close is tested first so that it wins over the plain close
    if (!close_trim.empty() && inja::string_view::starts_with(rest, close_trim)) {
      state = State::Text;
      pos = tok_start + close_trim.size();
      Token tok = make_token(closeKind);
      skip_whitespaces_and_newlines();
      return tok;
    }

    if (inja::string_view::starts_with(rest, close)) {
      state = State::Text;
      pos = tok_start + close.size();
      Token tok = make_token(closeKind);
      if (trim) {
        skip_whitespaces_and_first_newline();
      }
      return tok;
    }

    // a newline that is not the close is plain whitespace
    if (ch != '\n') {
      break;
    }
    tok_start += 1;
  }

  pos = tok_start + 1;

  // <cctype> classifiers are undefined for negative values, so a plain char
  // (which may hold a non-ASCII byte) must be converted to unsigned char first
  if (std::isalpha(static_cast<unsigned char>(ch))) {
    minus_state = MinusState::Operator;
    return scan_id();
  }

  // remember how a '-' has to be read for this token, then default the next
  // one to "number"; tokens that end an operand switch it back to "operator"
  const MinusState current_minus_state = minus_state;
  if (minus_state == MinusState::Operator) {
    minus_state = MinusState::Number;
  }

  switch (ch) {
  case '+':
    return make_token(Token::Kind::Plus);
  case '-':
    if (current_minus_state == MinusState::Operator) {
      return make_token(Token::Kind::Minus);
    }
    return scan_number();
  case '*':
    return make_token(Token::Kind::Times);
  case '/':
    return make_token(Token::Kind::Slash);
  case '^':
    return make_token(Token::Kind::Power);
  case '%':
    return make_token(Token::Kind::Percent);
  case '.':
    return make_token(Token::Kind::Dot);
  case ',':
    return make_token(Token::Kind::Comma);
  case ':':
    return make_token(Token::Kind::Colon);
  case '(':
    return make_token(Token::Kind::LeftParen);
  case ')':
    minus_state = MinusState::Operator;
    return make_token(Token::Kind::RightParen);
  case '[':
    return make_token(Token::Kind::LeftBracket);
  case ']':
    minus_state = MinusState::Operator;
    return make_token(Token::Kind::RightBracket);
  case '{':
    return make_token(Token::Kind::LeftBrace);
  case '}':
    minus_state = MinusState::Operator;
    return make_token(Token::Kind::RightBrace);
  case '>':
    if (pos < m_in.size() && m_in[pos] == '=') {
      pos += 1;
      return make_token(Token::Kind::GreaterEqual);
    }
    return make_token(Token::Kind::GreaterThan);
  case '<':
    if (pos < m_in.size() && m_in[pos] == '=') {
      pos += 1;
      return make_token(Token::Kind::LessEqual);
    }
    return make_token(Token::Kind::LessThan);
  case '=':
    // a single '=' is not a valid token, only "=="
    if (pos < m_in.size() && m_in[pos] == '=') {
      pos += 1;
      return make_token(Token::Kind::Equal);
    }
    return make_token(Token::Kind::Unknown);
  case '!':
    // a single '!' is not a valid token, only "!="
    if (pos < m_in.size() && m_in[pos] == '=') {
      pos += 1;
      return make_token(Token::Kind::NotEqual);
    }
    return make_token(Token::Kind::Unknown);
  case '\"':
    return scan_string();
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
    minus_state = MinusState::Operator;
    return scan_number();
  case '_':
  case '@':
  case '$':
    minus_state = MinusState::Operator;
    return scan_id();
  default:
    return make_token(Token::Kind::Unknown);
  }
}
/*!
 * \brief Consumes the remainder of an identifier and returns it as an Id token.
 *
 * The first character has already been consumed by the caller. Alphanumeric
 * characters as well as '.', '/', '_' and '-' continue the identifier.
 */
Token scan_id() {
  while (pos < m_in.size()) {
    const char ch = m_in[pos];
    // <cctype> classifiers are undefined for negative values, so convert the
    // (possibly non-ASCII) char to unsigned char before classifying it
    const bool is_id_char = std::isalnum(static_cast<unsigned char>(ch)) || ch == '.' || ch == '/' || ch == '_' || ch == '-';
    if (!is_id_char) {
      break;
    }
    pos += 1;
  }
  return make_token(Token::Kind::Id);
}
/*!
 * \brief Consumes the remainder of a number literal and returns it as a Number token.
 *
 * The first character has already been consumed by the caller. The lexer is
 * deliberately permissive (digits, '.', 'e', 'E', '+' and '-' in any order);
 * malformed numbers are caught when the text is converted.
 */
Token scan_number() {
  while (pos < m_in.size()) {
    const char ch = m_in[pos];
    // <cctype> classifiers are undefined for negative values, so convert the
    // (possibly non-ASCII) char to unsigned char before classifying it
    const bool is_number_char = std::isdigit(static_cast<unsigned char>(ch)) || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-';
    if (!is_number_char) {
      break;
    }
    pos += 1;
  }
  return make_token(Token::Kind::Number);
}
/*!
 * \brief Consumes a quoted string literal and returns it as a String token.
 *
 * The opening quote sits at tok_start and has already been consumed. The token
 * ends after the first unescaped occurrence of that same quote character, or
 * at the end of the input if the string is not terminated.
 */
Token scan_string() {
  const char quote = m_in[tok_start];
  bool escape {false};
  while (pos < m_in.size()) {
    const char ch = m_in[pos++];
    if (ch == '\\') {
      // toggle instead of set: the second backslash of "\\" is itself escaped
      // and must not escape the character that follows it
      escape = !escape;
    } else if (!escape && ch == quote) {
      break;
    } else {
      escape = false;
    }
  }
  return make_token(Token::Kind::String);
}
/*!
 * \brief Builds a token of the given kind whose text is the input slice [tok_start, pos).
 */
Token make_token(Token::Kind kind) const {
  const auto lexeme = string_view::slice(m_in, tok_start, pos);
  return Token(kind, lexeme);
}
/*!
 * \brief Advances pos past every following space, tab, newline and carriage return.
 */
void skip_whitespaces_and_newlines() {
  while (pos < m_in.size()) {
    switch (m_in[pos]) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      pos += 1;
      continue;
    default:
      return;
    }
  }
}
/*!
 * \brief Advances pos past following spaces and tabs and then past at most one line break.
 *
 * A line break is "\n", "\r\n" or a lone "\r".
 */
void skip_whitespaces_and_first_newline() {
  const size_t length = m_in.size();

  // horizontal whitespace first
  while (pos < length && (m_in[pos] == ' ' || m_in[pos] == '\t')) {
    ++pos;
  }
  if (pos >= length) {
    return;
  }

  // then a single line break, if there is one
  const char first = m_in[pos];
  if (first == '\n') {
    ++pos;
    return;
  }
  if (first == '\r') {
    ++pos;
    if (pos < length && m_in[pos] == '\n') {
      ++pos;
    }
  }
}
/*!
 * \brief Strips the last line of \p text if that line consists only of spaces and tabs.
 *
 * Walks backwards over trailing spaces and tabs. If a line break (or the start
 * of the text) is reached, the text without those trailing blanks is returned;
 * the line break itself is kept. If any other character is met first, the
 * text is returned unchanged.
 */
static nonstd::string_view clear_final_line_if_whitespace(nonstd::string_view text) {
  size_t keep = text.size();
  while (keep > 0) {
    const char last = text[keep - 1];
    if (last == '\n' || last == '\r') {
      break;
    }
    if (last != ' ' && last != '\t') {
      // the final line carries real content: leave everything as it is
      return text;
    }
    keep -= 1;
  }
  return text.substr(0, keep);
}
public:
/*!
 * \brief Creates a lexer that reads its delimiters and strip options from \p config.
 *
 * Only a reference to \p config is stored, so it must outlive the lexer. The
 * offsets are zero-initialized and the input is empty, so calling scan() or
 * current_position() before start() is well defined instead of reading
 * indeterminate values.
 */
explicit Lexer(const LexerConfig &config) : config(config), state(State::Text), minus_state(MinusState::Number), m_in(), tok_start(0), pos(0) {}
/*!
 * \brief Returns the source location of the start of the current token.
 *
 * The location is computed by get_source_location() from the stored input and
 * tok_start.
 */
SourceLocation current_position() const {
  const SourceLocation location = get_source_location(m_in, tok_start);
  return location;
}
/*!
 * \brief Resets the lexer and makes \p input the text to be lexed.
 *
 * A leading UTF-8 byte order mark is dropped from the input, so all offsets
 * are relative to the first character after it.
 */
void start(nonstd::string_view input) {
  state = State::Text;
  minus_state = MinusState::Number;
  tok_start = 0;
  pos = 0;

  // Consume byte order mark (BOM) for UTF-8
  const bool has_bom = inja::string_view::starts_with(input, "\xEF\xBB\xBF");
  m_in = has_bom ? input.substr(3) : input;
}
/*!
 * \brief Returns the next token of the input.
 *
 * Dispatches on the current state: in Text state it looks for the next opening
 * sequence and returns the text in front of it; in a "*Start" state it emits
 * the matching open token; in a "*Body" state it scans the inside of the
 * construct. Returns an Eof token once the input is exhausted.
 */
Token scan() {
// the new token begins where the previous one ended
tok_start = pos;
again:
if (tok_start >= m_in.size()) {
return make_token(Token::Kind::Eof);
}
switch (state) {
default:
case State::Text: {
// fast-scan to first open character
size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
if (open_start == nonstd::string_view::npos) {
// didn't find open, return remaining text as text token
pos = m_in.size();
return make_token(Token::Kind::Text);
}
pos += open_start;
// try to match one of the opening sequences, and get the close
nonstd::string_view open_str = m_in.substr(pos);
// whether blanks on the final line of the preceding text are to be removed
bool must_lstrip = false;
// the general opener is tested first, then its more specific variants
if (inja::string_view::starts_with(open_str, config.expression_open)) {
if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
state = State::ExpressionStartForceLstrip;
must_lstrip = true;
} else {
state = State::ExpressionStart;
}
} else if (inja::string_view::starts_with(open_str, config.statement_open)) {
if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
state = State::StatementStartNoLstrip;
} else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip )) {
state = State::StatementStartForceLstrip;
must_lstrip = true;
} else {
state = State::StatementStart;
must_lstrip = config.lstrip_blocks;
}
} else if (inja::string_view::starts_with(open_str, config.comment_open)) {
state = State::CommentStart;
must_lstrip = config.lstrip_blocks;
// a line statement is only recognised at the start of the input or right after '\n'
} else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
state = State::LineStart;
} else {
pos += 1; // wasn't actually an opening sequence
goto again;
}
// everything between the token start and the opening sequence is text
nonstd::string_view text = string_view::slice(m_in, tok_start, pos);
if (must_lstrip) {
text = clear_final_line_if_whitespace(text);
}
if (text.empty()) {
// state already names the opener, so the jump emits its open token directly
goto again; // don't generate empty token
}
return Token(Token::Kind::Text, text);
}
// the "*Start" states consume the opening sequence and enter the body state
case State::ExpressionStart: {
state = State::ExpressionBody;
pos += config.expression_open.size();
return make_token(Token::Kind::ExpressionOpen);
}
case State::ExpressionStartForceLstrip: {
state = State::ExpressionBody;
pos += config.expression_open_force_lstrip.size();
return make_token(Token::Kind::ExpressionOpen);
}
case State::LineStart: {
state = State::LineBody;
pos += config.line_statement.size();
return make_token(Token::Kind::LineStatementOpen);
}
case State::StatementStart: {
state = State::StatementBody;
pos += config.statement_open.size();
return make_token(Token::Kind::StatementOpen);
}
case State::StatementStartNoLstrip: {
state = State::StatementBody;
pos += config.statement_open_no_lstrip.size();
return make_token(Token::Kind::StatementOpen);
}
case State::StatementStartForceLstrip: {
state = State::StatementBody;
pos += config.statement_open_force_lstrip.size();
return make_token(Token::Kind::StatementOpen);
}
case State::CommentStart: {
state = State::CommentBody;
pos += config.comment_open.size();
return make_token(Token::Kind::CommentOpen);
}
// the body states delegate to scan_body() with the matching closing sequences
case State::ExpressionBody:
return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
case State::LineBody:
// a line statement is closed by the end of the line
return scan_body("\n", Token::Kind::LineStatementClose);
case State::StatementBody:
return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
case State::CommentBody: {
// fast-scan to comment close
size_t end = m_in.substr(pos).find(config.comment_close);
if (end == nonstd::string_view::npos) {
// unterminated comment: consume the rest of the input and report Eof
pos = m_in.size();
return make_token(Token::Kind::Eof);
}
// return the entire comment in the close token
state = State::Text;
pos += end + config.comment_close.size();
Token tok = make_token(Token::Kind::CommentClose);
if (config.trim_blocks) {
skip_whitespaces_and_first_newline();
}
return tok;
}
}
}
/*!
 * \brief Gives read-only access to the configuration this lexer was created with.
 */
const LexerConfig &get_config() const {
  return this->config;
}
};
} // namespace inja
#endif // INCLUDE_INJA_LEXER_HPP_