| // Copyright (c) 2019 Pantor. All rights reserved. |
| |
| #ifndef INCLUDE_INJA_LEXER_HPP_ |
| #define INCLUDE_INJA_LEXER_HPP_ |
| |
| #include <cctype> |
| #include <locale> |
| |
| #include "config.hpp" |
| #include "token.hpp" |
| #include "utils.hpp" |
| |
| |
| namespace inja { |
| |
| /*! |
| * \brief Class for lexing an inja Template. |
| */ |
| class Lexer { |
| enum class State { |
| Text, |
| ExpressionStart, |
| ExpressionBody, |
| LineStart, |
| LineBody, |
| StatementStart, |
| StatementBody, |
| CommentStart, |
| CommentBody |
| } m_state; |
| |
| const LexerConfig& m_config; |
| nonstd::string_view m_in; |
| size_t m_tok_start; |
| size_t m_pos; |
| |
| public: |
| explicit Lexer(const LexerConfig& config) : m_config(config) {} |
| |
| void start(nonstd::string_view in) { |
| m_in = in; |
| m_tok_start = 0; |
| m_pos = 0; |
| m_state = State::Text; |
| } |
| |
| Token scan() { |
| m_tok_start = m_pos; |
| |
| again: |
| if (m_tok_start >= m_in.size()) return make_token(Token::Kind::Eof); |
| |
| switch (m_state) { |
| default: |
| case State::Text: { |
| // fast-scan to first open character |
| size_t open_start = m_in.substr(m_pos).find_first_of(m_config.open_chars); |
| if (open_start == nonstd::string_view::npos) { |
| // didn't find open, return remaining text as text token |
| m_pos = m_in.size(); |
| return make_token(Token::Kind::Text); |
| } |
| m_pos += open_start; |
| |
| // try to match one of the opening sequences, and get the close |
| nonstd::string_view open_str = m_in.substr(m_pos); |
| bool must_lstrip = false; |
| if (inja::string_view::starts_with(open_str, m_config.expression_open)) { |
| m_state = State::ExpressionStart; |
| } else if (inja::string_view::starts_with(open_str, m_config.statement_open)) { |
| m_state = State::StatementStart; |
| must_lstrip = m_config.lstrip_blocks; |
| } else if (inja::string_view::starts_with(open_str, m_config.comment_open)) { |
| m_state = State::CommentStart; |
| must_lstrip = m_config.lstrip_blocks; |
| } else if ((m_pos == 0 || m_in[m_pos - 1] == '\n') && |
| inja::string_view::starts_with(open_str, m_config.line_statement)) { |
| m_state = State::LineStart; |
| } else { |
| m_pos += 1; // wasn't actually an opening sequence |
| goto again; |
| } |
| |
| nonstd::string_view text = string_view::slice(m_in, m_tok_start, m_pos); |
| if (must_lstrip) |
| text = clear_final_line_if_whitespace(text); |
| |
| if (text.empty()) goto again; // don't generate empty token |
| return Token(Token::Kind::Text, text); |
| } |
| case State::ExpressionStart: { |
| m_state = State::ExpressionBody; |
| m_pos += m_config.expression_open.size(); |
| return make_token(Token::Kind::ExpressionOpen); |
| } |
| case State::LineStart: { |
| m_state = State::LineBody; |
| m_pos += m_config.line_statement.size(); |
| return make_token(Token::Kind::LineStatementOpen); |
| } |
| case State::StatementStart: { |
| m_state = State::StatementBody; |
| m_pos += m_config.statement_open.size(); |
| return make_token(Token::Kind::StatementOpen); |
| } |
| case State::CommentStart: { |
| m_state = State::CommentBody; |
| m_pos += m_config.comment_open.size(); |
| return make_token(Token::Kind::CommentOpen); |
| } |
| case State::ExpressionBody: |
| return scan_body(m_config.expression_close, Token::Kind::ExpressionClose); |
| case State::LineBody: |
| return scan_body("\n", Token::Kind::LineStatementClose); |
| case State::StatementBody: |
| return scan_body(m_config.statement_close, Token::Kind::StatementClose, m_config.trim_blocks); |
| case State::CommentBody: { |
| // fast-scan to comment close |
| size_t end = m_in.substr(m_pos).find(m_config.comment_close); |
| if (end == nonstd::string_view::npos) { |
| m_pos = m_in.size(); |
| return make_token(Token::Kind::Eof); |
| } |
| // return the entire comment in the close token |
| m_state = State::Text; |
| m_pos += end + m_config.comment_close.size(); |
| Token tok = make_token(Token::Kind::CommentClose); |
| if (m_config.trim_blocks) |
| skip_newline(); |
| return tok; |
| } |
| } |
| } |
| |
| const LexerConfig& get_config() const { return m_config; } |
| |
| private: |
| Token scan_body(nonstd::string_view close, Token::Kind closeKind, bool trim = false) { |
| again: |
| // skip whitespace (except for \n as it might be a close) |
| if (m_tok_start >= m_in.size()) return make_token(Token::Kind::Eof); |
| char ch = m_in[m_tok_start]; |
| if (ch == ' ' || ch == '\t' || ch == '\r') { |
| m_tok_start += 1; |
| goto again; |
| } |
| |
| // check for close |
| if (inja::string_view::starts_with(m_in.substr(m_tok_start), close)) { |
| m_state = State::Text; |
| m_pos = m_tok_start + close.size(); |
| Token tok = make_token(closeKind); |
| if (trim) |
| skip_newline(); |
| return tok; |
| } |
| |
| // skip \n |
| if (ch == '\n') { |
| m_tok_start += 1; |
| goto again; |
| } |
| |
| m_pos = m_tok_start + 1; |
| if (std::isalpha(ch)) return scan_id(); |
| switch (ch) { |
| case ',': |
| return make_token(Token::Kind::Comma); |
| case ':': |
| return make_token(Token::Kind::Colon); |
| case '(': |
| return make_token(Token::Kind::LeftParen); |
| case ')': |
| return make_token(Token::Kind::RightParen); |
| case '[': |
| return make_token(Token::Kind::LeftBracket); |
| case ']': |
| return make_token(Token::Kind::RightBracket); |
| case '{': |
| return make_token(Token::Kind::LeftBrace); |
| case '}': |
| return make_token(Token::Kind::RightBrace); |
| case '>': |
| if (m_pos < m_in.size() && m_in[m_pos] == '=') { |
| m_pos += 1; |
| return make_token(Token::Kind::GreaterEqual); |
| } |
| return make_token(Token::Kind::GreaterThan); |
| case '<': |
| if (m_pos < m_in.size() && m_in[m_pos] == '=') { |
| m_pos += 1; |
| return make_token(Token::Kind::LessEqual); |
| } |
| return make_token(Token::Kind::LessThan); |
| case '=': |
| if (m_pos < m_in.size() && m_in[m_pos] == '=') { |
| m_pos += 1; |
| return make_token(Token::Kind::Equal); |
| } |
| return make_token(Token::Kind::Unknown); |
| case '!': |
| if (m_pos < m_in.size() && m_in[m_pos] == '=') { |
| m_pos += 1; |
| return make_token(Token::Kind::NotEqual); |
| } |
| return make_token(Token::Kind::Unknown); |
| case '\"': |
| return scan_string(); |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| case '-': |
| return scan_number(); |
| case '_': |
| return scan_id(); |
| default: |
| return make_token(Token::Kind::Unknown); |
| } |
| } |
| |
| Token scan_id() { |
| for (;;) { |
| if (m_pos >= m_in.size()) { |
| break; |
| } |
| char ch = m_in[m_pos]; |
| if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') { |
| break; |
| } |
| m_pos += 1; |
| } |
| return make_token(Token::Kind::Id); |
| } |
| |
| Token scan_number() { |
| for (;;) { |
| if (m_pos >= m_in.size()) { |
| break; |
| } |
| char ch = m_in[m_pos]; |
| // be very permissive in lexer (we'll catch errors when conversion happens) |
| if (!std::isdigit(ch) && ch != '.' && ch != 'e' && ch != 'E' && ch != '+' && ch != '-') { |
| break; |
| } |
| m_pos += 1; |
| } |
| return make_token(Token::Kind::Number); |
| } |
| |
| Token scan_string() { |
| bool escape {false}; |
| for (;;) { |
| if (m_pos >= m_in.size()) break; |
| char ch = m_in[m_pos++]; |
| if (ch == '\\') { |
| escape = true; |
| } else if (!escape && ch == m_in[m_tok_start]) { |
| break; |
| } else { |
| escape = false; |
| } |
| } |
| return make_token(Token::Kind::String); |
| } |
| |
| Token make_token(Token::Kind kind) const { |
| return Token(kind, string_view::slice(m_in, m_tok_start, m_pos)); |
| } |
| |
| void skip_newline() { |
| if (m_pos < m_in.size()) { |
| char ch = m_in[m_pos]; |
| if (ch == '\n') |
| m_pos += 1; |
| else if (ch == '\r') { |
| m_pos += 1; |
| if (m_pos < m_in.size() && m_in[m_pos] == '\n') |
| m_pos += 1; |
| } |
| } |
| } |
| |
| static nonstd::string_view clear_final_line_if_whitespace(nonstd::string_view text) |
| { |
| nonstd::string_view result = text; |
| while (!result.empty()) { |
| char ch = result.back(); |
| if (ch == ' ' || ch == '\t') |
| result.remove_suffix(1); |
| else if (ch == '\n' || ch == '\r') |
| break; |
| else |
| return text; |
| } |
| return result; |
| } |
| }; |
| |
| } |
| |
| #endif // INCLUDE_INJA_LEXER_HPP_ |