// Vendored from the inja template engine, as distributed with i2pd.
// Mirror of https://github.com/PurpleI2P/i2pd.git
// I2P: End-to-End encrypted and anonymous Internet — https://i2pd.website/
#ifndef INCLUDE_INJA_LEXER_HPP_ |
|
#define INCLUDE_INJA_LEXER_HPP_ |
|
|
|
#include <cctype> |
|
#include <locale> |
|
|
|
#include "config.hpp" |
|
#include "token.hpp" |
|
#include "utils.hpp" |
|
|
|
namespace inja { |
|
|
|
/*! |
|
* \brief Class for lexing an inja Template. |
|
*/ |
|
class Lexer {
  // Lexer state machine: which kind of template region the cursor is inside,
  // and therefore how the next characters must be tokenized. The concrete
  // delimiter strings come from the LexerConfig (config.expression_open etc.).
  enum class State {
    Text,                        // plain template text outside any block
    ExpressionStart,             // matched config.expression_open
    ExpressionStartForceLstrip,  // matched config.expression_open_force_lstrip (strips preceding whitespace)
    ExpressionBody,              // inside an expression; scan_body() emits tokens
    LineStart,                   // matched config.line_statement at start of a line
    LineBody,                    // inside a line statement (closed by '\n')
    StatementStart,              // matched config.statement_open
    StatementStartNoLstrip,      // matched config.statement_open_no_lstrip (suppresses lstrip_blocks)
    StatementStartForceLstrip,   // matched config.statement_open_force_lstrip
    StatementBody,               // inside a statement; scan_body() emits tokens
    CommentStart,                // matched config.comment_open
    CommentStartForceLstrip,     // matched config.comment_open_force_lstrip
    CommentBody,                 // inside a comment; skipped wholesale to the close
  };

  // Disambiguates '-' inside expression/statement bodies: right after an
  // operand (closing paren/bracket/brace, number, id) it is the binary minus
  // operator; otherwise it starts a negative number literal.
  enum class MinusState {
    Operator,
    Number,
  };

  const LexerConfig& config;  // delimiter strings and trim options; owned by the caller

  State state;             // current region of the template
  MinusState minus_state;  // how a following '-' must be interpreted
  std::string_view m_in;   // full template input being lexed (non-owning view)
  size_t tok_start;        // start offset of the token currently being scanned
  size_t pos;              // current read offset (one past the last consumed char)
|
Token scan_body(std::string_view close, Token::Kind closeKind, std::string_view close_trim = std::string_view(), bool trim = false) { |
|
again: |
|
// skip whitespace (except for \n as it might be a close) |
|
if (tok_start >= m_in.size()) { |
|
return make_token(Token::Kind::Eof); |
|
} |
|
const char ch = m_in[tok_start]; |
|
if (ch == ' ' || ch == '\t' || ch == '\r') { |
|
tok_start += 1; |
|
goto again; |
|
} |
|
|
|
// check for close |
|
if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) { |
|
state = State::Text; |
|
pos = tok_start + close_trim.size(); |
|
const Token tok = make_token(closeKind); |
|
skip_whitespaces_and_newlines(); |
|
return tok; |
|
} |
|
|
|
if (inja::string_view::starts_with(m_in.substr(tok_start), close)) { |
|
state = State::Text; |
|
pos = tok_start + close.size(); |
|
const Token tok = make_token(closeKind); |
|
if (trim) { |
|
skip_whitespaces_and_first_newline(); |
|
} |
|
return tok; |
|
} |
|
|
|
// skip \n |
|
if (ch == '\n') { |
|
tok_start += 1; |
|
goto again; |
|
} |
|
|
|
pos = tok_start + 1; |
|
if (std::isalpha(ch)) { |
|
minus_state = MinusState::Operator; |
|
return scan_id(); |
|
} |
|
|
|
const MinusState current_minus_state = minus_state; |
|
if (minus_state == MinusState::Operator) { |
|
minus_state = MinusState::Number; |
|
} |
|
|
|
switch (ch) { |
|
case '+': |
|
return make_token(Token::Kind::Plus); |
|
case '-': |
|
if (current_minus_state == MinusState::Operator) { |
|
return make_token(Token::Kind::Minus); |
|
} |
|
return scan_number(); |
|
case '*': |
|
return make_token(Token::Kind::Times); |
|
case '/': |
|
return make_token(Token::Kind::Slash); |
|
case '^': |
|
return make_token(Token::Kind::Power); |
|
case '%': |
|
return make_token(Token::Kind::Percent); |
|
case '.': |
|
return make_token(Token::Kind::Dot); |
|
case ',': |
|
return make_token(Token::Kind::Comma); |
|
case ':': |
|
return make_token(Token::Kind::Colon); |
|
case '(': |
|
return make_token(Token::Kind::LeftParen); |
|
case ')': |
|
minus_state = MinusState::Operator; |
|
return make_token(Token::Kind::RightParen); |
|
case '[': |
|
return make_token(Token::Kind::LeftBracket); |
|
case ']': |
|
minus_state = MinusState::Operator; |
|
return make_token(Token::Kind::RightBracket); |
|
case '{': |
|
return make_token(Token::Kind::LeftBrace); |
|
case '}': |
|
minus_state = MinusState::Operator; |
|
return make_token(Token::Kind::RightBrace); |
|
case '>': |
|
if (pos < m_in.size() && m_in[pos] == '=') { |
|
pos += 1; |
|
return make_token(Token::Kind::GreaterEqual); |
|
} |
|
return make_token(Token::Kind::GreaterThan); |
|
case '<': |
|
if (pos < m_in.size() && m_in[pos] == '=') { |
|
pos += 1; |
|
return make_token(Token::Kind::LessEqual); |
|
} |
|
return make_token(Token::Kind::LessThan); |
|
case '=': |
|
if (pos < m_in.size() && m_in[pos] == '=') { |
|
pos += 1; |
|
return make_token(Token::Kind::Equal); |
|
} |
|
return make_token(Token::Kind::Unknown); |
|
case '!': |
|
if (pos < m_in.size() && m_in[pos] == '=') { |
|
pos += 1; |
|
return make_token(Token::Kind::NotEqual); |
|
} |
|
return make_token(Token::Kind::Unknown); |
|
case '\"': |
|
return scan_string(); |
|
case '0': |
|
case '1': |
|
case '2': |
|
case '3': |
|
case '4': |
|
case '5': |
|
case '6': |
|
case '7': |
|
case '8': |
|
case '9': |
|
minus_state = MinusState::Operator; |
|
return scan_number(); |
|
case '_': |
|
case '@': |
|
case '$': |
|
minus_state = MinusState::Operator; |
|
return scan_id(); |
|
default: |
|
return make_token(Token::Kind::Unknown); |
|
} |
|
} |
|
|
|
Token scan_id() { |
|
for (;;) { |
|
if (pos >= m_in.size()) { |
|
break; |
|
} |
|
const char ch = m_in[pos]; |
|
if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') { |
|
break; |
|
} |
|
pos += 1; |
|
} |
|
return make_token(Token::Kind::Id); |
|
} |
|
|
|
Token scan_number() { |
|
for (;;) { |
|
if (pos >= m_in.size()) { |
|
break; |
|
} |
|
const char ch = m_in[pos]; |
|
// be very permissive in lexer (we'll catch errors when conversion happens) |
|
if (!(std::isdigit(ch) || ch == '.' || ch == 'e' || ch == 'E' || (ch == '+' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')) || (ch == '-' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')))) { |
|
break; |
|
} |
|
pos += 1; |
|
} |
|
return make_token(Token::Kind::Number); |
|
} |
|
|
|
Token scan_string() { |
|
bool escape {false}; |
|
for (;;) { |
|
if (pos >= m_in.size()) { |
|
break; |
|
} |
|
const char ch = m_in[pos++]; |
|
if (ch == '\\') { |
|
escape = true; |
|
} else if (!escape && ch == m_in[tok_start]) { |
|
break; |
|
} else { |
|
escape = false; |
|
} |
|
} |
|
return make_token(Token::Kind::String); |
|
} |
|
|
|
Token make_token(Token::Kind kind) const { |
|
return Token(kind, string_view::slice(m_in, tok_start, pos)); |
|
} |
|
|
|
void skip_whitespaces_and_newlines() { |
|
if (pos < m_in.size()) { |
|
while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) { |
|
pos += 1; |
|
} |
|
} |
|
} |
|
|
|
void skip_whitespaces_and_first_newline() { |
|
if (pos < m_in.size()) { |
|
while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) { |
|
pos += 1; |
|
} |
|
} |
|
|
|
if (pos < m_in.size()) { |
|
const char ch = m_in[pos]; |
|
if (ch == '\n') { |
|
pos += 1; |
|
} else if (ch == '\r') { |
|
pos += 1; |
|
if (pos < m_in.size() && m_in[pos] == '\n') { |
|
pos += 1; |
|
} |
|
} |
|
} |
|
} |
|
|
|
static std::string_view clear_final_line_if_whitespace(std::string_view text) { |
|
std::string_view result = text; |
|
while (!result.empty()) { |
|
const char ch = result.back(); |
|
if (ch == ' ' || ch == '\t') { |
|
result.remove_suffix(1); |
|
} else if (ch == '\n' || ch == '\r') { |
|
break; |
|
} else { |
|
return text; |
|
} |
|
} |
|
return result; |
|
} |
|
|
|
 public:
  /// Bind the lexer to a configuration (delimiters, trim options).
  /// Call start() with the template input before calling scan().
  explicit Lexer(const LexerConfig& config): config(config), state(State::Text), minus_state(MinusState::Number) {}
|
|
|
  /// Source location (computed from the input and the current token start)
  /// of the token being scanned; used for error reporting.
  SourceLocation current_position() const {
    return get_source_location(m_in, tok_start);
  }
|
|
|
void start(std::string_view input) { |
|
m_in = input; |
|
tok_start = 0; |
|
pos = 0; |
|
state = State::Text; |
|
minus_state = MinusState::Number; |
|
|
|
// Consume byte order mark (BOM) for UTF-8 |
|
if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) { |
|
m_in = m_in.substr(3); |
|
} |
|
} |
|
|
|
  /// Produce the next token from the input; returns Kind::Eof once the
  /// input is exhausted. Drives the State machine: Text is split at block
  /// openers, *Start states emit the opening token, *Body states delegate
  /// to scan_body() (or skip the whole comment).
  Token scan() {
    tok_start = pos;

  again:
    if (tok_start >= m_in.size()) {
      return make_token(Token::Kind::Eof);
    }

    switch (state) {
    default:
    case State::Text: {
      // fast-scan to first open character
      const size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
      if (open_start == std::string_view::npos) {
        // didn't find open, return remaining text as text token
        pos = m_in.size();
        return make_token(Token::Kind::Text);
      }
      pos += open_start;

      // try to match one of the opening sequences, and get the close
      std::string_view open_str = m_in.substr(pos);
      bool must_lstrip = false;
      // NOTE: the force/no-lstrip variants are tested inside the plain-open
      // branches — presumably each variant shares the plain open as a prefix
      // (e.g. "{{" vs "{{-"); verify against LexerConfig.
      if (inja::string_view::starts_with(open_str, config.expression_open)) {
        if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
          state = State::ExpressionStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::ExpressionStart;
        }
      } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
        if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
          state = State::StatementStartNoLstrip;
        } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip)) {
          state = State::StatementStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::StatementStart;
          must_lstrip = config.lstrip_blocks;
        }
      } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
        if (inja::string_view::starts_with(open_str, config.comment_open_force_lstrip)) {
          state = State::CommentStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::CommentStart;
          must_lstrip = config.lstrip_blocks;
        }
      } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
        // line statements only open at the very start of a line
        state = State::LineStart;
      } else {
        pos += 1; // wasn't actually an opening sequence
        goto again;
      }

      // text preceding the opener becomes a Text token, optionally lstripped
      std::string_view text = string_view::slice(m_in, tok_start, pos);
      if (must_lstrip) {
        text = clear_final_line_if_whitespace(text);
      }

      if (text.empty()) {
        goto again; // don't generate empty token
      }
      return Token(Token::Kind::Text, text);
    }
    case State::ExpressionStart: {
      state = State::ExpressionBody;
      pos += config.expression_open.size();
      return make_token(Token::Kind::ExpressionOpen);
    }
    case State::ExpressionStartForceLstrip: {
      state = State::ExpressionBody;
      pos += config.expression_open_force_lstrip.size();
      return make_token(Token::Kind::ExpressionOpen);
    }
    case State::LineStart: {
      state = State::LineBody;
      pos += config.line_statement.size();
      return make_token(Token::Kind::LineStatementOpen);
    }
    case State::StatementStart: {
      state = State::StatementBody;
      pos += config.statement_open.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::StatementStartNoLstrip: {
      state = State::StatementBody;
      pos += config.statement_open_no_lstrip.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::StatementStartForceLstrip: {
      state = State::StatementBody;
      pos += config.statement_open_force_lstrip.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::CommentStart: {
      state = State::CommentBody;
      pos += config.comment_open.size();
      return make_token(Token::Kind::CommentOpen);
    }
    case State::CommentStartForceLstrip: {
      state = State::CommentBody;
      pos += config.comment_open_force_lstrip.size();
      return make_token(Token::Kind::CommentOpen);
    }
    case State::ExpressionBody:
      return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
    case State::LineBody:
      return scan_body("\n", Token::Kind::LineStatementClose);
    case State::StatementBody:
      return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
    case State::CommentBody: {
      // fast-scan to comment close
      const size_t end = m_in.substr(pos).find(config.comment_close);
      if (end == std::string_view::npos) {
        // unterminated comment: consume the rest of the input
        pos = m_in.size();
        return make_token(Token::Kind::Eof);
      }

      // Check for trim pattern
      // (looks one char before the plain close — assumes the force-rstrip
      // close is the plain close preceded by one extra char; verify in config)
      const bool must_rstrip = inja::string_view::starts_with(m_in.substr(pos + end - 1), config.comment_close_force_rstrip);

      // return the entire comment in the close token
      state = State::Text;
      pos += end + config.comment_close.size();
      Token tok = make_token(Token::Kind::CommentClose);

      if (must_rstrip || config.trim_blocks) {
        skip_whitespaces_and_first_newline();
      }
      return tok;
    }
    }
  }
|
|
|
  /// Read-only access to the LexerConfig this lexer was constructed with.
  const LexerConfig& get_config() const {
    return config;
  }
};
|
|
|
} // namespace inja |
|
|
|
#endif // INCLUDE_INJA_LEXER_HPP_
|
|
|