mirror of https://git.meli.delivery/meli/meli
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
591 lines
21 KiB
Rust
591 lines
21 KiB
Rust
/*
|
|
* Copyright (c) 2020-2023, Stalwart Labs Ltd.
|
|
*
|
|
* This file is part of the Stalwart Sieve Interpreter.
|
|
*
|
|
* This program is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as
|
|
* published by the Free Software Foundation, either version 3 of
|
|
* the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Affero General Public License for more details.
|
|
* in the LICENSE file at the top-level directory of this distribution.
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
* You can be released from the requirements of the AGPLv3 license by
|
|
* purchasing a commercial license. Please contact licensing@stalw.art
|
|
* for more details.
|
|
*/
|
|
|
|
use std::{iter::Peekable, slice::Iter};
|
|
|
|
use crate::sieve::{
|
|
compiler::{CompileError, ErrorType, Number},
|
|
runtime::eval::IntoString,
|
|
Compiler,
|
|
};
|
|
|
|
use super::{word::WORDS, StringConstant, Token};
|
|
|
|
pub(crate) struct Tokenizer<'x> {
|
|
pub compiler: &'x Compiler,
|
|
pub iter: Peekable<Iter<'x, u8>>,
|
|
pub buf: Vec<u8>,
|
|
pub next_token: Vec<TokenInfo>,
|
|
|
|
pub pos: usize,
|
|
pub line_num: usize,
|
|
pub line_start: usize,
|
|
|
|
pub text_line_num: usize,
|
|
pub text_line_pos: usize,
|
|
|
|
pub token_line_num: usize,
|
|
pub token_line_pos: usize,
|
|
|
|
pub token_is_tag: bool,
|
|
|
|
pub last_ch: u8,
|
|
pub state: State,
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
pub(crate) struct TokenInfo {
|
|
pub(crate) token: Token,
|
|
pub(crate) line_num: usize,
|
|
pub(crate) line_pos: usize,
|
|
}
|
|
|
|
pub(crate) enum State {
|
|
None,
|
|
BracketComment,
|
|
HashComment,
|
|
QuotedString(StringType),
|
|
MultiLine(StringType),
|
|
}
|
|
|
|
#[derive(Clone, Copy, Default)]
|
|
pub(crate) struct StringType {
|
|
maybe_variable: bool,
|
|
has_other: bool,
|
|
has_digits: bool,
|
|
has_dots: bool,
|
|
}
|
|
|
|
impl<'x> Tokenizer<'x> {
|
|
pub fn new(compiler: &'x Compiler, bytes: &'x [u8]) -> Self {
|
|
Tokenizer {
|
|
compiler,
|
|
iter: bytes.iter().peekable(),
|
|
buf: Vec::with_capacity(bytes.len() / 2),
|
|
pos: usize::MAX,
|
|
line_num: 1,
|
|
line_start: 0,
|
|
text_line_num: 0,
|
|
text_line_pos: 0,
|
|
token_line_num: 0,
|
|
token_line_pos: 0,
|
|
token_is_tag: false,
|
|
next_token: Vec::with_capacity(2),
|
|
last_ch: 0,
|
|
state: State::None,
|
|
}
|
|
}
|
|
|
|
pub fn get_current_token(&mut self) -> Option<TokenInfo> {
|
|
if !self.buf.is_empty() {
|
|
let word = std::str::from_utf8(&self.buf).unwrap();
|
|
let token = if let Some(word) = WORDS.get(word) {
|
|
if self.token_is_tag {
|
|
self.token_line_pos -= 1;
|
|
Token::Tag(*word)
|
|
} else {
|
|
Token::Identifier(*word)
|
|
}
|
|
} else if self.buf.first().unwrap().is_ascii_digit() {
|
|
let multiplier = match self.buf.last().unwrap() {
|
|
b'k' => 1024,
|
|
b'm' => 1048576,
|
|
b'g' => 1073741824,
|
|
_ => 1,
|
|
};
|
|
|
|
if let Ok(number) = (if multiplier > 1 && self.buf.len() > 1 {
|
|
std::str::from_utf8(&self.buf[..self.buf.len() - 1]).unwrap()
|
|
} else {
|
|
word
|
|
})
|
|
.parse::<usize>()
|
|
{
|
|
Token::Number(number.saturating_mul(multiplier))
|
|
} else if self.token_is_tag {
|
|
Token::Unknown(format!(":{word}"))
|
|
} else {
|
|
Token::Unknown(word.to_string())
|
|
}
|
|
} else if self.token_is_tag {
|
|
Token::Unknown(format!(":{word}"))
|
|
} else {
|
|
Token::Unknown(word.to_string())
|
|
};
|
|
|
|
self.reset_current_token();
|
|
|
|
Some(TokenInfo {
|
|
token,
|
|
line_num: self.token_line_num,
|
|
line_pos: self.token_line_pos,
|
|
})
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn reset_current_token(&mut self) {
|
|
self.buf.clear();
|
|
self.token_is_tag = false;
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn token_is_tag(&mut self) {
|
|
self.token_is_tag = true;
|
|
}
|
|
|
|
pub fn get_token(&mut self, token: Token) -> TokenInfo {
|
|
let next_token = TokenInfo {
|
|
token,
|
|
line_num: self.line_num,
|
|
line_pos: self.pos - self.line_start,
|
|
};
|
|
if let Some(token) = self.get_current_token() {
|
|
self.next_token.push(next_token);
|
|
token
|
|
} else {
|
|
next_token
|
|
}
|
|
}
|
|
|
|
pub fn get_string(&mut self, str_type: StringType) -> Result<TokenInfo, CompileError> {
|
|
if self.buf.len() < self.compiler.max_string_size {
|
|
let token = if str_type.maybe_variable {
|
|
Token::StringVariable(self.buf.to_vec())
|
|
} else {
|
|
let constant = self.buf.to_vec().into_string();
|
|
if !str_type.has_other && str_type.has_digits {
|
|
if !str_type.has_dots {
|
|
if let Some(number) = constant.parse::<i64>().ok().and_then(|n| {
|
|
if n.to_string() == constant {
|
|
Some(n)
|
|
} else {
|
|
None
|
|
}
|
|
}) {
|
|
Token::StringConstant(StringConstant::Number(Number::Integer(number)))
|
|
} else {
|
|
Token::StringConstant(StringConstant::String(constant))
|
|
}
|
|
} else if let Some(number) = constant.parse::<f64>().ok().and_then(|n| {
|
|
if n.to_string() == constant {
|
|
Some(n)
|
|
} else {
|
|
None
|
|
}
|
|
}) {
|
|
Token::StringConstant(StringConstant::Number(Number::Float(number)))
|
|
} else {
|
|
Token::StringConstant(StringConstant::String(constant))
|
|
}
|
|
} else {
|
|
Token::StringConstant(StringConstant::String(constant))
|
|
}
|
|
};
|
|
|
|
self.buf.clear();
|
|
|
|
Ok(TokenInfo {
|
|
token,
|
|
line_num: self.text_line_num,
|
|
line_pos: self.text_line_pos,
|
|
})
|
|
} else {
|
|
Err(CompileError {
|
|
line_num: self.text_line_num,
|
|
line_pos: self.text_line_pos,
|
|
error_type: ErrorType::StringTooLong,
|
|
})
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn push_byte(&mut self, ch: u8) {
|
|
if self.buf.is_empty() {
|
|
self.token_line_num = self.line_num;
|
|
self.token_line_pos = self.pos - self.line_start;
|
|
}
|
|
self.buf.push(ch);
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn new_line(&mut self) {
|
|
self.line_num += 1;
|
|
self.line_start = self.pos;
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn text_start(&mut self) {
|
|
self.text_line_num = self.line_num;
|
|
self.text_line_pos = self.pos - self.line_start;
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn is_token_start(&self) -> bool {
|
|
self.buf.is_empty()
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn token_bytes(&self) -> &[u8] {
|
|
&self.buf
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn next_byte(&mut self) -> Option<(u8, u8)> {
|
|
self.iter.next().map(|&ch| {
|
|
let last_ch = self.last_ch;
|
|
self.pos = self.pos.wrapping_add(1);
|
|
self.last_ch = ch;
|
|
(ch, last_ch)
|
|
})
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn peek_byte(&mut self) -> Option<u8> {
|
|
self.iter.peek().map(|ch| **ch)
|
|
}
|
|
|
|
pub fn unwrap_next(&mut self) -> Result<TokenInfo, CompileError> {
|
|
if let Some(token) = self.next() {
|
|
token
|
|
} else {
|
|
Err(CompileError {
|
|
line_num: self.line_num,
|
|
line_pos: self.pos - self.line_start,
|
|
error_type: ErrorType::UnexpectedEOF,
|
|
})
|
|
}
|
|
}
|
|
|
|
pub fn expect_token(&mut self, token: Token) -> Result<(), CompileError> {
|
|
let next_token = self.unwrap_next()?;
|
|
if next_token.token == token {
|
|
Ok(())
|
|
} else {
|
|
Err(next_token.expected(format!("'{token}'")))
|
|
}
|
|
}
|
|
|
|
pub fn expect_static_string(&mut self) -> Result<String, CompileError> {
|
|
let next_token = self.unwrap_next()?;
|
|
match next_token.token {
|
|
Token::StringConstant(s) => Ok(s.into_string()),
|
|
Token::BracketOpen => {
|
|
let mut string = None;
|
|
loop {
|
|
let token_info = self.unwrap_next()?;
|
|
match token_info.token {
|
|
Token::StringConstant(string_) => {
|
|
string = string_.into();
|
|
}
|
|
Token::BracketClose if string.is_some() => break,
|
|
_ => return Err(token_info.expected("constant string")),
|
|
}
|
|
}
|
|
Ok(string.unwrap().into_string())
|
|
}
|
|
_ => Err(next_token.expected("constant string")),
|
|
}
|
|
}
|
|
|
|
pub fn expect_number(&mut self, max_value: usize) -> Result<usize, CompileError> {
|
|
let next_token = self.unwrap_next()?;
|
|
if let Token::Number(n) = next_token.token {
|
|
if n < max_value {
|
|
Ok(n)
|
|
} else {
|
|
Err(next_token.expected(format!("number lower than {max_value}")))
|
|
}
|
|
} else {
|
|
Err(next_token.expected("number"))
|
|
}
|
|
}
|
|
|
|
pub fn invalid_character(&self) -> CompileError {
|
|
CompileError {
|
|
line_num: self.line_num,
|
|
line_pos: self.pos - self.line_start,
|
|
error_type: ErrorType::InvalidCharacter(self.last_ch),
|
|
}
|
|
}
|
|
|
|
pub fn peek(&mut self) -> Option<Result<&TokenInfo, CompileError>> {
|
|
if self.next_token.is_empty() {
|
|
match self.next()? {
|
|
Ok(next_token) => self.next_token.push(next_token),
|
|
Err(err) => return Some(Err(err)),
|
|
}
|
|
}
|
|
self.next_token.last().map(Ok)
|
|
}
|
|
}
|
|
|
|
impl<'x> Iterator for Tokenizer<'x> {
|
|
type Item = Result<TokenInfo, CompileError>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
if let Some(prev_token) = self.next_token.pop() {
|
|
return Some(Ok(prev_token));
|
|
}
|
|
|
|
'outer: while let Some((ch, last_ch)) = self.next_byte() {
|
|
match self.state {
|
|
State::None => match ch {
|
|
b'a'..=b'z' | b'0'..=b'9' | b'_' | b'.' | b'$' => {
|
|
self.push_byte(ch);
|
|
}
|
|
b'A'..=b'Z' => {
|
|
self.push_byte(ch.to_ascii_lowercase());
|
|
}
|
|
b':' => {
|
|
if self.is_token_start()
|
|
&& matches!(self.peek_byte(), Some(b) if b.is_ascii_alphabetic())
|
|
{
|
|
self.token_is_tag();
|
|
} else if self.token_bytes().eq_ignore_ascii_case(b"text") {
|
|
self.state = State::MultiLine(StringType::default());
|
|
self.text_start();
|
|
while let Some((ch, _)) = self.next_byte() {
|
|
if ch == b'\n' {
|
|
self.new_line();
|
|
self.reset_current_token();
|
|
continue 'outer;
|
|
}
|
|
}
|
|
} else {
|
|
return Some(Ok(self.get_token(Token::Colon)));
|
|
//return Some(Err(self.invalid_character()));
|
|
}
|
|
}
|
|
b'"' => {
|
|
self.state = State::QuotedString(StringType::default());
|
|
self.text_start();
|
|
if let Some(token) = self.get_current_token() {
|
|
return Some(Ok(token));
|
|
}
|
|
}
|
|
b'{' => {
|
|
return Some(Ok(self.get_token(Token::CurlyOpen)));
|
|
}
|
|
b'}' => {
|
|
return Some(Ok(self.get_token(Token::CurlyClose)));
|
|
}
|
|
b';' => {
|
|
return Some(Ok(self.get_token(Token::Semicolon)));
|
|
}
|
|
b',' => {
|
|
return Some(Ok(self.get_token(Token::Comma)));
|
|
}
|
|
b'[' => {
|
|
return Some(Ok(self.get_token(Token::BracketOpen)));
|
|
}
|
|
b']' => {
|
|
return Some(Ok(self.get_token(Token::BracketClose)));
|
|
}
|
|
b'(' => {
|
|
return Some(Ok(self.get_token(Token::ParenthesisOpen)));
|
|
}
|
|
b')' => {
|
|
return Some(Ok(self.get_token(Token::ParenthesisClose)));
|
|
}
|
|
b'/' => {
|
|
if let Some((b'*', _)) = self.next_byte() {
|
|
self.last_ch = 0;
|
|
self.state = State::BracketComment;
|
|
self.text_start();
|
|
if let Some(token) = self.get_current_token() {
|
|
return Some(Ok(token));
|
|
}
|
|
} else {
|
|
return Some(Err(self.invalid_character()));
|
|
}
|
|
}
|
|
b'#' => {
|
|
self.state = State::HashComment;
|
|
if let Some(token) = self.get_current_token() {
|
|
return Some(Ok(token));
|
|
}
|
|
}
|
|
b'\n' => {
|
|
self.new_line();
|
|
if let Some(token) = self.get_current_token() {
|
|
return Some(Ok(token));
|
|
}
|
|
}
|
|
b' ' | b'\t' | b'\r' => {
|
|
if let Some(token) = self.get_current_token() {
|
|
return Some(Ok(token));
|
|
}
|
|
}
|
|
_ => {
|
|
return Some(Err(self.invalid_character()));
|
|
}
|
|
},
|
|
State::BracketComment { .. } => match ch {
|
|
b'/' if last_ch == b'*' => {
|
|
self.state = State::None;
|
|
}
|
|
b'\n' => {
|
|
self.new_line();
|
|
}
|
|
_ => (),
|
|
},
|
|
State::HashComment => {
|
|
if ch == b'\n' {
|
|
self.state = State::None;
|
|
self.new_line();
|
|
}
|
|
}
|
|
State::QuotedString(mut str_type) => match ch {
|
|
b'"' if last_ch != b'\\' => {
|
|
self.state = State::None;
|
|
return Some(self.get_string(str_type));
|
|
}
|
|
b'\n' => {
|
|
self.new_line();
|
|
self.push_byte(b'\n');
|
|
str_type.has_other = true;
|
|
self.state = State::QuotedString(str_type);
|
|
}
|
|
b'{' if (last_ch == b'$' || last_ch == b'%') => {
|
|
str_type.maybe_variable = true;
|
|
self.state = State::QuotedString(str_type);
|
|
self.push_byte(ch);
|
|
}
|
|
b'\\' => {
|
|
if last_ch == b'\\' {
|
|
self.push_byte(ch);
|
|
}
|
|
}
|
|
b'0'..=b'9' => {
|
|
if !str_type.has_digits {
|
|
str_type.has_digits = true;
|
|
self.state = State::QuotedString(str_type);
|
|
}
|
|
self.push_byte(ch);
|
|
}
|
|
b'.' => {
|
|
if !str_type.has_dots {
|
|
str_type.has_dots = true;
|
|
} else {
|
|
str_type.has_other = true;
|
|
}
|
|
self.state = State::QuotedString(str_type);
|
|
self.push_byte(ch);
|
|
}
|
|
_ => {
|
|
if !str_type.has_other && ch != b'-' {
|
|
str_type.has_other = true;
|
|
self.state = State::QuotedString(str_type);
|
|
}
|
|
self.push_byte(ch);
|
|
}
|
|
},
|
|
State::MultiLine(mut str_type) => match ch {
|
|
b'.' if last_ch == b'\n' => {
|
|
let is_eof = match (self.next_byte(), self.peek_byte()) {
|
|
(Some((b'\r', _)), Some(b'\n')) => {
|
|
self.next_byte();
|
|
true
|
|
}
|
|
(Some((b'\n', _)), _) => true,
|
|
(Some((b'.', _)), _) => {
|
|
self.push_byte(b'.');
|
|
false
|
|
}
|
|
(Some((ch, _)), _) => {
|
|
self.push_byte(b'.');
|
|
self.push_byte(ch);
|
|
false
|
|
}
|
|
_ => false,
|
|
};
|
|
|
|
if is_eof {
|
|
self.new_line();
|
|
self.state = State::None;
|
|
return Some(self.get_string(str_type));
|
|
}
|
|
}
|
|
b'\n' => {
|
|
self.new_line();
|
|
self.push_byte(b'\n');
|
|
}
|
|
b'{' if (last_ch == b'$' || last_ch == b'%') => {
|
|
str_type.maybe_variable = true;
|
|
self.state = State::MultiLine(str_type);
|
|
self.push_byte(ch);
|
|
}
|
|
b'0'..=b'9' => {
|
|
if !str_type.has_digits {
|
|
str_type.has_digits = true;
|
|
self.state = State::MultiLine(str_type);
|
|
}
|
|
self.push_byte(ch);
|
|
}
|
|
b'.' => {
|
|
if !str_type.has_dots {
|
|
str_type.has_dots = true;
|
|
} else {
|
|
str_type.has_other = true;
|
|
}
|
|
self.state = State::MultiLine(str_type);
|
|
self.push_byte(ch);
|
|
}
|
|
_ => {
|
|
if !str_type.has_other && ch != b'-' {
|
|
str_type.has_other = true;
|
|
self.state = State::MultiLine(str_type);
|
|
}
|
|
self.push_byte(ch);
|
|
}
|
|
},
|
|
}
|
|
}
|
|
|
|
match self.state {
|
|
State::BracketComment | State::QuotedString(_) | State::MultiLine(_) => {
|
|
Some(Err(CompileError {
|
|
line_num: self.text_line_num,
|
|
line_pos: self.text_line_pos,
|
|
error_type: (&self.state).into(),
|
|
}))
|
|
}
|
|
_ => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl From<&State> for ErrorType {
|
|
fn from(state: &State) -> Self {
|
|
match state {
|
|
State::BracketComment => ErrorType::UnterminatedComment,
|
|
State::QuotedString(_) => ErrorType::UnterminatedString,
|
|
State::MultiLine(_) => ErrorType::UnterminatedMultiline,
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
}
|