use std::{borrow::Borrow, fmt}; #[derive(Debug, Clone)] pub enum ParsingError { Asterisk {s: String, pos: usize}, OpeningParenthesis {s: String, pos: usize}, ClosingParenthesis {s: String, pos: usize}, EmptyAlternativeVariant } impl fmt::Display for ParsingError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { ParsingError::Asterisk {s, pos} => { write!(f, "The asterisk operator is missing an argument. Pattern \"{s}\", position {pos}") }, ParsingError::OpeningParenthesis {s, pos} => { write!(f, "An opening parenthesis not found. Pattern \"{s}\", position {pos}") }, ParsingError::ClosingParenthesis {s, pos} => { write!(f, "A closing parenthesis not found. Pattern \"{s}\", position {pos}") }, ParsingError::EmptyAlternativeVariant => { write!(f, "Found an empty Alternative variant.") } } } } /// A single letter or other alphabet symbol. pub struct Symbol { /// Symbol position in the regular expression. position: usize } /// A unary operator specifying its content to occur zero or more times. pub struct Asterisk { content: Box } /// An operator with a variable number of arguments, specifying exchangeable alternatives. pub struct Alternative { content: Vec> } /// An operator expressing a concatenation of its content Tokens. pub struct Chain { content: Vec> } /// Enum encapsulating possible items of a regular expression. pub enum Token { Lambda, // An empty string, useful as an `Alternative`. Symbol(Symbol), Asterisk(Asterisk), Alternative(Alternative), // A special token to temporarily separate Alternative variants. Removed in the Alternative constructor. AlternativeSeparator, Chain(Chain) } impl Symbol { fn list_first(&self) -> Vec { return vec![self.position]; } fn list_last(&self) -> Vec { return vec![self.position]; } fn list_neighbours(&self) -> Vec<(usize, usize)> { return vec![]; } } impl Asterisk { fn list_first(&self) -> Vec { return self.content.list_first(); } fn list_last(&self) -> Vec { return self.content.list_last(); } fn list_neighbours(&self) -> Vec<(usize, usize)> { let mut res = self.content.list_neighbours(); for x in self.list_last() { for y in self.list_first() { res.push((x, y)); } } return res; } } impl Alternative { /// Split a sequence of `Tokens` by `AlternativeSeparator` into alternative variants and return the result. /// If any variant is empty, return an `Err``. fn new(content: Vec>) -> Result { let mut variants: Vec>> = vec![Vec::new()]; content.into_iter().for_each(|x| { if matches!(x.borrow(), Token::AlternativeSeparator) { variants.push(Vec::new()); } else { variants.last_mut().unwrap().push(x); } }); if variants.iter().any(|x| x.is_empty()) { return Err(ParsingError::EmptyAlternativeVariant); } return Ok(Alternative{ content: variants.into_iter().map( |x| Box::new(Token::Chain(Chain{content: x})) ).collect() }); } fn is_skippable(&self) -> bool { return self.content.iter().any(|x| x.is_skippable()); } fn list_first(&self) -> Vec { let mut res = Vec::new(); for token in self.content.iter() { res.append(&mut token.list_first()); } return res; } fn list_last(&self) -> Vec { let mut res = Vec::new(); for token in self.content.iter() { res.append(&mut token.list_last()); } return res; } fn list_neighbours(&self) -> Vec<(usize, usize)> { let mut res = Vec::new(); for token in self.content.iter() { res.append(&mut token.list_neighbours()); } return res; } } impl Chain { fn is_skippable(&self) -> bool { return self.content.iter().all(|x| x.is_skippable()); } fn list_first(&self) -> Vec { let mut res = Vec::new(); for token in self.content.iter() { res.append(&mut token.list_first()); if !token.is_skippable() {break;} } return res; } fn list_last(&self) -> Vec { let mut res = Vec::new(); for token in self.content.iter().rev() { res.append(&mut token.list_last()); if !token.is_skippable() {break;} } return res; } fn list_neighbours(&self) -> Vec<(usize, usize)> { let mut res = Vec::new(); let mut previous: Vec<&Box> = Vec::new(); for token in self.content.iter() { for t in previous.iter() { for x in t.list_last() { for y in token.list_first() { res.push((x, y)); } } } res.append(&mut token.list_neighbours()); if token.is_skippable() { previous.push(token); } else { previous = vec![token]; } } return res; } } impl Token { /// Decide whether the `Token` has to contain some `Symbols`, /// or whether it can be stepped over when looking for first, last and neighbouring items. pub fn is_skippable(&self) -> bool { match self { Token::Lambda => true, Token::Symbol(_) => false, Token::Asterisk(_) => true, Token::Alternative(t) => t.is_skippable(), Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.is_skippable() } } /// List all possible string positions the token can start with. pub fn list_first(&self) -> Vec { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_first(), Token::Asterisk(t) => t.list_first(), Token::Alternative(t) => t.list_first(), Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_first() } } /// List all possible string positions the token can end with. pub fn list_last(&self) -> Vec { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_last(), Token::Asterisk(t) => t.list_last(), Token::Alternative(t) => t.list_last(), Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_last() } } /// List positions of all possibly neighbouring subtokens. pub fn list_neighbours(&self) -> Vec<(usize, usize)> { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_neighbours(), Token::Asterisk(t) => t.list_neighbours(), Token::Alternative(t) => t.list_neighbours(), Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_neighbours() } } } /// For a string starting with a parenthesis, find its matching closing parenthesis, or return None. fn find_closing_parenthesis(s: &String) -> Option { let chars: Vec = s.chars().collect(); let mut counter = 0; for (i, c) in chars.iter().enumerate() { if *c == '(' {counter += 1;} else if *c == ')' {counter -= 1;} if counter == 0 {return Some(i);} } return None; } /// Recursively parse the pattern into a `Token` tree. /// /// The `offset` defines where the `pattern` starts relative to the original pattern, /// to record correct global token positions in the subcalls pub fn parse(pattern: &String, offset: usize) -> Result { let chars: Vec = pattern.chars().collect(); let mut res: Vec> = Vec::new(); let mut is_alternative = false; let mut i = 0; while i < pattern.len() { let c = chars[i]; match c { '(' => { let j = find_closing_parenthesis(&pattern[i..].to_string()).ok_or(ParsingError::ClosingParenthesis {s: pattern.clone(), pos: i})? + i; let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1)?; res.push(Box::new(inner_content)); i = j+1; } '*' => { let token = res.pop().ok_or(ParsingError::Asterisk{s: pattern.clone(), pos: i})?; res.push(Box::new(Token::Asterisk(Asterisk{content: token}))); i += 1; } ')' => { return Err(ParsingError::OpeningParenthesis {s: pattern.clone(), pos: i}); } '|' | '+' => { is_alternative = true; res.push(Box::new(Token::AlternativeSeparator)); i += 1; } '_' => { res.push(Box::new(Token::Lambda)); i += 1; } _c => { res.push(Box::new(Token::Symbol(Symbol{position: i+offset}))); i += 1; } } } if is_alternative { return Ok(Token::Alternative(Alternative::new(res)?)); } else { return Ok(Token::Chain(Chain{content: res})); } } mod test { use super::*; #[test] fn test_closing_parenthesis() { let s = "()"; assert_eq!(find_closing_parenthesis(&s.to_string()), Some(1)); } }