diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -66,6 +66,44 @@ class Asterisk(Plus): return str(self.content) + "*" +class Alternative(Token): + def __init__(self, content: list): + self.variants = [] + subsequence = [] + + for token in content: + if isinstance(token, AlternativeSeparator): + if not subsequence: + raise ParsingError("Found an empty Alternative variant.") + self.variants.append(Chain(subsequence)) + subsequence = [] + else: + subsequence.append(token) + + if not subsequence: + raise ParsingError("Found an empty Alternative variant.") + self.variants.append(Chain(subsequence)) + + + def list_first(self): + for x in self.variants: + yield from x.list_first() + + def list_last(self): + for x in self.variants: + yield from x.list_last() + + def list_neighbours(self): + for x in self.variants: + yield from x.list_neighbours() + + @property + def is_skippable(self): + return any(x.is_skippable for x in self.variants) + +class AlternativeSeparator: + pass + class Chain(Token): def __init__(self, content: list): self.content = content @@ -120,6 +158,7 @@ def find_closing_parenthesis(pattern, k) def parse(pattern, offset=0): res = [] + is_alternative = False i = 0 while i < len(pattern): @@ -145,11 +184,18 @@ def parse(pattern, offset=0): i += 1 elif c == ")": raise ParsingError(f'An opening parenthesis not found. Pattern: "{pattern}", position: {i}') + elif c == "|": + is_alternative = True + res.append(AlternativeSeparator()) + i += 1 else: res.append(Symbol(i+offset, c)) i += 1 - return Chain(res) + if is_alternative: + return Alternative(res) + else: + return Chain(res) class Regexp: diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ use regexp::Regexp; fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["*", "((a)", "a)", "+a"] { + for pattern in ["a(b|c)", "a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] { println!("# {pattern}"); let r = match Regexp::new(&pattern.to_string()) { Ok(r1) => r1.determinize(), diff --git a/src/regexp/token.rs b/src/regexp/token.rs --- a/src/regexp/token.rs +++ b/src/regexp/token.rs @@ -1,11 +1,12 @@ -use std::fmt; +use std::{borrow::Borrow, fmt}; #[derive(Debug, Clone)] pub enum ParsingError { Asterisk {s: String, pos: usize}, Plus {s: String, pos: usize}, OpeningParenthesis {s: String, pos: usize}, - ClosingParenthesis {s: String, pos: usize} + ClosingParenthesis {s: String, pos: usize}, + EmptyAlternativeVariant } impl fmt::Display for ParsingError { @@ -22,6 +23,9 @@ impl fmt::Display for ParsingError { }, ParsingError::ClosingParenthesis {s, pos} => { write!(f, "An closing parenthesis not found. Pattern \"{s}\", position {pos}") + }, + ParsingError::EmptyAlternativeVariant => { + write!(f, "Found an empty Alternative variant.") } } } @@ -39,6 +43,10 @@ pub struct Plus { content: Box } +pub struct Alternative { + content: Vec> +} + pub struct Chain { content: Vec> } @@ -47,6 +55,8 @@ pub enum Token { Symbol(Symbol), Asterisk(Asterisk), Plus(Plus), + Alternative(Alternative), + AlternativeSeparator, Chain(Chain) } @@ -108,6 +118,61 @@ impl Plus { } } +impl Alternative { + fn new(content: Vec>) -> Result { + let mut variants: Vec>> = vec![Vec::new()]; + + content.into_iter().for_each(|x| { + if matches!(x.borrow(), Token::AlternativeSeparator) { + variants.push(Vec::new()); + } else { + variants.last_mut().unwrap().push(x); + } + }); + + if variants.iter().any(|x| x.is_empty()) { + return Err(ParsingError::EmptyAlternativeVariant); + } + + return Ok(Alternative{ + content: variants.into_iter().map( + |x| Box::new(Token::Chain(Chain{content: x})) + ).collect() + }); + } + + fn is_skippable(&self) -> bool { + return self.content.iter().any(|x| x.is_skippable()); + } + + fn list_first(&self) -> Vec { + let mut res = Vec::new(); + for token in self.content.iter() { + res.append(&mut token.list_first()); + } + + return res; + } + + fn list_last(&self) -> Vec { + let mut res = Vec::new(); + for token in self.content.iter() { + res.append(&mut token.list_last()); + } + + return res; + } + + fn list_neighbours(&self) -> Vec<(usize, usize)> { + let mut res = Vec::new(); + for token in self.content.iter() { + res.append(&mut token.list_neighbours()); + } + + return res; + } +} + impl Chain { fn is_skippable(&self) -> bool { return self.content.iter().all(|x| x.is_skippable()); @@ -163,6 +228,8 @@ impl Token { Token::Symbol(_) => false, Token::Asterisk(_) => true, Token::Plus(_) => false, + Token::Alternative(t) => t.is_skippable(), + Token::AlternativeSeparator => panic!(), Token::Chain(t) => t.is_skippable() } } @@ -172,6 +239,8 @@ impl Token { Token::Symbol(t) => t.list_first(), Token::Asterisk(t) => t.list_first(), Token::Plus(t) => t.list_first(), + Token::Alternative(t) => t.list_first(), + Token::AlternativeSeparator => panic!(), Token::Chain(t) => t.list_first() } } @@ -181,6 +250,8 @@ impl Token { Token::Symbol(t) => t.list_last(), Token::Asterisk(t) => t.list_last(), Token::Plus(t) => t.list_last(), + Token::Alternative(t) => t.list_last(), + Token::AlternativeSeparator => panic!(), Token::Chain(t) => t.list_last() } } @@ -190,6 +261,8 @@ impl Token { Token::Symbol(t) => t.list_neighbours(), Token::Asterisk(t) => t.list_neighbours(), Token::Plus(t) => t.list_neighbours(), + Token::Alternative(t) => t.list_neighbours(), + Token::AlternativeSeparator => panic!(), Token::Chain(t) => t.list_neighbours() } } @@ -211,6 +284,7 @@ fn find_closing_parenthesis(s: &String) pub fn parse(pattern: &String, offset: usize) -> Result { let chars: Vec = pattern.chars().collect(); let mut res: Vec> = Vec::new(); + let mut is_alternative = false; let mut i = 0; while i < pattern.len() { let c = chars[i]; @@ -234,6 +308,11 @@ pub fn parse(pattern: &String, offset: u ')' => { return Err(ParsingError::OpeningParenthesis {s: pattern.clone(), pos: i}); } + '|' => { + is_alternative = true; + res.push(Box::new(Token::AlternativeSeparator)); + i += 1; + } _c => { res.push(Box::new(Token::Symbol(Symbol{position: i+offset}))); i += 1; @@ -241,7 +320,11 @@ pub fn parse(pattern: &String, offset: u } } - return Ok(Token::Chain(Chain{content: res})); + if is_alternative { + return Ok(Token::Alternative(Alternative::new(res)?)); + } else { + return Ok(Token::Chain(Chain{content: res})); + } } mod test { diff --git a/tests/test_regexp.rs b/tests/test_regexp.rs --- a/tests/test_regexp.rs +++ b/tests/test_regexp.rs @@ -69,6 +69,26 @@ fn test_eval_plus_dfa() { } #[test] +fn test_eval_alternative_nfa() { + let r = Regexp::new(&String::from("a|b|c")).unwrap(); + assert!(r.eval(String::from("a"))); + assert!(r.eval(String::from("b"))); + assert!(r.eval(String::from("c"))); + assert!(!r.eval(String::from(""))); + assert!(!r.eval(String::from("ab"))); +} + +#[test] +fn test_eval_alternative_dfa() { + let r = Regexp::new(&String::from("a|b|c")).unwrap().determinize(); + assert!(r.eval(String::from("a"))); + assert!(r.eval(String::from("b"))); + assert!(r.eval(String::from("c"))); + assert!(!r.eval(String::from(""))); + assert!(!r.eval(String::from("ab"))); +} + +#[test] fn test_invalid_asterisk() { let x = Regexp::new(&String::from("*")); assert!(matches!(x, Err(ParsingError::Asterisk{s: _, pos: 0}))); @@ -91,3 +111,21 @@ fn test_invalid_opening_parenthesis() { let x = Regexp::new(&String::from("a)")); assert!(matches!(x, Err(ParsingError::OpeningParenthesis{s: _, pos: 1}))); } + +#[test] +fn test_invalid_empty_variant_start() { + let x = Regexp::new(&String::from("a(|b)")); + assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); +} + +#[test] +fn test_invalid_empty_variant_end() { + let x = Regexp::new(&String::from("a|")); + assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); +} + +#[test] +fn test_invalid_empty_variant_middle() { + let x = Regexp::new(&String::from("a||b")); + assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); +}