# HG changeset patch # User Laman # Date 2024-06-18 16:21:22 # Node ID 7e640b0cffa728e0f5af3abd3d01ca74cfe3b352 # Parent 9ddf4beb947bef7b379bf11afd0292b5773f74cd handling unparsable patterns diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -1,6 +1,10 @@ from abc import abstractmethod +class ParsingError(Exception): + pass + + class Token: is_skippable = False @@ -111,7 +115,7 @@ def find_closing_parenthesis(pattern, k) if counter == 0: return k+i - return None + raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}') def parse(pattern, offset=0): @@ -126,13 +130,21 @@ def parse(pattern, offset=0): res.append(inner_content) i = j+1 elif c == "*": - token = res.pop() + try: + token = res.pop() + except IndexError as e: + raise ParsingError(f'The asterisk operator is missing an argument. Pattern: "{pattern}", position {i}') res.append(Asterisk(token)) i += 1 elif c == "+": - token = res.pop() + try: + token = res.pop() + except IndexError as e: + raise ParsingError(f'The plus operator is missing an argument. Pattern: "{pattern}", position {i}') res.append(Plus(token)) i += 1 + elif c == ")": + raise ParsingError(f'An opening parenthesis not found. Pattern: "{pattern}", position: {i}') else: res.append(Symbol(i+offset, c)) i += 1 @@ -228,9 +240,14 @@ class RegexpDFA: if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"]: + for pattern in ["*", "((a)", "a)"]: print("#", pattern) - r = RegexpDFA(pattern) + try: + r = RegexpDFA(pattern) + except ParsingError as e: + print("Failed to parse the regexp:") + print(e) + continue for t in tests: print(t, r.match(t)) print() diff --git a/src/lib.rs b/src/lib.rs --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,2 @@ mod regexp; -pub use regexp::Regexp; +pub use regexp::{Regexp, ParsingError}; diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -2,9 +2,15 @@ use regexp::Regexp; fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] { + for pattern in ["*", "((a)", "a)", "+a"] { println!("# {pattern}"); - let r = Regexp::new(&pattern.to_string()).determinize(); + let r = match Regexp::new(&pattern.to_string()) { + Ok(r1) => r1.determinize(), + Err(e) => { + println!("{e}"); + continue; + } + }; for &t in tests.iter() { println!("{t} {}", r.eval(t.to_string())); } diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -1,6 +1,7 @@ use std::collections::{HashMap, HashSet}; mod token; +pub use token::ParsingError; use token::{Token, parse}; const START: usize = usize::MAX; @@ -28,14 +29,15 @@ fn decode_set(x: u64) ->HashSet { return res; } +#[derive(Debug)] pub struct Regexp { rules: HashMap<(usize, char), HashSet>, end_states: HashSet } impl Regexp { - pub fn new(pattern: &String) -> Regexp { - let r = parse(pattern, 0); + pub fn new(pattern: &String) -> Result { + let r = parse(pattern, 0)?; let pattern_chars = Vec::from_iter(pattern.chars()); let mut rules: HashMap<(usize, char), HashSet> = HashMap::new(); @@ -62,7 +64,7 @@ impl Regexp { end_states.insert(START); } - return Regexp{rules, end_states}; + return Ok(Regexp{rules, end_states}); } pub fn eval(&self, s: String) -> bool { diff --git a/src/regexp/token.rs b/src/regexp/token.rs --- a/src/regexp/token.rs +++ b/src/regexp/token.rs @@ -1,3 +1,32 @@ +use std::fmt; + +#[derive(Debug, Clone)] +pub enum ParsingError { + Asterisk {s: String, pos: usize}, + Plus {s: String, pos: usize}, + OpeningParenthesis {s: String, pos: usize}, + ClosingParenthesis {s: String, pos: usize} +} + +impl fmt::Display for ParsingError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + ParsingError::Asterisk {s, pos} => { + write!(f, "The asterisk operator is missing an argument. Pattern \"{s}\", position {pos}") + }, + ParsingError::Plus {s, pos} => { + write!(f, "The plus operator is missing an argument. Pattern \"{s}\", position {pos}") + }, + ParsingError::OpeningParenthesis {s, pos} => { + write!(f, "An opening parenthesis not found. Pattern \"{s}\", position {pos}") + }, + ParsingError::ClosingParenthesis {s, pos} => { + write!(f, "An closing parenthesis not found. Pattern \"{s}\", position {pos}") + } + } + } +} + pub trait Token { fn is_skippable(&self) -> bool {false} fn list_first(&self) -> Vec; @@ -144,7 +173,7 @@ fn find_closing_parenthesis(s: &String) return None; } -pub fn parse(pattern: &String, offset: usize) -> Chain { +pub fn parse(pattern: &String, offset: usize) -> Result { let chars: Vec = pattern.chars().collect(); let mut res: Vec> = Vec::new(); let mut i = 0; @@ -152,21 +181,24 @@ pub fn parse(pattern: &String, offset: u let c = chars[i]; match c { '(' => { - let j = find_closing_parenthesis(&pattern[i..].to_string()).unwrap() + i; - let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1); + let j = find_closing_parenthesis(&pattern[i..].to_string()).ok_or(ParsingError::ClosingParenthesis {s: pattern.clone(), pos: i})? + i; + let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1)?; res.push(Box::new(inner_content)); i = j+1; } '*' => { - let token = res.pop().unwrap(); + let token = res.pop().ok_or(ParsingError::Asterisk{s: pattern.clone(), pos: i})?; res.push(Box::new(Asterisk{content: token})); i += 1; } '+' => { - let token = res.pop().unwrap(); + let token = res.pop().ok_or(ParsingError::Plus{s: pattern.clone(), pos: i})?; res.push(Box::new(Plus{content: token})); i += 1; } + ')' => { + return Err(ParsingError::OpeningParenthesis {s: pattern.clone(), pos: i}); + } c => { res.push(Box::new(Symbol{position: i+offset, value: c})); i += 1; @@ -174,7 +206,7 @@ pub fn parse(pattern: &String, offset: u } } - return Chain{content: res}; + return Ok(Chain{content: res}); } mod test { diff --git a/tests/test_regexp.rs b/tests/test_regexp.rs --- a/tests/test_regexp.rs +++ b/tests/test_regexp.rs @@ -1,8 +1,9 @@ use regexp::Regexp; +use regexp::ParsingError; #[test] fn test_eval_basic_nfa() { - let r = Regexp::new(&String::from("abc")); + let r = Regexp::new(&String::from("abc")).unwrap(); assert!(r.eval(String::from("abc"))); assert!(!r.eval(String::from("ab"))); assert!(!r.eval(String::from("abcd"))); @@ -10,7 +11,7 @@ fn test_eval_basic_nfa() { #[test] fn test_eval_basic_dfa() { - let r = Regexp::new(&String::from("abc")).determinize(); + let r = Regexp::new(&String::from("abc")).unwrap().determinize(); assert!(r.eval(String::from("abc"))); assert!(!r.eval(String::from("ab"))); assert!(!r.eval(String::from("abcd"))); @@ -18,22 +19,22 @@ fn test_eval_basic_dfa() { #[test] fn test_eval_empty_nfa() { - assert!(Regexp::new(&String::from("a*")).eval(String::from(""))); - assert!(Regexp::new(&String::from("")).eval(String::from(""))); - assert!(!Regexp::new(&String::from("")).eval(String::from("a"))); - assert!(!Regexp::new(&String::from("a")).eval(String::from(""))); + assert!(Regexp::new(&String::from("a*")).unwrap().eval(String::from(""))); + assert!(Regexp::new(&String::from("")).unwrap().eval(String::from(""))); + assert!(!Regexp::new(&String::from("")).unwrap().eval(String::from("a"))); + assert!(!Regexp::new(&String::from("a")).unwrap().eval(String::from(""))); } #[test] fn test_eval_empty_dfa() { - assert!(Regexp::new(&String::from("a*")).determinize().eval(String::from(""))); - assert!(Regexp::new(&String::from("")).determinize().eval(String::from(""))); - assert!(!Regexp::new(&String::from("")).determinize().eval(String::from("a"))); + assert!(Regexp::new(&String::from("a*")).unwrap().determinize().eval(String::from(""))); + assert!(Regexp::new(&String::from("")).unwrap().determinize().eval(String::from(""))); + assert!(!Regexp::new(&String::from("")).unwrap().determinize().eval(String::from("a"))); } #[test] fn test_eval_asterisk_nfa() { - let r = Regexp::new(&String::from("a*b*")); + let r = Regexp::new(&String::from("a*b*")).unwrap(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("aabb"))); @@ -42,7 +43,7 @@ fn test_eval_asterisk_nfa() { #[test] fn test_eval_asterisk_dfa() { - let r = Regexp::new(&String::from("a*b*")).determinize(); + let r = Regexp::new(&String::from("a*b*")).unwrap().determinize(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("aabb"))); @@ -51,7 +52,7 @@ fn test_eval_asterisk_dfa() { #[test] fn test_eval_plus_nfa() { - let r = Regexp::new(&String::from("(ab)+")); + let r = Regexp::new(&String::from("(ab)+")).unwrap(); assert!(!r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("abab"))); @@ -60,9 +61,33 @@ fn test_eval_plus_nfa() { #[test] fn test_eval_plus_dfa() { - let r = Regexp::new(&String::from("(ab)+")).determinize(); + let r = Regexp::new(&String::from("(ab)+")).unwrap().determinize(); assert!(!r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("abab"))); assert!(!r.eval(String::from("aabb"))); } + +#[test] +fn test_invalid_asterisk() { + let x = Regexp::new(&String::from("*")); + assert!(matches!(x, Err(ParsingError::Asterisk{s: _, pos: 0}))); +} + +#[test] +fn test_invalid_plus() { + let x = Regexp::new(&String::from("+")); + assert!(matches!(x, Err(ParsingError::Plus{s: _, pos: 0}))); +} + +#[test] +fn test_invalid_closing_parenthesis() { + let x = Regexp::new(&String::from("(a")); + assert!(matches!(x, Err(ParsingError::ClosingParenthesis{s: _, pos: 0}))); +} + +#[test] +fn test_invalid_opening_parenthesis() { + let x = Regexp::new(&String::from("a)")); + assert!(matches!(x, Err(ParsingError::OpeningParenthesis{s: _, pos: 1}))); +}