# HG changeset patch # User Laman # Date 2024-06-23 22:48:03 # Node ID 4f7b6352013d562a3a5b78fd4dd7cce1cb863015 # Parent 0163ce5ddc96491cd933cf6c9521b597ab2e5bbd added the automaton normalization diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -1,4 +1,5 @@ from abc import abstractmethod +from collections import deque class ParsingError(Exception): @@ -283,7 +284,7 @@ class RegexpDFA: return cls(rules, end_states) def match(self, s): - st = (-1,) + st = 0 for c in s: key = (st, c) @@ -300,6 +301,24 @@ class RegexpDFA: return RegexpDFA(rules, end_states) + def normalize(self): + index = {(-1,): 0} + queue = deque([(-1,)]) + + while queue: + state = queue.popleft() + edges = [((st, c), t) for ((st, c), t) in self.rules.items() if st == state] + edges.sort() + for ((st, c), t) in edges: + if t not in index: + index[t] = len(index) + queue.append(t) + + rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) + end_states = {index[st] for st in self.end_states} + + return RegexpDFA(rules, end_states) + def _find_equivalent_states(self): state_set = [(-2,), (-1,)] + sorted(set(self.rules.values())) alphabet = {c for (st, c) in self.rules.keys()} @@ -340,10 +359,10 @@ class RegexpDFA: if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*"]: + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]: print("#", pattern) try: - r = RegexpDFA.create(pattern).reduce() + r = RegexpDFA.create(pattern).reduce().normalize() except ParsingError as e: print("Failed to parse the regexp:") print(e) diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -2,10 +2,10 @@ use regexp::Regexp; fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*"] { + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"] { println!("# {pattern}"); let r = match Regexp::new(&pattern.to_string()) { - Ok(r1) => r1.determinize().reduce(), + Ok(r1) => r1.determinize().reduce().normalize(), Err(e) => { println!("{e}"); continue; diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{HashMap, HashSet, VecDeque}; mod token; pub use token::ParsingError; @@ -151,6 +151,28 @@ impl RegexpDFA { return self.collapse_states(equivalents); } + pub fn normalize(&self) -> RegexpDFA { + let mut index = HashMap::from([(START as u64, START as u64)]); + let mut queue = VecDeque::from([START as u64]); + + while !queue.is_empty() { + let state = queue.pop_front().unwrap(); + let mut edges: Vec<((u64, char), u64)> = self.rules.iter().filter(|((st, c), t)| *st == state).map(|((st, c), t)| ((*st, *c), *t)).collect(); + edges.sort(); + for ((_st, _c), t) in edges { + if !index.contains_key(&t) { + index.insert(t, index.len() as u64); + queue.push_back(t); + } + } + } + + let rules = self.rules.iter().map(|((st, c), t)| ((index[st as &u64], *c), index[t as &u64])).collect(); + let end_states = self.end_states.iter().map(|st| index[st]).collect(); + + return RegexpDFA{rules, end_states}; + } + fn find_equivalent_states(&self) -> Vec<(u64, u64)> { let state_set: HashSet = HashSet::from_iter(self.rules.values().copied()); let mut state_vec: Vec = Vec::from_iter(state_set.into_iter()); diff --git a/tests/test_regexp.rs b/tests/test_regexp.rs --- a/tests/test_regexp.rs +++ b/tests/test_regexp.rs @@ -11,7 +11,7 @@ fn test_eval_basic_nfa() { #[test] fn test_eval_basic_dfa() { - let r = Regexp::new(&String::from("abc")).unwrap().determinize().reduce(); + let r = Regexp::new(&String::from("abc")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("abc"))); assert!(!r.eval(String::from("ab"))); assert!(!r.eval(String::from("abcd"))); @@ -27,9 +27,9 @@ fn test_eval_empty_nfa() { #[test] fn test_eval_empty_dfa() { - assert!(Regexp::new(&String::from("a*")).unwrap().determinize().reduce().eval(String::from(""))); - assert!(Regexp::new(&String::from("")).unwrap().determinize().reduce().eval(String::from(""))); - assert!(!Regexp::new(&String::from("")).unwrap().determinize().reduce().eval(String::from("a"))); + assert!(Regexp::new(&String::from("a*")).unwrap().determinize().reduce().normalize().eval(String::from(""))); + assert!(Regexp::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from(""))); + assert!(!Regexp::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from("a"))); } #[test] @@ -43,7 +43,7 @@ fn test_eval_asterisk_nfa() { #[test] fn test_eval_asterisk_dfa() { - let r = Regexp::new(&String::from("a*b*")).unwrap().determinize().reduce(); + let r = Regexp::new(&String::from("a*b*")).unwrap().determinize().normalize().reduce(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("aabb"))); @@ -62,7 +62,7 @@ fn test_eval_alternative_nfa() { #[test] fn test_eval_alternative_dfa() { - let r = Regexp::new(&String::from("a|b|c")).unwrap().determinize().reduce(); + let r = Regexp::new(&String::from("a|b|c")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("b"))); assert!(r.eval(String::from("c"))); @@ -85,12 +85,12 @@ fn test_eval_lambda_nfa() { #[test] fn test_eval_lambda_dfa() { - let r = Regexp::new(&String::from("a_")).unwrap().determinize().reduce(); + let r = Regexp::new(&String::from("a_")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(!r.eval(String::from(""))); assert!(!r.eval(String::from("ab"))); - let r = Regexp::new(&String::from("a|_")).unwrap().determinize().reduce(); + let r = Regexp::new(&String::from("a|_")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from(""))); assert!(!r.eval(String::from("b")));