diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -246,10 +246,10 @@ class Regexp: def determinize(self): rules = dict() - end_states = {(-1,)} if -1 in self.end_states else set() + end_states = {-1} if -1 in self.end_states else set() + index = {(-1,): -1} stack = [(-1,)] - processed_states = set() while stack: multistate = stack.pop() new_rules = dict() @@ -260,13 +260,14 @@ class Regexp: new_rules[c].update(target) for (c, target_set) in new_rules.items(): - new_target = tuple(sorted(target_set)) - rules[(multistate, c)] = new_target - if any(st in self.end_states for st in new_target): - end_states.add(new_target) - if new_target not in processed_states: - stack.append(new_target) - processed_states.add(new_target) + target_tup = tuple(sorted(target_set)) + if target_tup not in index: + new_target = len(index)-1 + index[target_tup] = new_target + stack.append(target_tup) + rules[(index[multistate], c)] = index[target_tup] + if any(st in self.end_states for st in target_set): + end_states.add(index[target_tup]) return (rules, end_states) @@ -284,7 +285,7 @@ class RegexpDFA: return cls(rules, end_states) def match(self, s): - st = 0 + st = -1 for c in s: key = (st, c) @@ -302,8 +303,8 @@ class RegexpDFA: return RegexpDFA(rules, end_states) def normalize(self): - index = {(-1,): 0} - queue = deque([(-1,)]) + index = {-1: -1} + queue = deque([-1]) while queue: state = queue.popleft() @@ -311,7 +312,7 @@ class RegexpDFA: edges.sort() for ((st, c), t) in edges: if t not in index: - index[t] = len(index) + index[t] = len(index)-1 queue.append(t) rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) @@ -320,7 +321,7 @@ class RegexpDFA: return RegexpDFA(rules, end_states) def _find_equivalent_states(self): - state_set = [(-2,), (-1,)] + sorted(set(self.rules.values())) + state_set = [-2, -1] + sorted(set(self.rules.values())) alphabet = {c for (st, c) in self.rules.keys()} equivalents = {(s1, s2) for (i, s1) in enumerate(state_set) for s2 in state_set[i+1:]} @@ -333,8 +334,8 @@ class RegexpDFA: ctrl = False for (s1, s2) in equivalents.copy(): for c in alphabet: - t1 = self.rules.get((s1, c), (-2,)) - t2 = self.rules.get((s2, c), (-2,)) + t1 = self.rules.get((s1, c), -2) + t2 = self.rules.get((s2, c), -2) key = (min(t1, t2), max(t1, t2)) if t1 != t2 and key not in equivalents: equivalents.remove((s1, s2)) @@ -359,7 +360,7 @@ class RegexpDFA: if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]: + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"]: print("#", pattern) try: r = RegexpDFA.create(pattern).reduce().normalize() diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ use regexp::Regexp; fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"] { + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"] { println!("# {pattern}"); let r = match Regexp::new(&pattern.to_string()) { Ok(r1) => r1.determinize().reduce().normalize(), diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -7,28 +7,11 @@ use token::parse; const START: i32 = -1; const FAIL: i32 = -2; -fn encode_set(set: &HashSet) -> i32 { - let mut res = 0; - for x in set.iter() { - assert!(*x >= 0); - res ^= 1< HashSet { - if x == START {return HashSet::from([START]);} - - let mut x = x; - let mut res: HashSet = HashSet::new(); - - while x > 0 { - let y = x.trailing_zeros(); - res.insert(y as i32); - x ^= 1 << y; - } - - return res; +fn encode_set(set: &HashSet) -> String { + let mut v = Vec::from_iter(set.iter()); + v.sort(); + let res: Vec = v.into_iter().map(|x| x.to_string()).collect(); + return res.join(","); } #[derive(Debug)] @@ -93,11 +76,13 @@ impl Regexp { let mut end_states: HashSet = HashSet::new(); if self.end_states.contains(&START) {end_states.insert(START);} - let mut stack = Vec::from([START]); - let mut processed_states = HashSet::new(); + let mut index_new = HashMap::from([(START.to_string(), START)]); + let mut index_multi = HashMap::from([(START.to_string(), HashSet::from([START]))]); + let mut stack = Vec::from([START.to_string()]); + while !stack.is_empty() { - let state = stack.pop().unwrap(); - let multistate = decode_set(state); + let state_hash = stack.pop().unwrap(); + let multistate = &index_multi[&state_hash]; let mut new_rules: HashMap> = HashMap::new(); for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) { @@ -111,14 +96,17 @@ impl Regexp { } for (c, target_set) in new_rules.into_iter() { - let encoded_target = encode_set(&target_set); - rules.insert((state, c), encoded_target); - if target_set.iter().any(|st| self.end_states.contains(st)) { - end_states.insert(encoded_target); + let target_hash = encode_set(&target_set); + let is_end = target_set.iter().any(|st| self.end_states.contains(st)); + if !index_new.contains_key(&target_hash) { + let target_new = index_new.len() as i32; + index_new.insert(target_hash.clone(), target_new); + index_multi.insert(target_hash.clone(), target_set); + stack.push(target_hash.clone()); } - if !processed_states.contains(&encoded_target) { - stack.push(encoded_target); - processed_states.insert(encoded_target); + rules.insert((index_new[&state_hash], c), index_new[&target_hash]); + if is_end { + end_states.insert(index_new[&target_hash]); } } }