# HG changeset patch # User Laman # Date 2024-07-02 23:16:28 # Node ID f03565ba6137f91c9da3daf5756cb29248138f13 # Parent 4eb3ab63f9dd977e87cebb15753789eef71b841e find_equivalent_states implemented with Hopcroft's n log n algorithm diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -318,8 +318,8 @@ class RegexpDFA: return st in self.end_states def reduce(self): - equivalents = self._find_equivalent_states() - (rules, end_states) = self._collapse_states(equivalents) + partition = self._find_equivalent_states() + (rules, end_states) = self._collapse_states(partition) return RegexpDFA(rules, end_states, self.alphabet_index) @@ -368,45 +368,70 @@ class RegexpDFA: def _find_equivalent_states(self): n = len(self.alphabet_index) - state_list = list(range(len(self.rules) // n)) - equivalents = {(s1, s2) for (i, s1) in enumerate(state_list) for s2 in state_list[i+1:] if (s1 in self.end_states) == (s2 in self.end_states)} + m = len(self.rules) // n + inverse_rules = [set() for i in range(m*n)] + + for i in range(m): + for j in range(n): + target = self.rules[i*n + j] + inverse_rules[target*n + j].add(i) + + set_bag = [self.end_states, set(range(m))-self.end_states] + res = {0, 1} + work = {0, 1} + + while work: + key = work.pop() + target_set = set_bag[key] + for j in range(n): + source_set = set(itertools.chain.from_iterable(inverse_rules[t*n + j] for t in target_set)) + for k in res.copy(): + part = set_bag[k] + intersection = part & source_set + diff = part - source_set + if not intersection or not diff: + continue + res.remove(k) + ki = len(set_bag) + set_bag.append(intersection) + res.add(ki) + kd = len(set_bag) + set_bag.append(diff) + res.add(kd) + if k in work: + work.remove(k) + work.add(ki) + work.add(kd) + elif len(intersection) < len(diff): + work.add(ki) + else: + work.add(kd) - ctrl = True - while ctrl: - ctrl = False - for (s1, s2) in equivalents.copy(): - for ci in range(n): - t1 = self.rules[s1*n + ci] - t2 = self.rules[s2*n + ci] - key = (min(t1, t2), max(t1, t2)) - if t1 != t2 and key not in equivalents: - equivalents.remove((s1, s2)) - ctrl = True - break - - return equivalents + return [set_bag[k] for k in res] - def _collapse_states(self, equivalents): + def _collapse_states(self, partition): n = len(self.alphabet_index) rules = [] eq_mapping = dict() - for (s1, s2) in equivalents: - eq_mapping[s2] = min(s1, eq_mapping.get(s2, math.inf)) + for eq_set in partition: + states = sorted(eq_set) + for st in states: + eq_mapping[st] = states[0] discard_mapping = dict() discard_count = 0 for i in range(0, len(self.rules), n): si = i//n - if si in eq_mapping: + if eq_mapping[si] != si: discard_count += 1 continue discard_mapping[si] = si - discard_count - rules.extend(map(lambda st: eq_mapping.get(st, st), self.rules[i:i+n])) + rules.extend(map(lambda st: eq_mapping[st], self.rules[i:i+n])) rules = [discard_mapping[st] for st in rules] - end_states = {discard_mapping[eq_mapping.get(st, st)] for st in self.end_states} + end_states = {discard_mapping[eq_mapping[st]] for st in self.end_states} return (rules, end_states) @@ -452,7 +477,7 @@ class RegexpDFA: def test(): tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"]: + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]: print("#", pattern) try: r = RegexpDFA.create(pattern).reduce().normalize() diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ use regexp::Regexp; fn test() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"] { + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"] { println!("# {pattern}"); let r = match Regexp::new(&pattern.to_string()) { Ok(r1) => r1.determinize().reduce().normalize(), diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -168,8 +168,8 @@ impl RegexpDFA { } pub fn reduce(&self) -> RegexpDFA { - let equivalents = self.find_equivalent_states(); - return self.collapse_states(equivalents); + let partition = self.find_equivalent_states(); + return self.collapse_states(partition); } pub fn normalize(&self) -> RegexpDFA { @@ -230,42 +230,74 @@ impl RegexpDFA { panic!(); } - fn find_equivalent_states(&self) -> Vec<(usize, usize)> { + fn find_equivalent_states(&self) -> Vec> { let n = self.alphabet_index.len(); - let state_vec: Vec = (0..self.rules.len()/n).collect(); - let mut equivalents = HashSet::new(); - state_vec.iter().enumerate().for_each(|(i, s1)| { - equivalents.extend( - state_vec[i+1..].iter() - .filter(|s2| !(self.end_states.contains(s1)^self.end_states.contains(s2))) - .map(|s2| (*s1, *s2)) - ); - }); + let m = self.rules.len() / n; + let mut inverse_rules = vec![HashSet::::new();m*n]; - let mut m = usize::MAX; - while equivalents.len() < m { - m = equivalents.len(); - equivalents = equivalents.iter().filter(|(s1, s2)| { - !(0..n).any(|ci| { - let t1 = self.rules[s1*n + ci]; - let t2 = self.rules[s2*n + ci]; - let key = (t1.min(t2), t2.max(t1)); - return t1 != t2 && !equivalents.contains(&key); - }) - }).copied().collect(); + for i in 0..m { + for j in 0..n { + let target = self.rules[i*n + j]; + inverse_rules[target*n + j].insert(i); + } } - return Vec::from_iter(equivalents.into_iter()); + let mut set_bag = vec![ + self.end_states.clone(), + HashSet::from_iter(0..m).difference(&self.end_states).copied().collect() + ]; + let mut res = HashSet::::from([0, 1]); + let mut work = HashSet::::from([0, 1]); + + while !work.is_empty() { + let key = *work.iter().next().unwrap(); + work.remove(&key); + let target_set = set_bag[key].clone(); + for j in 0..n { + let source_set = HashSet::::from_iter(target_set.iter().flat_map(|&t| inverse_rules[t*n+j].iter()).copied()); + for k in res.clone().into_iter() { + let part = &set_bag[k]; + let intersection = part.intersection(&source_set).copied().collect::>(); + let diff = part.difference(&source_set).copied().collect::>(); + if intersection.is_empty() || diff.is_empty() { + continue; + } + res.remove(&k); + let ki = set_bag.len(); + set_bag.push(intersection); + res.insert(ki); + let kd = set_bag.len(); + set_bag.push(diff); + res.insert(kd); + if work.contains(&k) { + work.remove(&k); + work.insert(ki); + work.insert(kd); + } else if set_bag[ki].len() < set_bag[kd].len() { + work.insert(ki); + } else { + work.insert(kd); + } + } + } + } + + return Vec::from_iter(res.into_iter().map(|k| std::mem::take(&mut set_bag[k]))); } - fn collapse_states(&self, equivalents: Vec<(usize, usize)>) -> RegexpDFA { + fn collapse_states(&self, partition: Vec>) -> RegexpDFA { let n = self.alphabet_index.len(); let m = self.rules.len()/n; - let mut rules = Vec::new(); + let mut rules = vec![]; let mut eq_mapping: Vec = ((0..m)).collect(); - for (s1, s2) in equivalents.into_iter() { - eq_mapping[s2] = eq_mapping[s2].min(s1); + for eq_set in partition.into_iter() { + let mut eq_vec = Vec::from_iter(eq_set.into_iter()); + eq_vec.sort(); + let label = eq_vec[0]; + for st in eq_vec.into_iter() { + eq_mapping[st] = label; + } } let mut discard_mapping: Vec = ((0..m)).collect();