diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -246,10 +246,10 @@ class Regexp: def determinize(self): rules = dict() - end_states = {(-1,)} if -1 in self.end_states else set() + end_states = {-1} if -1 in self.end_states else set() + index = {(-1,): -1} stack = [(-1,)] - processed_states = set() while stack: multistate = stack.pop() new_rules = dict() @@ -260,13 +260,14 @@ class Regexp: new_rules[c].update(target) for (c, target_set) in new_rules.items(): - new_target = tuple(sorted(target_set)) - rules[(multistate, c)] = new_target - if any(st in self.end_states for st in new_target): - end_states.add(new_target) - if new_target not in processed_states: - stack.append(new_target) - processed_states.add(new_target) + target_tup = tuple(sorted(target_set)) + if target_tup not in index: + new_target = len(index)-1 + index[target_tup] = new_target + stack.append(target_tup) + rules[(index[multistate], c)] = index[target_tup] + if any(st in self.end_states for st in target_set): + end_states.add(index[target_tup]) return (rules, end_states) @@ -284,7 +285,7 @@ class RegexpDFA: return cls(rules, end_states) def match(self, s): - st = 0 + st = -1 for c in s: key = (st, c) @@ -302,8 +303,8 @@ class RegexpDFA: return RegexpDFA(rules, end_states) def normalize(self): - index = {(-1,): 0} - queue = deque([(-1,)]) + index = {-1: -1} + queue = deque([-1]) while queue: state = queue.popleft() @@ -311,7 +312,7 @@ class RegexpDFA: edges.sort() for ((st, c), t) in edges: if t not in index: - index[t] = len(index) + index[t] = len(index)-1 queue.append(t) rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) @@ -320,7 +321,7 @@ class RegexpDFA: return RegexpDFA(rules, end_states) def _find_equivalent_states(self): - state_set = [(-2,), (-1,)] + sorted(set(self.rules.values())) + state_set = [-2, -1] + sorted(set(self.rules.values())) alphabet = {c for (st, c) in self.rules.keys()} equivalents = {(s1, s2) for (i, s1) in enumerate(state_set) for s2 in state_set[i+1:]} @@ -333,8 +334,8 @@ class RegexpDFA: ctrl = False for (s1, s2) in equivalents.copy(): for c in alphabet: - t1 = self.rules.get((s1, c), (-2,)) - t2 = self.rules.get((s2, c), (-2,)) + t1 = self.rules.get((s1, c), -2) + t2 = self.rules.get((s2, c), -2) key = (min(t1, t2), max(t1, t2)) if t1 != t2 and key not in equivalents: equivalents.remove((s1, s2)) @@ -359,7 +360,7 @@ class RegexpDFA: if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]: + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"]: print("#", pattern) try: r = RegexpDFA.create(pattern).reduce().normalize()