diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -1,4 +1,5 @@ from abc import abstractmethod +from collections import deque class ParsingError(Exception): @@ -283,7 +284,7 @@ class RegexpDFA: return cls(rules, end_states) def match(self, s): - st = (-1,) + st = 0 for c in s: key = (st, c) @@ -300,6 +301,24 @@ class RegexpDFA: return RegexpDFA(rules, end_states) + def normalize(self): + index = {(-1,): 0} + queue = deque([(-1,)]) + + while queue: + state = queue.popleft() + edges = [((st, c), t) for ((st, c), t) in self.rules.items() if st == state] + edges.sort() + for ((st, c), t) in edges: + if t not in index: + index[t] = len(index) + queue.append(t) + + rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) + end_states = {index[st] for st in self.end_states} + + return RegexpDFA(rules, end_states) + def _find_equivalent_states(self): state_set = [(-2,), (-1,)] + sorted(set(self.rules.values())) alphabet = {c for (st, c) in self.rules.keys()} @@ -340,10 +359,10 @@ class RegexpDFA: if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] - for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*"]: + for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]: print("#", pattern) try: - r = RegexpDFA.create(pattern).reduce() + r = RegexpDFA.create(pattern).reduce().normalize() except ParsingError as e: print("Failed to parse the regexp:") print(e)