diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -141,8 +141,8 @@ def parse(pattern, offset=0): class Regexp: - def __init__(self, s): - (self.rules, self.end_states) = self._parse(s) + def __init__(self, pattern): + (self.rules, self.end_states) = self._parse(pattern) def _parse(self, s): r = parse(s) @@ -181,12 +181,56 @@ class Regexp: return any(st in self.end_states for st in current) + def determinize(self): + rules = dict() + end_states = {(-1,)} if -1 in self.end_states else set() + + stack = [(-1,)] + processed_states = set() + while stack: + multistate = stack.pop() + new_rules = dict() + + for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()): + if c not in new_rules: + new_rules[c] = set() + new_rules[c].update(target) + + for (c, target_set) in new_rules.items(): + new_target = tuple(sorted(target_set)) + rules[(multistate, c)] = new_target + if any(st in self.end_states for st in new_target): + end_states.add(new_target) + if new_target not in processed_states: + stack.append(new_target) + processed_states.add(new_target) + + return (rules, end_states) + + +class RegexpDFA: + def __init__(self, pattern): + r = Regexp(pattern) + (self.rules, self.end_states) = r.determinize() + + def match(self, s): + st = (-1,) + + for c in s: + key = (st, c) + if key in self.rules: + st = self.rules[key] + else: + return False + + return st in self.end_states + if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"]: print("#", pattern) - r = Regexp(pattern) + r = RegexpDFA(pattern) for t in tests: print(t, r.match(t)) print()