diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -1,3 +1,5 @@ +import math +import itertools from abc import abstractmethod from collections import deque @@ -205,21 +207,21 @@ def parse(pattern, offset=0): class Regexp: def __init__(self, pattern): - (self.rules, self.end_states) = self._parse(pattern) - - def _parse(self, s): - r = parse(s) + r = parse(pattern) rules = dict() + alphabet = set() for i in r.list_first(): - c = s[i] + c = pattern[i] + alphabet.add(c) key = (-1, c) if key not in rules: rules[key] = set() rules[key].add(i) for (i, j) in r.list_neighbours(): - c = s[j] + c = pattern[j] + alphabet.add(c) key = (i, c) if key not in rules: rules[key] = set() @@ -229,7 +231,9 @@ class Regexp: if r.is_skippable: end_states.add(-1) - return rules, end_states + self.rules = rules + self.end_states = end_states + self.alphabet = alphabet def match(self, s): current = {-1} @@ -245,10 +249,12 @@ class Regexp: return any(st in self.end_states for st in current) def determinize(self): - rules = dict() - end_states = {-1} if -1 in self.end_states else set() + alphabet_index = {c: i for (i, c) in enumerate(sorted(self.alphabet))} + n = len(alphabet_index) + compact_rules = [-1] * n + end_states = {0} if -1 in self.end_states else set() - index = {(-1,): -1} + index = {(-1,): 0} stack = [(-1,)] while stack: multistate = stack.pop() @@ -262,37 +268,39 @@ class Regexp: for (c, target_set) in new_rules.items(): target_tup = tuple(sorted(target_set)) if target_tup not in index: - new_target = len(index)-1 + new_target = len(index) index[target_tup] = new_target + compact_rules.extend([-1] * n) stack.append(target_tup) - rules[(index[multistate], c)] = index[target_tup] + compact_rules[index[multistate]*n + alphabet_index[c]] = index[target_tup] if any(st in self.end_states for st in target_set): end_states.add(index[target_tup]) - return (rules, end_states) + return (compact_rules, end_states, alphabet_index) class RegexpDFA: - def __init__(self, rules, end_states): + def __init__(self, rules, end_states, alphabet_index): self.rules = rules self.end_states = end_states + self.alphabet_index = alphabet_index @classmethod def create(cls, pattern): r = Regexp(pattern) - (rules, end_states) = r.determinize() + (rules, end_states, alphabet_index) = r.determinize() - return cls(rules, end_states) + return cls(rules, end_states, alphabet_index) def match(self, s): - st = -1 + st = 0 + n = len(self.alphabet_index) for c in s: - key = (st, c) - if key in self.rules: - st = self.rules[key] - else: + if c not in self.alphabet_index or st < 0: return False + key = (st*n + self.alphabet_index[c]) + st = self.rules[key] return st in self.end_states @@ -300,42 +308,40 @@ class RegexpDFA: equivalents = self._find_equivalent_states() (rules, end_states) = self._collapse_states(equivalents) - return RegexpDFA(rules, end_states) + return RegexpDFA(rules, end_states, self.alphabet_index) def normalize(self): - index = {-1: -1} - queue = deque([-1]) + n = len(self.alphabet_index) + index = {-1: -1, 0: 0} + queue = deque([0]) + + rules = [] while queue: - state = queue.popleft() - edges = [((st, c), t) for ((st, c), t) in self.rules.items() if st == state] - edges.sort() - for ((st, c), t) in edges: - if t not in index: - index[t] = len(index)-1 - queue.append(t) + si = queue.popleft() + row = self.rules[si*n:(si+1)*n] + for sj in row: + if sj not in index: + index[sj] = len(index)-1 + queue.append(sj) + rules.extend(index[sj] for sj in row) - rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) - end_states = {index[st] for st in self.end_states} + end_states = {index[si] for si in self.end_states} - return RegexpDFA(rules, end_states) + return RegexpDFA(rules, end_states, self.alphabet_index) def _find_equivalent_states(self): - state_set = [-2, -1] + sorted(set(self.rules.values())) - alphabet = {c for (st, c) in self.rules.keys()} - equivalents = {(s1, s2) for (i, s1) in enumerate(state_set) for s2 in state_set[i+1:]} - - for (s1, s2) in equivalents.copy(): - if (s1 in self.end_states and s2 not in self.end_states) or (s1 not in self.end_states and s2 in self.end_states): - equivalents.remove((s1, s2)) + n = len(self.alphabet_index) + state_list = list(range(len(self.rules) // n)) + equivalents = {(s1, s2) for (i, s1) in enumerate(state_list) for s2 in state_list[i+1:] if (s1 in self.end_states) == (s2 in self.end_states)} ctrl = True while ctrl: ctrl = False for (s1, s2) in equivalents.copy(): - for c in alphabet: - t1 = self.rules.get((s1, c), -2) - t2 = self.rules.get((s2, c), -2) + for ci in range(n): + t1 = self.rules[s1*n + ci] + t2 = self.rules[s2*n + ci] key = (min(t1, t2), max(t1, t2)) if t1 != t2 and key not in equivalents: equivalents.remove((s1, s2)) @@ -345,17 +351,28 @@ class RegexpDFA: return equivalents def _collapse_states(self, equivalents): - rules = self.rules.items() - end_states = self.end_states.copy() + n = len(self.alphabet_index) + rules = [] + + eq_mapping = dict() + for (s1, s2) in equivalents: + eq_mapping[s2] = min(s1, eq_mapping.get(s2, math.inf)) + + discard_mapping = {-1: -1} + discard_count = 0 - for (s1, s2) in sorted(equivalents): - rules = map( - lambda item: (item[0], s1 if item[1] == s2 else item[1]), - filter(lambda item: item[0][0] != s2, rules) - ) - end_states.discard(s2) + for i in range(0, len(self.rules), n): + si = i//n + if si in eq_mapping: + discard_count += 1 + continue + discard_mapping[si] = si - discard_count + rules.extend(map(lambda st: eq_mapping.get(st, st), self.rules[i:i+n])) - return (dict(rules), end_states) + rules = [discard_mapping[st] for st in rules] + end_states = {discard_mapping[eq_mapping.get(st, st)] for st in self.end_states} + + return (rules, end_states) if __name__ == "__main__":