from abc import abstractmethod from collections import deque class ParsingError(Exception): pass class Token: is_skippable = False @abstractmethod def list_first(self): pass @abstractmethod def list_last(self): pass @abstractmethod def list_neighbours(self): pass class Lambda(Token): is_skippable = True def list_first(self): yield from [] def list_last(self): yield from [] def list_neighbours(self): yield from [] class Symbol(Token): def __init__(self, position, value): self.position = position self.value = value def list_first(self): yield self.position def list_last(self): yield self.position def list_neighbours(self): yield from [] def __str__(self): return self.value class Asterisk(Token): is_skippable = True def __init__(self, content: Token): self.content = content def list_first(self): yield from self.content.list_first() def list_last(self): yield from self.content.list_last() def list_neighbours(self): yield from self.content.list_neighbours() for x in self.list_last(): for y in self.list_first(): yield (x, y) def __str__(self): return str(self.content) + "*" class Alternative(Token): def __init__(self, content: list): self.variants = [] subsequence = [] for token in content: if isinstance(token, AlternativeSeparator): if not subsequence: raise ParsingError("Found an empty Alternative variant.") self.variants.append(Chain(subsequence)) subsequence = [] else: subsequence.append(token) if not subsequence: raise ParsingError("Found an empty Alternative variant.") self.variants.append(Chain(subsequence)) def list_first(self): for x in self.variants: yield from x.list_first() def list_last(self): for x in self.variants: yield from x.list_last() def list_neighbours(self): for x in self.variants: yield from x.list_neighbours() @property def is_skippable(self): return any(x.is_skippable for x in self.variants) class AlternativeSeparator: pass class Chain(Token): def __init__(self, content: list): self.content = content def list_first(self): for token in self.content: yield from token.list_first() if not token.is_skippable: break def list_last(self): for token in reversed(self.content): yield from token.list_last() if not token.is_skippable: break def list_neighbours(self): previous = [] for token in self.content: for t in previous: for x in t.list_last(): for y in token.list_first(): yield (x, y) yield from token.list_neighbours() if token.is_skippable: previous.append(token) else: previous = [token] @property def is_skippable(self): return all(x.is_skippable for x in self.content) def __str__(self): return "(" + "".join(str(x) for x in self.content) + ")" def find_closing_parenthesis(pattern, k): counter = 0 for (i, c) in enumerate(pattern[k:]): if c == "(": counter += 1 elif c == ")": counter -= 1 if counter == 0: return k+i raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}') def parse(pattern, offset=0): res = [] is_alternative = False i = 0 while i < len(pattern): c = pattern[i] if c == "(": j = find_closing_parenthesis(pattern, i) inner_content = parse(pattern[i+1:j], offset+i+1) res.append(inner_content) i = j+1 elif c == "*": try: token = res.pop() except IndexError as e: raise ParsingError(f'The asterisk operator is missing an argument. Pattern: "{pattern}", position {i}') res.append(Asterisk(token)) i += 1 elif c == ")": raise ParsingError(f'An opening parenthesis not found. Pattern: "{pattern}", position: {i}') elif c == "|" or c == "+": is_alternative = True res.append(AlternativeSeparator()) i += 1 elif c == "_": res.append(Lambda()) i += 1 else: res.append(Symbol(i+offset, c)) i += 1 if is_alternative: return Alternative(res) else: return Chain(res) class Regexp: def __init__(self, pattern): (self.rules, self.end_states) = self._parse(pattern) def _parse(self, s): r = parse(s) rules = dict() for i in r.list_first(): c = s[i] key = (-1, c) if key not in rules: rules[key] = set() rules[key].add(i) for (i, j) in r.list_neighbours(): c = s[j] key = (i, c) if key not in rules: rules[key] = set() rules[key].add(j) end_states = set(r.list_last()) if r.is_skippable: end_states.add(-1) return rules, end_states def match(self, s): current = {-1} for c in s: new_state = set() for st in current: key = (st, c) if key in self.rules: new_state.update(self.rules[key]) current = new_state return any(st in self.end_states for st in current) def determinize(self): rules = dict() end_states = {(-1,)} if -1 in self.end_states else set() stack = [(-1,)] processed_states = set() while stack: multistate = stack.pop() new_rules = dict() for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()): if c not in new_rules: new_rules[c] = set() new_rules[c].update(target) for (c, target_set) in new_rules.items(): new_target = tuple(sorted(target_set)) rules[(multistate, c)] = new_target if any(st in self.end_states for st in new_target): end_states.add(new_target) if new_target not in processed_states: stack.append(new_target) processed_states.add(new_target) return (rules, end_states) class RegexpDFA: def __init__(self, rules, end_states): self.rules = rules self.end_states = end_states @classmethod def create(cls, pattern): r = Regexp(pattern) (rules, end_states) = r.determinize() return cls(rules, end_states) def match(self, s): st = 0 for c in s: key = (st, c) if key in self.rules: st = self.rules[key] else: return False return st in self.end_states def reduce(self): equivalents = self._find_equivalent_states() (rules, end_states) = self._collapse_states(equivalents) return RegexpDFA(rules, end_states) def normalize(self): index = {(-1,): 0} queue = deque([(-1,)]) while queue: state = queue.popleft() edges = [((st, c), t) for ((st, c), t) in self.rules.items() if st == state] edges.sort() for ((st, c), t) in edges: if t not in index: index[t] = len(index) queue.append(t) rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items()) end_states = {index[st] for st in self.end_states} return RegexpDFA(rules, end_states) def _find_equivalent_states(self): state_set = [(-2,), (-1,)] + sorted(set(self.rules.values())) alphabet = {c for (st, c) in self.rules.keys()} equivalents = {(s1, s2) for (i, s1) in enumerate(state_set) for s2 in state_set[i+1:]} for (s1, s2) in equivalents.copy(): if (s1 in self.end_states and s2 not in self.end_states) or (s1 not in self.end_states and s2 in self.end_states): equivalents.remove((s1, s2)) ctrl = True while ctrl: ctrl = False for (s1, s2) in equivalents.copy(): for c in alphabet: t1 = self.rules.get((s1, c), (-2,)) t2 = self.rules.get((s2, c), (-2,)) key = (min(t1, t2), max(t1, t2)) if t1 != t2 and key not in equivalents: equivalents.remove((s1, s2)) ctrl = True break return equivalents def _collapse_states(self, equivalents): rules = self.rules.items() end_states = self.end_states.copy() for (s1, s2) in sorted(equivalents): rules = map( lambda item: (item[0], s1 if item[1] == s2 else item[1]), filter(lambda item: item[0][0] != s2, rules) ) end_states.discard(s2) return (dict(rules), end_states) if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]: print("#", pattern) try: r = RegexpDFA.create(pattern).reduce().normalize() except ParsingError as e: print("Failed to parse the regexp:") print(e) continue for t in tests: print(t, r.match(t)) print()