import math import itertools import argparse from abc import abstractmethod from collections import deque from typing import Iterator, Optional class ParsingError(Exception): pass class Token: """Abstract base class representing a single item of a regular expressions.""" # is the Token mandatory, or can it be omitted is_skippable = False @abstractmethod def list_first(self) -> Iterator[int]: """List all possible string positions the token can start with.""" pass @abstractmethod def list_last(self) -> Iterator[int]: """List all possible string positions the token can end with.""" pass @abstractmethod def list_neighbours(self) -> Iterator[tuple[int, int]]: """List positions of all possibly neighbouring subtokens.""" pass class Lambda(Token): """An empty string, useful as an `Alternative`.""" is_skippable = True def list_first(self): yield from [] def list_last(self): yield from [] def list_neighbours(self): yield from [] class Symbol(Token): """A regular letter or any other symbol without a special function.""" def __init__(self, position: int, value: str): self.position = position self.value = value def list_first(self): yield self.position def list_last(self): yield self.position def list_neighbours(self): yield from [] def __str__(self): return self.value class Asterisk(Token): """A unary operator specifying its content to occur zero or more times.""" is_skippable = True def __init__(self, content: Token): self.content = content def list_first(self): yield from self.content.list_first() def list_last(self): yield from self.content.list_last() def list_neighbours(self): yield from self.content.list_neighbours() for x in self.list_last(): for y in self.list_first(): yield (x, y) def __str__(self): return str(self.content) + "*" class Alternative(Token): """An operator with a variable number of arguments, specifying exchangeable alternatives.""" def __init__(self, content: list[Token]): """:raises ParsingError: raised on an empty variant. That is an AlternativeSeparator surrounded by no other Tokens.""" self.variants = [] subsequence = [] for token in content: if isinstance(token, AlternativeSeparator): if not subsequence: raise ParsingError("Found an empty Alternative variant.") self.variants.append(Chain(subsequence)) subsequence = [] else: subsequence.append(token) if not subsequence: raise ParsingError("Found an empty Alternative variant.") self.variants.append(Chain(subsequence)) def list_first(self): for x in self.variants: yield from x.list_first() def list_last(self): for x in self.variants: yield from x.list_last() def list_neighbours(self): for x in self.variants: yield from x.list_neighbours() @property def is_skippable(self): return any(x.is_skippable for x in self.variants) class AlternativeSeparator: """A special token to temporarily separate Alternative variants. Removed in the Alternative constructor.""" pass class Chain(Token): """An operator expressing a concatenation of its content Tokens.""" def __init__(self, content: list[Token]): self.content = content def list_first(self): for token in self.content: yield from token.list_first() if not token.is_skippable: break def list_last(self): for token in reversed(self.content): yield from token.list_last() if not token.is_skippable: break def list_neighbours(self): previous = [] for token in self.content: for t in previous: for x in t.list_last(): for y in token.list_first(): yield (x, y) yield from token.list_neighbours() if token.is_skippable: previous.append(token) else: previous = [token] @property def is_skippable(self): return all(x.is_skippable for x in self.content) def __str__(self): return "(" + "".join(str(x) for x in self.content) + ")" def find_closing_parenthesis(pattern: str, pos: int) -> int: """Given an opening parenthesis, find the closing one. :param pattern: the regexp pattern :param pos: the opening parenthesis position :return: a position of the matching closing parenthesis :raises AssertionError: on an incorrect call, if there is not an opening parenthesis at `pos` :raises ParsingError: on a malformed pattern with no matching parenthesis""" assert pattern[pos] == "(" counter = 0 for (i, c) in enumerate(pattern[pos:]): if c == "(": counter += 1 elif c == ")": counter -= 1 if counter == 0: return pos+i raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {pos}') def parse(pattern: str, offset: int=0) -> Token: """Recursively parse the pattern into a Token tree. :param pattern: the regexp string :param offset: where the `pattern` starts in the original pattern, to record correct global token positions in the subcalls :return: a token sequence, wrapped in a Chain or an Alternative""" res = [] is_alternative = False i = 0 while i < len(pattern): c = pattern[i] if c == "(": j = find_closing_parenthesis(pattern, i) inner_content = parse(pattern[i+1:j], offset+i+1) res.append(inner_content) i = j+1 elif c == "*": try: token = res.pop() except IndexError as e: raise ParsingError(f'The asterisk operator is missing an argument. Pattern: "{pattern}", position {i}') res.append(Asterisk(token)) i += 1 elif c == ")": raise ParsingError(f'An opening parenthesis not found. Pattern: "{pattern}", position: {i}') elif c == "|" or c == "+": is_alternative = True res.append(AlternativeSeparator()) i += 1 elif c == "_": res.append(Lambda()) i += 1 else: res.append(Symbol(i+offset, c)) i += 1 if is_alternative: return Alternative(res) else: return Chain(res) def print_dfa(dfa: "RegexpDFA", label: str=""): """Utility function for printing automatons in a readable format.""" n = len(dfa.alphabet_index) print(label) for i in range(0, len(dfa.rules), n): print(i//n, dfa.rules[i:i+n]) print(dfa.end_states) class Regexp: def __init__(self, pattern: str): """Parse a pattern string into a sequence of Tokens and build a non-deterministic finite automaton from them.""" r = parse(pattern) # (state, symbol): {state1, ...} rules = dict() alphabet = set() # record possible starting symbols for i in r.list_first(): c = pattern[i] alphabet.add(c) key = (-1, c) if key not in rules: rules[key] = set() rules[key].add(i) # record all pairs of symbols that can immediately follow each other for (i, j) in r.list_neighbours(): c = pattern[j] alphabet.add(c) key = (i, c) if key not in rules: rules[key] = set() rules[key].add(j) # record possible last symbols and mark them as accepting states end_states = set(r.list_last()) if r.is_skippable: end_states.add(-1) self.rules = rules self.end_states = end_states self.alphabet = alphabet def match(self, s: str) -> bool: """Decide if a string matches the regexp. :param s: an input string""" current = {-1} for c in s: new_state = set() for st in current: key = (st, c) if key in self.rules: new_state.update(self.rules[key]) current = new_state return any(st in self.end_states for st in current) def determinize(self) -> "RegexpDFA": """Convert the non-deterministic finite automaton into a deterministic one.""" alphabet_index = {c: i for (i, c) in enumerate(sorted(self.alphabet))} n = len(alphabet_index) compact_rules = [-1] * n end_states = {0} if -1 in self.end_states else set() index = {(-1,): 0} stack = [(-1,)] # explore all possible state sets the NFA can visit while stack: multistate = stack.pop() new_rules = dict() # collect possible target states for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()): if c not in new_rules: new_rules[c] = set() new_rules[c].update(target) # map the state sets to integer labels and record them in a single dimensional rules list for (c, target_set) in new_rules.items(): target_tup = tuple(sorted(target_set)) if target_tup not in index: new_target = len(index) index[target_tup] = new_target compact_rules.extend([-1] * n) stack.append(target_tup) compact_rules[index[multistate]*n + alphabet_index[c]] = index[target_tup] if any(st in self.end_states for st in target_set): end_states.add(index[target_tup]) # create an additional fail state # and redirect all undefined transition rules to it fail = len(index) compact_rules = [(st if st >= 0 else fail) for st in compact_rules] compact_rules.extend([fail] * n) return RegexpDFA(compact_rules, end_states, alphabet_index) class RegexpDFA: def __init__(self, rules: list[int], end_states: set[int], alphabet_index: dict[str, int]): """A deterministic finite automaton constructor. If the `rules` or the `alphabet_index` are empty, then instead of an empty automaton we create a trivial automaton failing on any input (accepting empty strings). :param rules: transition rules, where (i*n+j)th item defines the transition for i-th state on j-th alphabet symbol :param end_states: accepting states :param alphabet_index: mapping from alphabet symbols to rule columns""" self.rules = rules or [1, 1] self.end_states = end_states self.alphabet_index = alphabet_index or {"": 0} @staticmethod def create(pattern: str) -> "RegexpDFA": """Create a `Regexp` and determinize it.""" r = Regexp(pattern) return r.determinize() def match(self, s: str): """Decide if a string matches the regexp. :param s: an input string""" st = 0 n = len(self.alphabet_index) for c in s: if c not in self.alphabet_index: return False key = (st*n + self.alphabet_index[c]) st = self.rules[key] return st in self.end_states def reduce(self) -> "RegexpDFA": """Minimize the automaton by collapsing equivalent states.""" partition = self._find_equivalent_states() (rules, end_states) = self._collapse_states(partition) return RegexpDFA(rules, end_states, self.alphabet_index) def normalize(self) -> "RegexpDFA": """Normalize the automaton by relabeling the states in a breadth first search walkthrough order and remove unreachable states.""" n = len(self.alphabet_index) index = {0: 0} queue = deque([0]) rules = [] while queue: si = queue.popleft() row = self.rules[si*n:(si+1)*n] for sj in row: if sj not in index: index[sj] = len(index) queue.append(sj) rules.extend(index[sj] for sj in row) end_states = {index[si] for si in self.end_states if si in index} return RegexpDFA(rules, end_states, self.alphabet_index) def find_distinguishing_string(self, r: "RegexpDFA") -> Optional[str]: """Find the shortest string that is accepted by self or `r`, but not both. :param r: the other automaton :return: the distinguishing string, or None if the automatons are equivalent""" r1 = self.reduce().normalize() r2 = r.reduce().normalize() if r1.rules == r2.rules and r1.end_states == r2.end_states: return None r1 = r1._expand_alphabet(r2.alphabet_index) r2 = r2._expand_alphabet(r1.alphabet_index) product = r1._build_product_automaton(r2) n = len(product.alphabet_index) # state: symbol inverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()} queue = deque([(0, "")]) visited = {0} while queue: (state, acc) = queue.popleft() if state in product.end_states: return acc for (i, target) in enumerate(product.rules[state*n:(state+1)*n]): if target not in visited: queue.append((target, acc+inverse_alphabet_index[i])) visited.add(target) assert False def _find_equivalent_states(self) -> list[set[int]]: """Partition states into their equivalence classes with Hopcroft's algorithm. :return: sets of equivalent states. Unique states get single item sets.""" n = len(self.alphabet_index) m = len(self.rules) // n inverse_rules = [set() for i in range(m*n)] # a transition rules inverse. target state + alphabet symbol -> source states for i in range(m): for j in range(n): target = self.rules[i*n + j] inverse_rules[target*n + j].add(i) set_bag = [self.end_states, set(range(m))-self.end_states] res = {0, 1} work = {0, 1} while work: key = work.pop() target_set = set_bag[key] for j in range(n): source_set = set(itertools.chain.from_iterable(inverse_rules[t*n + j] for t in target_set)) for k in res.copy(): part = set_bag[k] intersection = part & source_set diff = part - source_set if not intersection or not diff: continue res.remove(k) ki = len(set_bag) set_bag.append(intersection) res.add(ki) kd = len(set_bag) set_bag.append(diff) res.add(kd) if k in work: work.remove(k) work.add(ki) work.add(kd) elif len(intersection) < len(diff): work.add(ki) else: work.add(kd) return [set_bag[k] for k in res] def _collapse_states(self, partition: list[set[int]]) -> tuple[list[int], set[int]]: """Collapse equivalent states from each class into a single one. :param partition: list of equivalence classes :return: rules list, accepting states""" n = len(self.alphabet_index) rules = [] # states mapping due to the equivalents collapse eq_mapping = dict() for eq_set in partition: states = sorted(eq_set) for st in states: eq_mapping[st] = states[0] # states mapping to keep the rules list compact after the equivalents collapse discard_mapping = dict() discard_count = 0 for i in range(0, len(self.rules), n): si = i//n if eq_mapping[si] != si: discard_count += 1 continue discard_mapping[si] = si - discard_count rules.extend(map(lambda st: eq_mapping[st], self.rules[i:i+n])) rules = [discard_mapping[st] for st in rules] end_states = {discard_mapping[eq_mapping[st]] for st in self.end_states} return (rules, end_states) def _expand_alphabet(self, alphabet_index: dict[str, int]) -> "RegexpDFA": """Expand the automaton to accommodate union of `self.alphabet_index` and the provided `alphabet_index`. :param alphabet_index: a mapping from letters to rules columns :return: a RegexpDFA over the unified alphabet""" if alphabet_index == self.alphabet_index: return self n1 = len(self.alphabet_index) m = len(self.rules) // n1 combined_alphabet = set(self.alphabet_index.keys()) | set(alphabet_index.keys()) # a new alphabet_index combined_index = {c: i for (i, c) in enumerate(sorted(combined_alphabet))} # a new letter key: the old letter key conversion_index = {combined_index[k]: v for (k, v) in self.alphabet_index.items()} n2 = len(combined_alphabet) # rewrite the rules into a new table, filling blanks with a new fail state rules = [ self.rules[i*n1 + conversion_index[j]] if j in conversion_index else m for i in range(m) for j in range(n2) ] rules.extend([m]*n2) return RegexpDFA(rules, self.end_states, combined_index).reduce().normalize() def _build_product_automaton(self, r: "RegexpDFA") -> "RegexpDFA": """Create a new automaton, with a carthesian product of the arguments' states and corresponding transition rules. :param r: the other finite automaton. It must have the same `alphabet_index` as `self`""" assert self.alphabet_index == r.alphabet_index n = len(self.alphabet_index) m = len(r.rules) // n k = len(self.rules) // n rules = [] end_states = set() # expand each self state into m new states, one for each of the r states for s1 in range(k): row1 = self.rules[s1*n:(s1+1)*n] for s2 in range(m): row2 = r.rules[s2*n:(s2+1)*n] rules.extend([x*m + y for (x, y) in zip(row1, row2)]) if (s1 in self.end_states) != (s2 in r.end_states): end_states.add(s1*m + s2) return RegexpDFA(rules, end_states, self.alphabet_index).reduce().normalize() def test(): tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] for pattern in ["", "a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]: print("#", pattern) try: r = RegexpDFA.create(pattern).reduce().normalize() except ParsingError as e: print("Failed to parse the regexp:") print(e) continue for t in tests: print(t, r.match(t)) print() if __name__ == "__main__": parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() test_parser = subparsers.add_parser("test") test_parser.set_defaults(name="test") match_parser = subparsers.add_parser("match") match_parser.add_argument("pattern") match_parser.add_argument("string") match_parser.set_defaults(name="match") compare_parser = subparsers.add_parser("compare") compare_parser.add_argument("pattern1") compare_parser.add_argument("pattern2") compare_parser.set_defaults(name="compare") args = parser.parse_args() if args.name == "test": test() elif args.name == "match": try: r = RegexpDFA.create(args.pattern).reduce().normalize() except ParsingError as e: print("Failed to parse the regexp:") print(e) print(r.match(args.string)) elif args.name == "compare": r1 = RegexpDFA.create(args.pattern1).reduce().normalize() r2 = RegexpDFA.create(args.pattern2).reduce().normalize() print(r1.find_distinguishing_string(r2)) else: parser.print_help()