diff --git a/regexp.py b/regexp.py
--- a/regexp.py
+++ b/regexp.py
@@ -3,6 +3,7 @@ import itertools
 import argparse
 from abc import abstractmethod
 from collections import deque
+from typing import Iterator, Optional
 
 
 class ParsingError(Exception):
@@ -10,22 +11,28 @@ class ParsingError(Exception):
 
 
 class Token:
+    """Abstract base class representing a single item of a regular expression."""
+    # is the Token mandatory, or can it be omitted?
     is_skippable = False
 
     @abstractmethod
-    def list_first(self):
+    def list_first(self) -> Iterator[int]:
+        """List all possible string positions the token can start with."""
         pass
 
     @abstractmethod
-    def list_last(self):
+    def list_last(self) -> Iterator[int]:
+        """List all possible string positions the token can end with."""
         pass
 
     @abstractmethod
-    def list_neighbours(self):
+    def list_neighbours(self) -> Iterator[tuple[int, int]]:
+        """List position pairs of all possibly neighbouring subtokens."""
        pass
 
 
 class Lambda(Token):
+    """An empty string, useful as an `Alternative` variant."""
     is_skippable = True
 
     def list_first(self):
@@ -39,7 +46,8 @@ class Lambda(Token):
 
 
 class Symbol(Token):
-    def __init__(self, position, value):
+    """A regular letter or any other symbol without a special function."""
+    def __init__(self, position: int, value: str):
         self.position = position
         self.value = value
@@ -57,6 +65,7 @@ class Symbol(Token):
 
 
 class Asterisk(Token):
+    """A unary operator specifying its content to occur zero or more times."""
     is_skippable = True
 
     def __init__(self, content: Token):
@@ -79,7 +88,9 @@ class Asterisk(Token):
 
 
 class Alternative(Token):
-    def __init__(self, content: list):
+    """An operator with a variable number of arguments, specifying exchangeable alternatives."""
+    def __init__(self, content: list[Token]):
+        """:raises ParsingError: raised on an empty variant, i.e. an AlternativeSeparator not surrounded by other Tokens."""
         self.variants = []
         subsequence = []
@@ -114,10 +125,12 @@
         return any(x.is_skippable for x in self.variants)
 
 
 class AlternativeSeparator:
+    """A special token to temporarily separate Alternative variants. Removed in the Alternative constructor."""
     pass
 
 
 class Chain(Token):
-    def __init__(self, content: list):
+    """An operator expressing a concatenation of its content Tokens."""
+    def __init__(self, content: list[Token]):
         self.content = content
 
     def list_first(self):
@@ -154,21 +167,34 @@
         return "(" + "".join(str(x) for x in self.content) + ")"
 
 
-def find_closing_parenthesis(pattern, k):
+def find_closing_parenthesis(pattern: str, pos: int) -> int:
+    """Given an opening parenthesis, find the closing one.
+
+    :param pattern: the regexp pattern
+    :param pos: the opening parenthesis position
+    :return: the position of the matching closing parenthesis
+    :raises AssertionError: on an incorrect call, if there is no opening parenthesis at `pos`
+    :raises ParsingError: on a malformed pattern with no matching closing parenthesis"""
+    assert pattern[pos] == "("
     counter = 0
 
-    for (i, c) in enumerate(pattern[k:]):
+    for (i, c) in enumerate(pattern[pos:]):
         if c == "(":
             counter += 1
         elif c == ")":
             counter -= 1
         if counter == 0:
-            return k+i
+            return pos+i
 
-    raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}')
+    raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {pos}')
 
 
-def parse(pattern, offset=0):
+def parse(pattern: str, offset: int = 0) -> Token:
+    """Recursively parse the pattern into a Token tree.
+
+    :param pattern: the regexp string
+    :param offset: where the `pattern` starts in the original pattern, to record correct global token positions in the subcalls
+    :return: a token sequence, wrapped in a Chain or an Alternative"""
     res = []
     is_alternative = False
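
As a quick sanity check of the parser above, here is what the Token tree exposes for a small pattern. This is an illustrative sketch, not part of the patch; the exact printed forms are examples only:

    tree = parse("a(b|c)*")                # a Chain of a Symbol and an Asterisk over an Alternative
    print(sorted(tree.list_first()))       # [0] -- a match can only start at the "a" (pattern position 0)
    print(sorted(tree.list_last()))        # [0, 2, 4] -- it can end at "a", "b" or "c"
    print(sorted(tree.list_neighbours()))  # pairs like (0, 2), (0, 4), (2, 2), ..., (4, 4)

These position sets are exactly what `Regexp.__init__` below consumes to build the Glushkov-style position automaton.
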
@@ -206,7 +232,8 @@ def parse(pattern, offset=0):
     return Chain(res)
 
 
-def print_dfa(dfa, label=""):
+def print_dfa(dfa: "RegexpDFA", label: str = ""):
+    """Utility function for printing automata in a readable format."""
     n = len(dfa.alphabet_index)
     print(label)
     for i in range(0, len(dfa.rules), n):
@@ -215,11 +242,14 @@
 class Regexp:
-    def __init__(self, pattern):
+    def __init__(self, pattern: str):
+        """Parse a pattern string into a sequence of Tokens and build a non-deterministic finite automaton from them."""
         r = parse(pattern)
+        # (state, symbol): {state1, ...}
         rules = dict()
         alphabet = set()
 
+        # record possible starting symbols
         for i in r.list_first():
             c = pattern[i]
             alphabet.add(c)
@@ -228,6 +258,7 @@
                 rules[key] = set()
             rules[key].add(i)
 
+        # record all pairs of symbols that can immediately follow each other
         for (i, j) in r.list_neighbours():
             c = pattern[j]
             alphabet.add(c)
@@ -236,6 +267,7 @@
                 rules[key] = set()
             rules[key].add(j)
 
+        # record possible last symbols and mark them as accepting states
         end_states = set(r.list_last())
         if r.is_skippable:
             end_states.add(-1)
@@ -244,7 +276,10 @@
         self.end_states = end_states
         self.alphabet = alphabet
 
-    def match(self, s):
+    def match(self, s: str) -> bool:
+        """Decide if a string matches the regexp.
+
+        :param s: an input string"""
         current = {-1}
 
         for c in s:
@@ -257,7 +292,8 @@
         return any(st in self.end_states for st in current)
 
-    def determinize(self):
+    def determinize(self) -> "RegexpDFA":
+        """Convert the non-deterministic finite automaton into a deterministic one."""
         alphabet_index = {c: i for (i, c) in enumerate(sorted(self.alphabet))}
         n = len(alphabet_index)
         compact_rules = [-1] * n
@@ -265,15 +301,18 @@
         index = {(-1,): 0}
         stack = [(-1,)]
 
+        # explore all possible state sets the NFA can visit
         while stack:
             multistate = stack.pop()
             new_rules = dict()
 
+            # collect possible target states
             for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
                 if c not in new_rules:
                     new_rules[c] = set()
                 new_rules[c].update(target)
 
+            # map the state sets to integer labels and record them in a one-dimensional rules list
             for (c, target_set) in new_rules.items():
                 target_tup = tuple(sorted(target_set))
                 if target_tup not in index:
@@ -285,27 +324,38 @@
             if any(st in self.end_states for st in target_set):
                 end_states.add(index[target_tup])
 
+        # create an additional fail state
+        # and redirect all undefined transition rules to it
         fail = len(index)
         compact_rules = [(st if st >= 0 else fail) for st in compact_rules]
         compact_rules.extend([fail] * n)
 
-        return (compact_rules, end_states, alphabet_index)
+        return RegexpDFA(compact_rules, end_states, alphabet_index)
 
 
 class RegexpDFA:
-    def __init__(self, rules, end_states, alphabet_index):
+    def __init__(self, rules: list[int], end_states: set[int], alphabet_index: dict[str, int]):
+        """A deterministic finite automaton constructor.
+
+        If the `rules` or the `alphabet_index` are empty, then instead of an empty automaton we create a trivial one, rejecting any non-empty input (accepting at most the empty string).
+
+        :param rules: transition rules, where the (i*n+j)-th item defines the transition for the i-th state on the j-th alphabet symbol
+        :param end_states: accepting states
+        :param alphabet_index: mapping from alphabet symbols to rule columns"""
         self.rules = rules or [1, 1]
         self.end_states = end_states
         self.alphabet_index = alphabet_index or {"": 0}
 
-    @classmethod
-    def create(cls, pattern):
+    @staticmethod
+    def create(pattern: str) -> "RegexpDFA":
+        """Create a `Regexp` and determinize it."""
         r = Regexp(pattern)
-        (rules, end_states, alphabet_index) = r.determinize()
+        return r.determinize()
 
-        return cls(rules, end_states, alphabet_index)
-
-    def match(self, s):
+    def match(self, s: str) -> bool:
+        """Decide if a string matches the regexp.
+
+        :param s: an input string"""
         st = 0
         n = len(self.alphabet_index)
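
To see the NFA and the subset construction in action, a small illustrative snippet (not part of the patch):

    r = Regexp("(a|b)*a")
    print(r.match("ba"))    # True -- ends with "a"
    print(r.match("ab"))    # False
    dfa = r.determinize()   # with this change, determinize() returns a RegexpDFA directly
    print(dfa.match("ba"))  # True -- same language, now with deterministic transitions

Thanks to the explicit fail state, every (state, symbol) pair has exactly one target in `dfa.rules`, i.e. the transition table is total, which the minimization code below relies on.
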
@@ -317,13 +367,15 @@
         return st in self.end_states
 
-    def reduce(self):
+    def reduce(self) -> "RegexpDFA":
+        """Minimize the automaton by collapsing equivalent states."""
         partition = self._find_equivalent_states()
         (rules, end_states) = self._collapse_states(partition)
 
         return RegexpDFA(rules, end_states, self.alphabet_index)
 
-    def normalize(self):
+    def normalize(self) -> "RegexpDFA":
+        """Normalize the automaton by relabeling the states in breadth-first search order and removing unreachable states."""
         n = len(self.alphabet_index)
         index = {0: 0}
         queue = deque([0])
@@ -343,16 +395,24 @@
 
-    def find_distinguishing_string(self, r):
-        if self.rules == r.rules and self.end_states == r.end_states:
+    def find_distinguishing_string(self, r: "RegexpDFA") -> Optional[str]:
+        """Find the shortest string that is accepted by either `self` or `r`, but not both.
+
+        :param r: the other automaton
+        :return: the distinguishing string, or None if the automata are equivalent"""
+        r1 = self.reduce().normalize()
+        r2 = r.reduce().normalize()
+
+        if r1.rules == r2.rules and r1.end_states == r2.end_states:
             return None
 
-        r1 = self._expand_alphabet(r.alphabet_index)
-        r2 = r._expand_alphabet(self.alphabet_index)
+        r1 = r1._expand_alphabet(r2.alphabet_index)
+        r2 = r2._expand_alphabet(r1.alphabet_index)
         product = r1._build_product_automaton(r2)
         n = len(product.alphabet_index)
 
-        reverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
+        # rule column: alphabet symbol
+        inverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
         queue = deque([(0, "")])
         visited = {0}
         while queue:
@@ -361,16 +421,20 @@
                 return acc
             for (i, target) in enumerate(product.rules[state*n:(state+1)*n]):
                 if target not in visited:
-                    queue.append((target, acc+reverse_alphabet_index[i]))
+                    queue.append((target, acc+inverse_alphabet_index[i]))
                     visited.add(target)
 
         assert False
 
-    def _find_equivalent_states(self):
+    def _find_equivalent_states(self) -> list[set[int]]:
+        """Partition states into their equivalence classes with Hopcroft's algorithm.
+
+        :return: sets of equivalent states. Unique states get single-item sets."""
         n = len(self.alphabet_index)
         m = len(self.rules) // n
         inverse_rules = [set() for i in range(m*n)]
 
+        # an inverse of the transition rules: target state + alphabet symbol -> source states
        for i in range(m):
            for j in range(n):
                target = self.rules[i*n + j]
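
Since `reduce()` collapses equivalent states and `normalize()` relabels them canonically, two patterns for the same language should end up with identical automata. An illustrative check (not part of the patch):

    d1 = RegexpDFA.create("a|a*").reduce().normalize()
    d2 = RegexpDFA.create("a*").reduce().normalize()
    print(d1.rules == d2.rules and d1.end_states == d2.end_states)  # True -- the same minimal DFA
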
@@ -409,16 +473,22 @@
         return [set_bag[k] for k in res]
 
-    def _collapse_states(self, partition):
+    def _collapse_states(self, partition: list[set[int]]) -> tuple[list[int], set[int]]:
+        """Collapse the equivalent states of each class into a single state.
+
+        :param partition: list of equivalence classes
+        :return: a transition rules list and a set of accepting states"""
         n = len(self.alphabet_index)
         rules = []
 
+        # state mapping induced by collapsing the equivalence classes
         eq_mapping = dict()
         for eq_set in partition:
             states = sorted(eq_set)
             for st in states:
                 eq_mapping[st] = states[0]
 
+        # state mapping that keeps the rules list compact after the collapse
         discard_mapping = dict()
         discard_count = 0
@@ -435,7 +505,11 @@
         return (rules, end_states)
 
-    def _expand_alphabet(self, alphabet_index):
+    def _expand_alphabet(self, alphabet_index: dict[str, int]) -> "RegexpDFA":
+        """Expand the automaton to operate over the union of `self.alphabet_index` and the provided `alphabet_index`.
+
+        :param alphabet_index: a mapping from letters to rule columns
+        :return: a RegexpDFA over the unified alphabet"""
         if alphabet_index == self.alphabet_index:
             return self
 
@@ -443,10 +517,13 @@
         m = len(self.rules) // n1
         combined_alphabet = set(self.alphabet_index.keys()) | set(alphabet_index.keys())
 
+        # a new alphabet_index
         combined_index = {c: i for (i, c) in enumerate(sorted(combined_alphabet))}
+        # a new letter column: the old letter column
         conversion_index = {combined_index[k]: v for (k, v) in self.alphabet_index.items()}
         n2 = len(combined_alphabet)
 
+        # rewrite the rules into a new table, filling the blanks with a new fail state
         rules = [
             self.rules[i*n1 + conversion_index[j]]
             if j in conversion_index else m
@@ -456,7 +533,11 @@
         return RegexpDFA(rules, self.end_states, combined_index).reduce().normalize()
 
-    def _build_product_automaton(self, r):
+    def _build_product_automaton(self, r: "RegexpDFA") -> "RegexpDFA":
+        """Create a new automaton over the Cartesian product of the arguments' states, with the corresponding transition rules.
+
+        :param r: the other finite automaton. It must have the same `alphabet_index` as `self`"""
+        assert self.alphabet_index == r.alphabet_index
         n = len(self.alphabet_index)
         m = len(r.rules) // n
         k = len(self.rules) // n
@@ -464,6 +545,7 @@
         rules = []
         end_states = set()
 
+        # expand each self state into m new states, one for each of the r states
         for s1 in range(k):
             row1 = self.rules[s1*n:(s1+1)*n]
             for s2 in range(m):
@@ -477,7 +559,7 @@
 def test():
     tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
 
-    for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
+    for pattern in ["", "a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
         print("#", pattern)
         try:
             r = RegexpDFA.create(pattern).reduce().normalize()
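
Finally, an illustrative use of the equivalence machinery (not part of the patch; the exact witness returned depends on the BFS order, so treat the values as examples):

    d1 = RegexpDFA.create("(a|b)*b")
    d2 = RegexpDFA.create("b")
    print(d1.find_distinguishing_string(d2))  # a shortest witness, e.g. "ab" -- matched by d1 only
    print(d1.find_distinguishing_string(RegexpDFA.create("(a|b)*b")))  # None -- the languages are equal
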