diff --git a/Cargo.lock b/Cargo.lock
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,4 +4,4 @@ version = 3
 
 [[package]]
 name = "regexp"
-version = "0.1.0"
+version = "1.0.0"
diff --git a/Cargo.toml b/Cargo.toml
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,7 @@
 [package]
 name = "regexp"
-version = "0.1.0"
+version = "1.0.0"
 edition = "2021"
+license = "GPL-3.0-or-later"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/README.md b/README.md
new file mode 100644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,59 @@
+# Regular expresso
+
+Written as an exercise in Rust and a refreshment of my university classes on finite automata.
+
+The supported syntax is restricted to the following operators:
+* `*` - the Kleene star. The previous item can be present zero or more times.
+* Concatenation - implied between items following each other. `ab*` matches an `a` followed by any number of `b`s.
+* `+` or `|` - a union of several alternatives. Both symbols are equivalent.
+* `_` - a lambda, an empty string.
+* `()` - parentheses, grouping several tokens together. For example `a|b*` matches a single `a` or any number of `b`s, while `(a|b)*` matches any string of `a`s and `b`s.
+* Symbols not listed above are used as they are. There is no escaping, so matching the operator literals is not possible.
+
+## Usage
+The Python code supports Python 3.9 and higher. The Rust code is written for Rust edition 2021.
+
+```
+# Match a string against a pattern (returns true or false)
+python regexp.py match PATTERN STRING
+# or
+cargo run match PATTERN STRING
+
+# Compare two regexps and return None if they are equivalent, or the shortest string that can distinguish them
+# That is, a string that gets accepted by one, but not the other
+python regexp.py compare PATTERN1 PATTERN2
+# or
+cargo run compare PATTERN1 PATTERN2
+```
+
+## Theory
+1. The pattern gets parsed into a tree of its parts.
+2. We check which symbols can stand at the beginning, which can follow each other and which can stand at the end of the string. These data are used to construct a non-deterministic finite automaton (NFA).
+3. *Determinization:* We explore the possible paths through the NFA, recording the state sets visited. Each set is then converted to a single state of a deterministic finite automaton (DFA).
+4. *Reduction (minimization):* We iteratively check which states of the DFA are equivalent - they can't be distinguished from each other by any string. Each equivalence class can be collapsed to a single state, compressing and simplifying the DFA.
+5. *Normalization:* Finally we normalize the DFA by visiting its states in breadth-first search (BFS) order, with edges ordered by the alphabet. Any unreachable states get removed here. (1)
+6. Two regular expressions are equivalent if and only if their normalized DFAs are equal.
+
+### Looking for a distinguishing string
+1. We construct DFAs for both patterns as described just above.
+2. We merge them into a product automaton, with states being a Cartesian product of the original DFAs' states, and the transition function constructed accordingly. The state `(qa, qb)` is accepting if exactly one of `qa`, `qb` was accepting in the original automata. (2)
+3. We search through the product automaton's states in BFS order, recording our path, until we find an accepting state. The path from the start to the accepting state defines a distinguishing word for the two input automata (see the sketch below).
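+
+A minimal sketch of the same comparison driven from the Python API (the class and method names below are those defined in `regexp.py`; the patterns are arbitrary examples):
+
+```
+from regexp import RegexpDFA
+
+r1 = RegexpDFA.create("(a|b)*")  # accepts every string over {a, b}
+r2 = RegexpDFA.create("a*|b*")   # accepts only runs of a single letter
+
+# prints a shortest distinguishing string, e.g. "ab"; None would mean the patterns are equivalent
+print(r1.find_distinguishing_string(r2))
+```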
+
+### Notes
+(1) This is formally described as a part of reduction, but the implementation leaves it as a side effect of normalization.
+
+Also note that the algorithm doesn't actually produce any unreachable states, so this is only relevant in (2).
+
+(2) Construction of the product automaton can create many states that aren't actually reachable, so removing them proves useful.
diff --git a/regexp.py b/regexp.py
--- a/regexp.py
+++ b/regexp.py
@@ -3,6 +3,7 @@ import itertools
 import argparse
 from abc import abstractmethod
 from collections import deque
+from typing import Iterator, Optional
 
 
 class ParsingError(Exception):
@@ -10,22 +11,28 @@ class ParsingError(Exception):
     pass
 
 
 class Token:
+    """Abstract base class representing a single item of a regular expression."""
+    # is the Token mandatory, or can it be omitted
     is_skippable = False
 
     @abstractmethod
-    def list_first(self):
+    def list_first(self) -> Iterator[int]:
+        """List all possible string positions the token can start with."""
         pass
 
     @abstractmethod
-    def list_last(self):
+    def list_last(self) -> Iterator[int]:
+        """List all possible string positions the token can end with."""
         pass
 
     @abstractmethod
-    def list_neighbours(self):
+    def list_neighbours(self) -> Iterator[tuple[int, int]]:
+        """List positions of all possibly neighbouring subtokens."""
         pass
 
 
 class Lambda(Token):
+    """An empty string, useful as an `Alternative` variant."""
     is_skippable = True
 
     def list_first(self):
@@ -39,7 +46,8 @@
 
 
 class Symbol(Token):
-    def __init__(self, position, value):
+    """A regular letter or any other symbol without a special function."""
+    def __init__(self, position: int, value: str):
         self.position = position
         self.value = value
@@ -57,6 +65,7 @@
 
 
 class Asterisk(Token):
+    """A unary operator specifying its content to occur zero or more times."""
     is_skippable = True
 
     def __init__(self, content: Token):
@@ -79,7 +88,9 @@
 
 
 class Alternative(Token):
-    def __init__(self, content: list):
+    """An operator with a variable number of arguments, specifying exchangeable alternatives."""
+    def __init__(self, content: list[Token]):
+        """:raises ParsingError: raised on an empty variant, that is, an AlternativeSeparator surrounded by no other Tokens."""
         self.variants = []
         subsequence = []
@@ -114,10 +125,12 @@
         return any(x.is_skippable for x in self.variants)
 
 
 class AlternativeSeparator:
+    """A special token to temporarily separate Alternative variants. Removed in the Alternative constructor."""
     pass
 
 
 class Chain(Token):
-    def __init__(self, content: list):
+    """An operator expressing a concatenation of its content Tokens."""
+    def __init__(self, content: list[Token]):
         self.content = content
 
     def list_first(self):
@@ -154,21 +167,36 @@
         return "(" + "".join(str(x) for x in self.content) + ")"
 
 
-def find_closing_parenthesis(pattern, k):
+def find_closing_parenthesis(pattern: str, pos: int) -> int:
+    """Given an opening parenthesis, find the closing one.
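+
+    For example, in the pattern "a(bc)d" the parenthesis opened at position 1 is closed at position 4.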
+
+    :param pattern: the regexp pattern
+    :param pos: the opening parenthesis position
+    :return: a position of the matching closing parenthesis
+    :raises AssertionError: on an incorrect call, if there is not an opening parenthesis at `pos`
+    :raises ParsingError: on a malformed pattern with no matching parenthesis"""
+    assert pattern[pos] == "("
     counter = 0
 
-    for (i, c) in enumerate(pattern[k:]):
+    for (i, c) in enumerate(pattern[pos:]):
         if c == "(":
             counter += 1
         elif c == ")":
             counter -= 1
 
         if counter == 0:
-            return k+i
+            return pos+i
 
-    raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}')
+    raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {pos}')
 
 
-def parse(pattern, offset=0):
+def parse(pattern: str, offset: int=0) -> Token:
+    """Recursively parse the pattern into a Token tree.
+
+    :param pattern: the regexp string
+    :param offset: where the `pattern` starts in the original pattern, to record correct global token positions in the subcalls
+    :return: a token sequence, wrapped in a Chain or an Alternative"""
     res = []
     is_alternative = False
@@ -206,7 +234,8 @@
 
     return Chain(res)
 
 
-def print_dfa(dfa, label=""):
+def print_dfa(dfa: "RegexpDFA", label: str=""):
+    """Utility function for printing automata in a readable format."""
     n = len(dfa.alphabet_index)
     print(label)
     for i in range(0, len(dfa.rules), n):
@@ -215,11 +244,14 @@
 
 
 class Regexp:
-    def __init__(self, pattern):
+    def __init__(self, pattern: str):
+        """Parse a pattern string into a sequence of Tokens and build a non-deterministic finite automaton from them."""
         r = parse(pattern)
+        # (state, symbol): {state1, ...}
         rules = dict()
         alphabet = set()
 
+        # record possible starting symbols
         for i in r.list_first():
             c = pattern[i]
             alphabet.add(c)
@@ -228,6 +260,7 @@
                 rules[key] = set()
             rules[key].add(i)
 
+        # record all pairs of symbols that can immediately follow each other
         for (i, j) in r.list_neighbours():
             c = pattern[j]
             alphabet.add(c)
@@ -236,6 +269,7 @@
                 rules[key] = set()
             rules[key].add(j)
 
+        # record possible last symbols and mark them as accepting states
         end_states = set(r.list_last())
         if r.is_skippable:
             end_states.add(-1)
@@ -244,7 +278,12 @@
         self.end_states = end_states
         self.alphabet = alphabet
 
-    def match(self, s):
+    def match(self, s: str) -> bool:
+        """Decide if a string matches the regexp.
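+
+        For example, a `Regexp` built from the pattern "a*b" matches "aaab" but not "ba".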
+
+        :param s: an input string"""
         current = {-1}
 
         for c in s:
@@ -257,7 +296,8 @@
 
         return any(st in self.end_states for st in current)
 
-    def determinize(self):
+    def determinize(self) -> "RegexpDFA":
+        """Convert the non-deterministic finite automaton into a deterministic one."""
         alphabet_index = {c: i for (i, c) in enumerate(sorted(self.alphabet))}
         n = len(alphabet_index)
         compact_rules = [-1] * n
@@ -265,15 +305,18 @@
         index = {(-1,): 0}
         stack = [(-1,)]
 
+        # explore all possible state sets the NFA can visit
         while stack:
             multistate = stack.pop()
             new_rules = dict()
 
+            # collect possible target states
             for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
                 if c not in new_rules:
                     new_rules[c] = set()
                 new_rules[c].update(target)
 
+            # map the state sets to integer labels and record them in a single dimensional rules list
             for (c, target_set) in new_rules.items():
                 target_tup = tuple(sorted(target_set))
                 if target_tup not in index:
@@ -285,27 +328,40 @@
                 if any(st in self.end_states for st in target_set):
                     end_states.add(index[target_tup])
 
+        # create an additional fail state
+        # and redirect all undefined transition rules to it
         fail = len(index)
        compact_rules = [(st if st >= 0 else fail) for st in compact_rules]
         compact_rules.extend([fail] * n)
 
-        return (compact_rules, end_states, alphabet_index)
+        return RegexpDFA(compact_rules, end_states, alphabet_index)
 
 
 class RegexpDFA:
-    def __init__(self, rules, end_states, alphabet_index):
+    def __init__(self, rules: list[int], end_states: set[int], alphabet_index: dict[str, int]):
+        """A deterministic finite automaton constructor.
+
+        If the `rules` or the `alphabet_index` are empty, then instead of an empty automaton we create a trivial automaton that rejects any non-empty input (only the empty string may be accepted).
+
+        :param rules: transition rules, where the (i*n+j)th item defines the transition for the i-th state on the j-th alphabet symbol
+        :param end_states: accepting states
+        :param alphabet_index: mapping from alphabet symbols to rule columns"""
         self.rules = rules or [1, 1]
         self.end_states = end_states
         self.alphabet_index = alphabet_index or {"": 0}
 
-    @classmethod
-    def create(cls, pattern):
+    @staticmethod
+    def create(pattern: str) -> "RegexpDFA":
+        """Create a `Regexp` and determinize it."""
         r = Regexp(pattern)
-        (rules, end_states, alphabet_index) = r.determinize()
+        return r.determinize()
 
-        return cls(rules, end_states, alphabet_index)
-
-    def match(self, s):
+    def match(self, s: str) -> bool:
+        """Decide if a string matches the regexp.
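+
+        For example, a DFA created from the pattern "(a|b)*c" matches "abc" but not "cc".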
+
+        :param s: an input string"""
         st = 0
         n = len(self.alphabet_index)
 
@@ -317,13 +373,15 @@
 
         return st in self.end_states
 
-    def reduce(self):
+    def reduce(self) -> "RegexpDFA":
+        """Minimize the automaton by collapsing equivalent states."""
         partition = self._find_equivalent_states()
         (rules, end_states) = self._collapse_states(partition)
 
         return RegexpDFA(rules, end_states, self.alphabet_index)
 
-    def normalize(self):
+    def normalize(self) -> "RegexpDFA":
+        """Normalize the automaton by relabelling its states in breadth-first search order and removing unreachable states."""
         n = len(self.alphabet_index)
         index = {0: 0}
         queue = deque([0])
@@ -343,16 +401,24 @@
 
         return RegexpDFA(rules, end_states, self.alphabet_index)
 
-    def find_distinguishing_string(self, r):
-        if self.rules == r.rules and self.end_states == r.end_states:
+    def find_distinguishing_string(self, r: "RegexpDFA") -> Optional[str]:
+        """Find the shortest string that is accepted by self or `r`, but not both.
+
+        :param r: the other automaton
+        :return: the distinguishing string, or None if the automata are equivalent"""
+        r1 = self.reduce().normalize()
+        r2 = r.reduce().normalize()
+
+        if r1.rules == r2.rules and r1.end_states == r2.end_states:
             return None
 
-        r1 = self._expand_alphabet(r.alphabet_index)
-        r2 = r._expand_alphabet(self.alphabet_index)
+        r1 = r1._expand_alphabet(r2.alphabet_index)
+        r2 = r2._expand_alphabet(r1.alphabet_index)
         product = r1._build_product_automaton(r2)
 
         n = len(product.alphabet_index)
-        reverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
+        # rule column index: symbol
+        inverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
         queue = deque([(0, "")])
         visited = {0}
         while queue:
@@ -361,16 +427,20 @@
                 return acc
             for (i, target) in enumerate(product.rules[state*n:(state+1)*n]):
                 if target not in visited:
-                    queue.append((target, acc+reverse_alphabet_index[i]))
+                    queue.append((target, acc+inverse_alphabet_index[i]))
                     visited.add(target)
 
         assert False
 
-    def _find_equivalent_states(self):
+    def _find_equivalent_states(self) -> list[set[int]]:
+        """Partition states into their equivalence classes with Hopcroft's algorithm.
+
+        :return: sets of equivalent states. Unique states get single item sets."""
         n = len(self.alphabet_index)
         m = len(self.rules) // n
         inverse_rules = [set() for i in range(m*n)]
 
+        # an inverse of the transition rules: target state + alphabet symbol -> source states
         for i in range(m):
             for j in range(n):
                 target = self.rules[i*n + j]
@@ -409,16 +479,22 @@
 
         return [set_bag[k] for k in res]
 
-    def _collapse_states(self, partition):
+    def _collapse_states(self, partition: list[set[int]]) -> tuple[list[int], set[int]]:
+        """Collapse equivalent states from each class into a single one.
+
+        :param partition: list of equivalence classes
+        :return: rules list, accepting states"""
         n = len(self.alphabet_index)
         rules = []
 
+        # state mapping caused by collapsing equivalent states
         eq_mapping = dict()
         for eq_set in partition:
             states = sorted(eq_set)
             for st in states:
                 eq_mapping[st] = states[0]
 
+        # state mapping that keeps the rules list compact after the collapse
         discard_mapping = dict()
         discard_count = 0
 
@@ -435,7 +511,13 @@
 
         return (rules, end_states)
 
-    def _expand_alphabet(self, alphabet_index):
+    def _expand_alphabet(self, alphabet_index: dict[str, int]) -> "RegexpDFA":
+        """Expand the automaton to accommodate the union of `self.alphabet_index` and the provided `alphabet_index`.
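+
+        For example, an automaton indexed over {"a": 0}, expanded with {"b": 0}, ends up indexed over {"a": 0, "b": 1}.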
+
+        :param alphabet_index: a mapping from letters to rules columns
+        :return: a RegexpDFA over the unified alphabet"""
         if alphabet_index == self.alphabet_index:
             return self
 
@@ -443,10 +525,13 @@
         m = len(self.rules) // n1
 
         combined_alphabet = set(self.alphabet_index.keys()) | set(alphabet_index.keys())
+        # a new alphabet_index
         combined_index = {c: i for (i, c) in enumerate(sorted(combined_alphabet))}
+        # new column index of a letter: its old column index
         conversion_index = {combined_index[k]: v for (k, v) in self.alphabet_index.items()}
         n2 = len(combined_alphabet)
 
+        # rewrite the rules into a new table, filling blanks with a new fail state
         rules = [
             self.rules[i*n1 + conversion_index[j]]
             if j in conversion_index else m
@@ -456,7 +541,11 @@
 
         return RegexpDFA(rules, self.end_states, combined_index).reduce().normalize()
 
-    def _build_product_automaton(self, r):
+    def _build_product_automaton(self, r: "RegexpDFA") -> "RegexpDFA":
+        """Create a new automaton, with a Cartesian product of the arguments' states and corresponding transition rules.
+
+        :param r: the other finite automaton. It must have the same `alphabet_index` as `self`"""
+        assert self.alphabet_index == r.alphabet_index
         n = len(self.alphabet_index)
         m = len(r.rules) // n
         k = len(self.rules) // n
@@ -464,6 +553,7 @@
         rules = []
         end_states = set()
 
+        # expand each self state into m new states, one for each of the r states
         for s1 in range(k):
             row1 = self.rules[s1*n:(s1+1)*n]
             for s2 in range(m):
@@ -477,7 +567,7 @@
 
 def test():
     tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
-    for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
+    for pattern in ["", "a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
         print("#", pattern)
         try:
             r = RegexpDFA.create(pattern).reduce().normalize()