# HG changeset patch
# User Laman
# Date 2024-07-08 13:40:46
# Node ID 6fc184bbea04788eca9e701cb17f7da45c4f89eb
# Parent  d2dfbcc8bb96befbf8995cd94895ad44ff593531

added documentation

diff --git a/Cargo.lock b/Cargo.lock
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,4 +4,4 @@ version = 3
 
 [[package]]
 name = "regexp"
-version = "0.1.0"
+version = "1.0.0"
diff --git a/Cargo.toml b/Cargo.toml
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,7 @@
 [package]
 name = "regexp"
-version = "0.1.0"
+version = "1.0.0"
 edition = "2021"
+license = "GPL-3.0-or-later"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/README.md b/README.md
new file mode 100644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,47 @@
+# Regular expresso
+
+Written as an exercise in Rust and as a refresher on my university classes on finite automata.
+
+The supported syntax is restricted to the following operators:
+* `*` - the Kleene star. The previous item can be present zero or more times.
+* Concatenation - implied between adjacent items, with no explicit operator. `ab*` matches an `a` followed by any number of `b`s
+* `+` or `|` - a union of several alternatives. Both symbols are equivalent.
+* `_` - a lambda, matching the empty string
+* `()` - parentheses, grouping several tokens together. For example `a|b*` matches a single `a` or any number of `b`s. `(a|b)*` matches any string of `a`s and `b`s
+* Symbols not listed above match themselves literally. There is no escaping, so it is not possible to match the operator characters themselves.
+
+## Usage
+The Python code requires Python 3.9 or higher. The Rust code targets the 2021 edition.
+
+```
+# Match a string against a pattern (returns true or false)
+python regexp.py match PATTERN STRING
+# or
+cargo run match PATTERN STRING
+
+# Compare two regexps and return None if they are equivalent, or the shortest string that can distinguish them
+# (that is, a string accepted by one pattern but not the other)
+python regexp.py compare PATTERN1 PATTERN2
+# or
+cargo run compare PATTERN1 PATTERN2
+```
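+
+For illustration, the Python implementation can also be used as a library. A minimal sketch (assuming it is run from the repository root, so that `regexp.py` is importable):
+
+```
+from regexp import RegexpDFA
+
+r = RegexpDFA.create("a(b|c)")
+print(r.match("ab"))   # True
+print(r.match("abc"))  # False
+
+r1 = RegexpDFA.create("(a|b)*")
+r2 = RegexpDFA.create("(a*b*)*")
+# the two patterns are equivalent, so there is no distinguishing string
+print(r1.find_distinguishing_string(r2))  # None
+```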
+
+## Theory
+1. The pattern gets parsed into a tree of its parts.
+2. We check which symbols can stand at the beginning, which can follow each other and which can stand at the end of the string. These data are used to construct a non-deterministic finite automaton (NFA).
+3. *Determinization:* We explore the possible paths through the NFA, recording the state sets visited. Each set is then converted to a single state of a deterministic finite automaton (DFA). (See the sketch after this list.)
+4. *Reduction (minimization):* We iteratively check which states of the DFA are equivalent - they can't be distinguished from each other by any string. Each equivalence class can be collapsed to a single state, compressing and simplifying the DFA.
+5. *Normalization:* Finally we normalize the DFA by visiting its states in a breadth-first search (BFS) order, with edges ordered by the alphabet. Any unreachable states get removed here. (1)
+6. Two regular expressions are equivalent if and only if their normalized DFAs are equal.
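+
+For concreteness, the determinization step (3) is the textbook subset construction. Below is a minimal, dictionary-based sketch of the idea; `regexp.py` follows the same scheme, but flattens the transition rules into a single list and adds an explicit fail state:
+
+```
+from collections import deque
+
+def determinize(nfa_rules, start, end_states, alphabet):
+	"""Subset construction sketch. nfa_rules: {(state, symbol): set of target states}."""
+	index = {frozenset([start]): 0}  # explored NFA state set -> DFA state label
+	queue = deque([frozenset([start])])
+	dfa_rules = {}  # (DFA state, symbol) -> DFA state
+	dfa_end = set()
+
+	while queue:
+		current = queue.popleft()
+		if current & end_states:
+			dfa_end.add(index[current])
+		for c in alphabet:
+			# union of all NFA targets reachable from the current state set on c
+			target = frozenset(t for st in current for t in nfa_rules.get((st, c), ()))
+			if target not in index:
+				index[target] = len(index)
+				queue.append(target)
+			dfa_rules[(index[current], c)] = index[target]
+	# the empty set behaves as a fail state, looping back to itself on every symbol
+	return (dfa_rules, dfa_end)
+```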
+
+### Looking for a distinguishing string
+1. We construct DFAs for both patterns as described just above.
+2. We merge them into a product automaton, whose states are the Cartesian product of the original DFAs' states and whose transition function is constructed accordingly. The state `(qa, qb)` is accepting if exactly one of `qa`, `qb` is accepting in the original automatons. (2)
+3. We search through the product automaton's states in a BFS order, recording our path, until we find an accepting state. The path from the start to the accepting state defines a distinguishing word for the two input automata.
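+
+A minimal sketch of steps 2 and 3, searching the product on the fly instead of materializing it the way the implementation's `_build_product_automaton` does; `step` and `is_accepting` are hypothetical helpers standing in for a transition lookup and an accepting-state test, not the actual `regexp.py` API:
+
+```
+from collections import deque
+
+def find_distinguishing_string(dfa1, dfa2, alphabet):
+	start = (0, 0)
+	queue = deque([(start, "")])
+	visited = {start}
+	while queue:
+		((q1, q2), word) = queue.popleft()
+		# a product state is accepting iff exactly one component state accepts
+		if dfa1.is_accepting(q1) != dfa2.is_accepting(q2):
+			return word
+		for c in alphabet:
+			succ = (dfa1.step(q1, c), dfa2.step(q2, c))
+			if succ not in visited:
+				visited.add(succ)
+				queue.append((succ, word + c))
+	return None  # all reachable product states agree, so the languages are equal
+```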
+
+### Notes
+(1) This is formally described as a part of reduction, but the implementation leaves it as a side effect of normalization.
+
+Also note that the algorithm doesn't actually produce any unreachable states on its own, so this removal is only relevant in case (2).
+
+(2) Construction of the product automaton can create many states that cannot actually be reached, which makes their removal useful.
diff --git a/regexp.py b/regexp.py
--- a/regexp.py
+++ b/regexp.py
@@ -3,6 +3,7 @@ import itertools
 import argparse
 from abc import abstractmethod
 from collections import deque
+from typing import Iterator, Optional
 
 
 class ParsingError(Exception):
@@ -10,22 +11,28 @@ class ParsingError(Exception):
 
 
 class Token:
+	"""Abstract base class representing a single item of a regular expression."""
+	# is the Token mandatory, or can it be omitted
 	is_skippable = False
 
 	@abstractmethod
-	def list_first(self):
+	def list_first(self) -> Iterator[int]:
+		"""List all possible string positions the token can start with."""
 		pass
 
 	@abstractmethod
-	def list_last(self):
+	def list_last(self) -> Iterator[int]:
+		"""List all possible string positions the token can end with."""
 		pass
 
 	@abstractmethod
-	def list_neighbours(self):
+	def list_neighbours(self) -> Iterator[tuple[int, int]]:
+		"""List positions of all possibly neighbouring subtokens."""
 		pass
 
 
 class Lambda(Token):
+	"""An empty string, useful as an `Alternative`."""
 	is_skippable = True
 
 	def list_first(self):
@@ -39,7 +46,8 @@ class Lambda(Token):
 
 
 class Symbol(Token):
-	def __init__(self, position, value):
+	"""A regular letter or any other symbol without a special function."""
+	def __init__(self, position: int, value: str):
 		self.position = position
 		self.value = value
 
@@ -57,6 +65,7 @@ class Symbol(Token):
 
 
 class Asterisk(Token):
+	"""A unary operator specifying its content to occur zero or more times."""
 	is_skippable = True
 
 	def __init__(self, content: Token):
@@ -79,7 +88,9 @@ class Asterisk(Token):
 
 
 class Alternative(Token):
-	def __init__(self, content: list):
+	"""An operator with a variable number of arguments, specifying exchangeable alternatives."""
+	def __init__(self, content: list[Token]):
+		""":raises ParsingError: raised on an empty variant, that is, an AlternativeSeparator surrounded by no other Tokens."""
 		self.variants = []
 		subsequence = []
 
@@ -114,10 +125,12 @@ class Alternative(Token):
 		return any(x.is_skippable for x in self.variants)
 
 class AlternativeSeparator:
+	"""A special token to temporarily separate Alternative variants. Removed in the Alternative constructor."""
 	pass
 
 class Chain(Token):
-	def __init__(self, content: list):
+	"""An operator expressing a concatenation of its content Tokens."""
+	def __init__(self, content: list[Token]):
 		self.content = content
 
 	def list_first(self):
@@ -154,21 +167,34 @@ class Chain(Token):
 		return "(" + "".join(str(x) for x in self.content) + ")"
 
 
-def find_closing_parenthesis(pattern, k):
+def find_closing_parenthesis(pattern: str, pos: int) -> int:
+	"""Given an opening parenthesis, find the closing one.
+	
+	:param pattern: the regexp pattern
+	:param pos: the opening parenthesis position
+	:return: a position of the matching closing parenthesis
+	:raises AssertionError: on an incorrect call, if there is not an opening parenthesis at `pos`
+	:raises ParsingError: on a malformed pattern with no matching parenthesis"""
+	assert pattern[pos] == "("
 	counter = 0
 
-	for (i, c) in enumerate(pattern[k:]):
+	for (i, c) in enumerate(pattern[pos:]):
 		if c == "(":
 			counter += 1
 		elif c == ")":
 			counter -= 1
 		if counter == 0:
-			return k+i
+			return pos+i
 
-	raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}')
+	raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {pos}')
 
 
-def parse(pattern, offset=0):
+def parse(pattern: str, offset: int=0) -> Token:
+	"""Recursively parse the pattern into a Token tree.
+
+	:param pattern: the regexp string
+	:param offset: where the `pattern` starts in the original pattern, to record correct global token positions in the subcalls
+	:return: a token sequence, wrapped in a Chain or an Alternative"""
 	res = []
 	is_alternative = False
 
@@ -206,7 +232,8 @@ def parse(pattern, offset=0):
 		return Chain(res)
 
 
-def print_dfa(dfa, label=""):
+def print_dfa(dfa: "RegexpDFA", label: str=""):
+	"""Utility function for printing automatons in a readable format."""
 	n = len(dfa.alphabet_index)
 	print(label)
 	for i in range(0, len(dfa.rules), n):
@@ -215,11 +242,14 @@ def print_dfa(dfa, label=""):
 
 
 class Regexp:
-	def __init__(self, pattern):
+	def __init__(self, pattern: str):
+		"""Parse a pattern string into a sequence of Tokens and build a non-deterministic finite automaton from them."""
 		r = parse(pattern)
+		# (state, symbol): {state1, ...}
 		rules = dict()
 		alphabet = set()
 
+		# record possible starting symbols
 		for i in r.list_first():
 			c = pattern[i]
 			alphabet.add(c)
@@ -228,6 +258,7 @@ class Regexp:
 				rules[key] = set()
 			rules[key].add(i)
 
+		# record all pairs of symbols that can immediately follow each other
 		for (i, j) in r.list_neighbours():
 			c = pattern[j]
 			alphabet.add(c)
@@ -236,6 +267,7 @@ class Regexp:
 				rules[key] = set()
 			rules[key].add(j)
 
+		# record possible last symbols and mark them as accepting states
 		end_states = set(r.list_last())
 		if r.is_skippable:
 			end_states.add(-1)
@@ -244,7 +276,10 @@ class Regexp:
 		self.end_states = end_states
 		self.alphabet = alphabet
 
-	def match(self, s):
+	def match(self, s: str) -> bool:
+		"""Decide if a string matches the regexp.
+		
+		:param s: an input string"""
 		current = {-1}
 
 		for c in s:
@@ -257,7 +292,8 @@ class Regexp:
 
 		return any(st in self.end_states for st in current)
 
-	def determinize(self):
+	def determinize(self) -> "RegexpDFA":
+		"""Convert the non-deterministic finite automaton into a deterministic one."""
 		alphabet_index = {c: i for (i, c) in enumerate(sorted(self.alphabet))}
 		n = len(alphabet_index)
 		compact_rules = [-1] * n
@@ -265,15 +301,18 @@ class Regexp:
 
 		index = {(-1,): 0}
 		stack = [(-1,)]
+		# explore all possible state sets the NFA can visit
 		while stack:
 			multistate = stack.pop()
 			new_rules = dict()
 			
+			# collect possible target states
 			for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
 				if c not in new_rules:
 					new_rules[c] = set()
 				new_rules[c].update(target)
 			
+			# map the state sets to integer labels and record them in a single dimensional rules list
 			for (c, target_set) in new_rules.items():
 				target_tup = tuple(sorted(target_set))
 				if target_tup not in index:
@@ -285,27 +324,38 @@ class Regexp:
 				if any(st in self.end_states for st in target_set):
 					end_states.add(index[target_tup])
 
+		# create an additional fail state
+		# and redirect all undefined transition rules to it
 		fail = len(index)
 		compact_rules = [(st if st >= 0 else fail) for st in compact_rules]
 		compact_rules.extend([fail] * n)
 		
-		return (compact_rules, end_states, alphabet_index)
+		return RegexpDFA(compact_rules, end_states, alphabet_index)
 
 
 class RegexpDFA:
-	def __init__(self, rules, end_states, alphabet_index):
+	def __init__(self, rules: list[int], end_states: set[int], alphabet_index: dict[str, int]):
+		"""A deterministic finite automaton constructor.
+
+		If the `rules` or the `alphabet_index` are empty, then instead of an empty automaton we create a trivial automaton rejecting any non-empty input (it can still accept the empty string).
+
+		:param rules: transition rules, where (i*n+j)th item defines the transition for i-th state on j-th alphabet symbol
+		:param end_states: accepting states
+		:param alphabet_index: mapping from alphabet symbols to rule columns"""
 		self.rules = rules or [1, 1]
 		self.end_states = end_states
 		self.alphabet_index = alphabet_index or {"": 0}
 
-	@classmethod
-	def create(cls, pattern):
+	@staticmethod
+	def create(pattern: str) -> "RegexpDFA":
+		"""Create a `Regexp` and determinize it."""
 		r = Regexp(pattern)
-		(rules, end_states, alphabet_index) = r.determinize()
+		return r.determinize()
 
-		return cls(rules, end_states, alphabet_index)
-
-	def match(self, s):
+	def match(self, s: str) -> bool:
+		"""Decide if a string matches the regexp.
+		
+		:param s: an input string"""
 		st = 0
 		n = len(self.alphabet_index)
 
@@ -317,13 +367,15 @@ class RegexpDFA:
 
 		return st in self.end_states
 
-	def reduce(self):
+	def reduce(self) -> "RegexpDFA":
+		"""Minimize the automaton by collapsing equivalent states."""
 		partition = self._find_equivalent_states()
 		(rules, end_states) = self._collapse_states(partition)
 
 		return RegexpDFA(rules, end_states, self.alphabet_index)
 
-	def normalize(self):
+	def normalize(self) -> "RegexpDFA":
+		"""Normalize the automaton by relabeling the states in breadth-first search (BFS) order and removing any unreachable states."""
 		n = len(self.alphabet_index)
 		index = {0: 0}
 		queue = deque([0])
@@ -343,16 +395,24 @@ class RegexpDFA:
 
 		return RegexpDFA(rules, end_states, self.alphabet_index)
 
-	def find_distinguishing_string(self, r):
-		if self.rules == r.rules and self.end_states == r.end_states:
+	def find_distinguishing_string(self, r: "RegexpDFA") -> Optional[str]:
+		"""Find the shortest string that is accepted by self or `r`, but not both.
+		
+		:param r: the other automaton
+		:return: the distinguishing string, or None if the automatons are equivalent"""
+		r1 = self.reduce().normalize()
+		r2 = r.reduce().normalize()
+
+		if r1.rules == r2.rules and r1.end_states == r2.end_states:
 			return None
 
-		r1 = self._expand_alphabet(r.alphabet_index)
-		r2 = r._expand_alphabet(self.alphabet_index)
+		r1 = r1._expand_alphabet(r2.alphabet_index)
+		r2 = r2._expand_alphabet(r1.alphabet_index)
 		product = r1._build_product_automaton(r2)
 
 		n = len(product.alphabet_index)
-		reverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
+		# rule column index: symbol
+		inverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()}
 		queue = deque([(0, "")])
 		visited = {0}
 		while queue:
@@ -361,16 +421,20 @@ class RegexpDFA:
 				return acc
 			for (i, target) in enumerate(product.rules[state*n:(state+1)*n]):
 				if target not in visited:
-					queue.append((target, acc+reverse_alphabet_index[i]))
+					queue.append((target, acc+inverse_alphabet_index[i]))
 					visited.add(target)
 		
 		assert False
 
-	def _find_equivalent_states(self):
+	def _find_equivalent_states(self) -> list[set[int]]:
+		"""Partition states into their equivalence classes with Hopcroft's algorithm.
+		
+		:return: sets of equivalent states. Unique states get single item sets."""
 		n = len(self.alphabet_index)
 		m = len(self.rules) // n
 		inverse_rules = [set() for i in range(m*n)]
 
+		# a transition rules inverse. target state + alphabet symbol -> source states
 		for i in range(m):
 			for j in range(n):
 				target = self.rules[i*n + j]
@@ -409,16 +473,22 @@ class RegexpDFA:
 		
 		return [set_bag[k] for k in res]
 	
-	def _collapse_states(self, partition):
+	def _collapse_states(self, partition: list[set[int]]) -> tuple[list[int], set[int]]:
+		"""Collapse equivalent states from each class into a single one.
+		
+		:param partition: list of equivalence classes
+		:return: rules list, accepting states"""
 		n = len(self.alphabet_index)
 		rules = []
 
+		# states mapping due to the equivalents collapse
 		eq_mapping = dict()
 		for eq_set in partition:
 			states = sorted(eq_set)
 			for st in states:
 				eq_mapping[st] = states[0]
 
+		# states mapping to keep the rules list compact after the equivalents collapse
 		discard_mapping = dict()
 		discard_count = 0
 
@@ -435,7 +505,11 @@ class RegexpDFA:
 		
 		return (rules, end_states)
 
-	def _expand_alphabet(self, alphabet_index):
+	def _expand_alphabet(self, alphabet_index: dict[str, int]) -> "RegexpDFA":
+		"""Expand the automaton to accommodate the union of `self.alphabet_index` and the provided `alphabet_index`.
+		
+		:param alphabet_index: a mapping from letters to rules columns
+		:return: a RegexpDFA over the unified alphabet"""
 		if alphabet_index == self.alphabet_index:
 			return self
 
@@ -443,10 +517,13 @@ class RegexpDFA:
 		m = len(self.rules) // n1
 
 		combined_alphabet = set(self.alphabet_index.keys()) | set(alphabet_index.keys())
+		# a new alphabet_index
 		combined_index = {c: i for (i, c) in enumerate(sorted(combined_alphabet))}
+		# new column index of a letter: its old column index
 		conversion_index = {combined_index[k]: v for (k, v) in self.alphabet_index.items()}
 		n2 = len(combined_alphabet)
 
+		# rewrite the rules into a new table, filling blanks with a new fail state
 		rules = [
 			self.rules[i*n1 + conversion_index[j]]
 			if j in conversion_index else m
@@ -456,7 +533,11 @@ class RegexpDFA:
 
 		return RegexpDFA(rules, self.end_states, combined_index).reduce().normalize()
 
-	def _build_product_automaton(self, r):
+	def _build_product_automaton(self, r: "RegexpDFA") -> "RegexpDFA":
+		"""Create a new automaton, with a Cartesian product of the arguments' states and corresponding transition rules.
+		
+		:param r: the other finite automaton. It must have the same `alphabet_index` as `self`"""
+		assert self.alphabet_index == r.alphabet_index
 		n = len(self.alphabet_index)
 		m = len(r.rules) // n
 		k = len(self.rules) // n
@@ -464,6 +545,7 @@ class RegexpDFA:
 		rules = []
 		end_states = set()
 
+		# expand each self state into m new states, one for each of the r states
 		for s1 in range(k):
 			row1 = self.rules[s1*n:(s1+1)*n]
 			for s2 in range(m):
@@ -477,7 +559,7 @@ class RegexpDFA:
 
 def test():
 	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
-	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
+	for pattern in ["", "a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
 		print("#", pattern)
 		try:
 			r = RegexpDFA.create(pattern).reduce().normalize()