# HG changeset patch # User Laman # Date 2024-07-09 13:05:02 # Node ID 08f1519c859e00c6b552977e2f05a92ae66c7c15 # Parent bac2e7f972b5c82d86872a48a0c4704a1326b6f9 added Rust documentation diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -11,7 +11,7 @@ class ParsingError(Exception): class Token: - """Abstract base class representing a single item of a regular expressions.""" + """Abstract base class representing a single item of a regular expression.""" # is the Token mandatory, or can it be omitted is_skippable = False diff --git a/src/lib.rs b/src/lib.rs --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,2 @@ mod regexp; -pub use regexp::{Regexp, ParsingError}; +pub use regexp::{RegexpNFA, ParsingError}; diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,11 @@ use std::env; -use regexp::Regexp; +use regexp::RegexpNFA; fn test() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"] { println!("# {pattern}"); - let r = match Regexp::new(&pattern.to_string()) { + let r = match RegexpNFA::new(&pattern.to_string()) { Ok(r1) => r1.determinize().reduce().normalize(), Err(e) => { println!("{e}"); @@ -24,7 +24,7 @@ fn main() { match args[1].as_str() { "test" => test(), "match" => { - let r = match Regexp::new(&args[2].to_string()) { + let r = match RegexpNFA::new(&args[2].to_string()) { Ok(r1) => r1.determinize().reduce().normalize(), Err(e) => { panic!("ERROR: {e}"); @@ -33,13 +33,13 @@ fn main() { println!("{}", r.eval(args[3].to_string())); }, "compare" => { - let r1 = match Regexp::new(&args[2].to_string()) { + let r1 = match RegexpNFA::new(&args[2].to_string()) { Ok(r) => r.determinize().reduce().normalize(), Err(e) => { panic!("ERROR: {e}"); } }; - let r2 = match Regexp::new(&args[3].to_string()) { + let r2 = match RegexpNFA::new(&args[3].to_string()) { Ok(r) => 
r.determinize().reduce().normalize(), Err(e) => { panic!("ERROR: {e}"); @@ -48,7 +48,7 @@ fn main() { println!("{}", r1.find_distinguishing_string(&r2).unwrap_or("None".to_string())); }, s => { - println!("An unknown command: \"{s}\". Use \"match\" or \"test\".") + println!("An unknown command: \"{s}\". Use \"match\", \"compare\" or \"test\".") } } } diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -7,15 +7,16 @@ use token::parse; const START_NFA: usize = usize::MAX; const START_DFA: usize = 0; +/// A regular expression implemented as a non-deterministic finite automaton. #[derive(Debug)] -pub struct Regexp { +pub struct RegexpNFA { rules: HashMap<(usize, char), HashSet>, end_states: HashSet, alphabet: Vec } -impl Regexp { - pub fn new(pattern: &String) -> Result { +impl RegexpNFA { + pub fn new(pattern: &String) -> Result { let r = parse(pattern, 0)?; let pattern_chars = Vec::from_iter(pattern.chars()); let mut rules: HashMap<(usize, char), HashSet> = HashMap::new(); @@ -49,9 +50,10 @@ impl Regexp { let mut alphabet_vec = Vec::from_iter(alphabet.into_iter()); alphabet_vec.sort(); - return Ok(Regexp{rules, end_states, alphabet: alphabet_vec}); + return Ok(RegexpNFA{rules, end_states, alphabet: alphabet_vec}); } + /// Decide if a string matches the regexp. pub fn eval(&self, s: String) -> bool { let mut multistate = HashSet::from([START_NFA]); @@ -71,6 +73,7 @@ impl Regexp { return multistate.iter().any(|x| self.end_states.contains(x)); } + /// Convert the non-deterministic finite automaton into a deterministic one. 
pub fn determinize(&self) -> RegexpDFA { const FAIL: usize = usize::MAX; let alphabet_index: HashMap = self.alphabet.iter().enumerate().map(|(i, c)| (*c, i)).collect(); @@ -79,6 +82,7 @@ impl Regexp { let mut end_states: HashSet = HashSet::new(); if self.end_states.contains(&START_NFA) {end_states.insert(START_DFA);} + // mapping the NFA state subsets -> DFA states let mut index = HashMap::from([(vec![START_NFA], START_DFA)]); let mut stack = vec![vec![START_NFA]]; @@ -86,6 +90,7 @@ impl Regexp { let multistate = stack.pop().unwrap(); let mut new_rules: HashMap> = HashMap::new(); + // collect all possible destination states from the multistate for key in self.rules.keys().filter(|(st, _c)| multistate.binary_search(st).is_ok()) { let (_st, c) = key; if !new_rules.contains_key(c) { @@ -96,6 +101,7 @@ impl Regexp { } } + // build a row for the DFA transition function table for (c, target_set) in new_rules.into_iter() { let mut target_vec = Vec::from_iter(target_set.into_iter()); target_vec.sort(); @@ -113,6 +119,7 @@ impl Regexp { } } + // add a fail state, so the transition function is complete let fail = index.len(); compact_rules = compact_rules.into_iter().map(|st| if st != FAIL {st} else {fail}).collect(); compact_rules.extend(iter::repeat(fail).take(n)); @@ -121,6 +128,8 @@ impl Regexp { } } +/// A regular expression implemented as a deterministic finite automaton. +/// This simplifies support for more features compared to the NFA Regexp. #[derive(Clone)] pub struct RegexpDFA { rules: Vec, @@ -129,10 +138,12 @@ pub struct RegexpDFA { } impl RegexpDFA { + /// Construct a DFA with the provided parameters, or a minimal DFA if the parameters are empty. pub fn new(rules: Vec, end_states: HashSet, alphabet_index: HashMap) -> RegexpDFA { if rules.len() > 0 { return RegexpDFA{rules, end_states, alphabet_index}; } else { + // this saves us checking for an empty `alphabet_index` in other methods. 
return RegexpDFA{ rules: vec![1, 1], end_states, @@ -141,6 +152,7 @@ impl RegexpDFA { } } + /// Decide if a string matches the regexp. pub fn eval(&self, s: String) -> bool { let n = self.alphabet_index.len(); let mut state = START_DFA; @@ -156,11 +168,13 @@ impl RegexpDFA { return self.end_states.contains(&state); } + /// Minimize the automaton by collapsing equivalent states. pub fn reduce(&self) -> RegexpDFA { let partition = self.find_equivalent_states(); return self.collapse_states(partition); } + /// Normalize the automaton by relabeling the states in a breadth first search walkthrough order and removing unreachable states. pub fn normalize(&self) -> RegexpDFA { let n = self.alphabet_index.len(); let m = self.rules.len()/n; @@ -190,6 +204,8 @@ impl RegexpDFA { return RegexpDFA{rules, end_states, alphabet_index: self.alphabet_index.clone()}; } + /// Find the shortest string that is accepted by `self` or `other`, but not both. + /// It is expected that the automata are already reduced and normalized. 
pub fn find_distinguishing_string(&self, other: &RegexpDFA) -> Option { if self.rules == other.rules && self.end_states == other.end_states { return None; @@ -199,8 +215,10 @@ impl RegexpDFA { let r2 = other.expand_alphabet(&self.alphabet_index); let product = r1.build_product_automaton(&r2); let n = product.alphabet_index.len(); - let reverse_alphabet_index: HashMap = HashMap::from_iter(product.alphabet_index.iter().map(|(&k, &v)| (v, k))); + // mapping letter keys -> letters + let inverse_alphabet_index: HashMap = HashMap::from_iter(product.alphabet_index.iter().map(|(&k, &v)| (v, k))); + // look for an accepting state with a BFS let mut queue = VecDeque::from([(0, "".to_string())]); let mut visited = HashSet::new(); while !queue.is_empty() { @@ -210,15 +228,16 @@ impl RegexpDFA { } for (i, target) in product.rules[state*n..(state+1)*n].iter().enumerate() { if !visited.contains(target) { - queue.push_back((*target, acc.clone()+&String::from(reverse_alphabet_index[&i]))); + queue.push_back((*target, acc.clone()+&String::from(inverse_alphabet_index[&i]))); visited.insert(target); } } } - panic!(); + panic!("Failed to detect the Regexps as equivalent and failed to find a distinguishing string."); } + /// Partition states into their equivalence classes with Hopcroft's algorithm. fn find_equivalent_states(&self) -> Vec> { let n = self.alphabet_index.len(); let m = self.rules.len() / n; @@ -231,6 +250,7 @@ impl RegexpDFA { } } + // store state subsets here so we can just pass around their indices let mut set_bag = vec![ self.end_states.clone(), HashSet::from_iter(0..m).difference(&self.end_states).copied().collect() @@ -274,11 +294,13 @@ impl RegexpDFA { return Vec::from_iter(res.into_iter().map(|k| std::mem::take(&mut set_bag[k]))); } + /// Collapse equivalent states from each partition class into a single state. 
fn collapse_states(&self, partition: Vec>) -> RegexpDFA { let n = self.alphabet_index.len(); let m = self.rules.len()/n; let mut rules = vec![]; + // states mapping due to the equivalents collapse let mut eq_mapping: Vec = ((0..m)).collect(); for eq_set in partition.into_iter() { let mut eq_vec = Vec::from_iter(eq_set.into_iter()); @@ -289,6 +311,7 @@ impl RegexpDFA { } } + // states mapping to keep the rules list compact after the equivalents collapse let mut discard_mapping: Vec = ((0..m)).collect(); let mut discard_count = 0; @@ -307,6 +330,7 @@ impl RegexpDFA { return RegexpDFA{rules, end_states, alphabet_index: self.alphabet_index.clone()}; } + /// Expand the automaton to accommodate the union of `self.alphabet_index` and the provided `alphabet_index`. fn expand_alphabet(&self, alphabet_index: &HashMap) -> RegexpDFA { if *alphabet_index == self.alphabet_index { return self.clone(); @@ -318,10 +342,13 @@ impl RegexpDFA { let combined_alphabet: HashSet = HashSet::from_iter(self.alphabet_index.keys().chain(alphabet_index.keys()).copied()); let mut combined_vec = Vec::from_iter(combined_alphabet.into_iter()); combined_vec.sort(); + // a new alphabet_index let combined_index = HashMap::from_iter(combined_vec.iter().enumerate().map(|(i, c)| (*c, i))); + // a new letter key -> the old letter key let conversion_index: HashMap = HashMap::from_iter(self.alphabet_index.iter().map(|(k, v)| (combined_index[k], *v))); let n2 = combined_vec.len(); + // rewrite the rules into a new table, filling blanks with a new fail state let rules: Vec = (0..m*n2).map( |i| { let (j, k) = (i/n2, i%n2); @@ -334,6 +361,7 @@ impl RegexpDFA { return RegexpDFA{rules, end_states: self.end_states.clone(), alphabet_index: combined_index}.reduce().normalize(); } + /// Create a new automaton, with a Cartesian product of the arguments' states and corresponding transition rules. 
fn build_product_automaton(&self, other: &RegexpDFA) -> RegexpDFA { let n = self.alphabet_index.len(); let m = other.rules.len() / n; @@ -342,6 +370,7 @@ impl RegexpDFA { let mut rules = vec![]; let mut end_states = HashSet::new(); + // expand each self state into m new states, one for each of the `other` states for s1 in 0..k { let row1 = &self.rules[s1*n..(s1+1)*n]; for s2 in 0..m { diff --git a/src/regexp/token.rs b/src/regexp/token.rs --- a/src/regexp/token.rs +++ b/src/regexp/token.rs @@ -18,7 +18,7 @@ impl fmt::Display for ParsingError { write!(f, "An opening parenthesis not found. Pattern \"{s}\", position {pos}") }, ParsingError::ClosingParenthesis {s, pos} => { - write!(f, "An closing parenthesis not found. Pattern \"{s}\", position {pos}") + write!(f, "A closing parenthesis not found. Pattern \"{s}\", position {pos}") }, ParsingError::EmptyAlternativeVariant => { write!(f, "Found an empty Alternative variant.") @@ -27,27 +27,34 @@ impl fmt::Display for ParsingError { } } +/// A single letter or other alphabet symbol. pub struct Symbol { + /// Symbol position in the regular expression. position: usize } +/// A unary operator specifying its content to occur zero or more times. pub struct Asterisk { content: Box } +/// An operator with a variable number of arguments, specifying exchangeable alternatives. pub struct Alternative { content: Vec> } +/// An operator expressing a concatenation of its content Tokens. pub struct Chain { content: Vec> } +/// Enum encapsulating possible items of a regular expression. pub enum Token { - Lambda, + Lambda, // An empty string, useful as an `Alternative`. Symbol(Symbol), Asterisk(Asterisk), Alternative(Alternative), + // A special token to temporarily separate Alternative variants. Removed in the Alternative constructor. AlternativeSeparator, Chain(Chain) } @@ -89,6 +96,8 @@ impl Asterisk { } impl Alternative { + /// Split a sequence of `Tokens` by `AlternativeSeparator` into alternative variants and return the result. 
+ /// If any variant is empty, return an `Err`. fn new(content: Vec>) -> Result { let mut variants: Vec>> = vec![Vec::new()]; @@ -112,7 +121,7 @@ impl Alternative { } fn is_skippable(&self) -> bool { - return self.content.iter().any(|x| x.is_skippable()); + return self.content.iter().any(|x| x.is_skippable()); } fn list_first(&self) -> Vec { @@ -193,51 +202,57 @@ impl Chain { } impl Token { + /// Decide whether the `Token` has to contain some `Symbols`, + /// or whether it can be stepped over when looking for first, last and neighbouring items. pub fn is_skippable(&self) -> bool { match self { Token::Lambda => true, Token::Symbol(_) => false, Token::Asterisk(_) => true, Token::Alternative(t) => t.is_skippable(), - Token::AlternativeSeparator => panic!(), + Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.is_skippable() } } + /// List all possible string positions the token can start with. pub fn list_first(&self) -> Vec { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_first(), Token::Asterisk(t) => t.list_first(), Token::Alternative(t) => t.list_first(), - Token::AlternativeSeparator => panic!(), + Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_first() } } + /// List all possible string positions the token can end with. pub fn list_last(&self) -> Vec { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_last(), Token::Asterisk(t) => t.list_last(), Token::Alternative(t) => t.list_last(), - Token::AlternativeSeparator => panic!(), + Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_last() } } + /// List positions of all possibly neighbouring subtokens. 
pub fn list_neighbours(&self) -> Vec<(usize, usize)> { match self { Token::Lambda => vec![], Token::Symbol(t) => t.list_neighbours(), Token::Asterisk(t) => t.list_neighbours(), Token::Alternative(t) => t.list_neighbours(), - Token::AlternativeSeparator => panic!(), + Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"), Token::Chain(t) => t.list_neighbours() } } } +/// For a string starting with a parenthesis, find its matching closing parenthesis, or return None. fn find_closing_parenthesis(s: &String) -> Option { let chars: Vec = s.chars().collect(); let mut counter = 0; @@ -251,6 +266,10 @@ fn find_closing_parenthesis(s: &String) return None; } +/// Recursively parse the pattern into a `Token` tree. +/// +/// The `offset` defines where the `pattern` starts relative to the original pattern, +/// to record correct global token positions in the subcalls pub fn parse(pattern: &String, offset: usize) -> Result { let chars: Vec = pattern.chars().collect(); let mut res: Vec> = Vec::new(); diff --git a/tests/test_regexp.rs b/tests/test_regexp.rs --- a/tests/test_regexp.rs +++ b/tests/test_regexp.rs @@ -1,9 +1,9 @@ -use regexp::Regexp; +use regexp::RegexpNFA; use regexp::ParsingError; #[test] fn test_eval_basic_nfa() { - let r = Regexp::new(&String::from("abc")).unwrap(); + let r = RegexpNFA::new(&String::from("abc")).unwrap(); assert!(r.eval(String::from("abc"))); assert!(!r.eval(String::from("ab"))); assert!(!r.eval(String::from("abcd"))); @@ -11,7 +11,7 @@ fn test_eval_basic_nfa() { #[test] fn test_eval_basic_dfa() { - let r = Regexp::new(&String::from("abc")).unwrap().determinize().reduce().normalize(); + let r = RegexpNFA::new(&String::from("abc")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("abc"))); assert!(!r.eval(String::from("ab"))); assert!(!r.eval(String::from("abcd"))); @@ -19,22 +19,22 @@ fn test_eval_basic_dfa() { #[test] fn test_eval_empty_nfa() { - 
assert!(Regexp::new(&String::from("a*")).unwrap().eval(String::from(""))); - assert!(Regexp::new(&String::from("")).unwrap().eval(String::from(""))); - assert!(!Regexp::new(&String::from("")).unwrap().eval(String::from("a"))); - assert!(!Regexp::new(&String::from("a")).unwrap().eval(String::from(""))); + assert!(RegexpNFA::new(&String::from("a*")).unwrap().eval(String::from(""))); + assert!(RegexpNFA::new(&String::from("")).unwrap().eval(String::from(""))); + assert!(!RegexpNFA::new(&String::from("")).unwrap().eval(String::from("a"))); + assert!(!RegexpNFA::new(&String::from("a")).unwrap().eval(String::from(""))); } #[test] fn test_eval_empty_dfa() { - assert!(Regexp::new(&String::from("a*")).unwrap().determinize().reduce().normalize().eval(String::from(""))); - assert!(Regexp::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from(""))); - assert!(!Regexp::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from("a"))); + assert!(RegexpNFA::new(&String::from("a*")).unwrap().determinize().reduce().normalize().eval(String::from(""))); + assert!(RegexpNFA::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from(""))); + assert!(!RegexpNFA::new(&String::from("")).unwrap().determinize().reduce().normalize().eval(String::from("a"))); } #[test] fn test_eval_asterisk_nfa() { - let r = Regexp::new(&String::from("a*b*")).unwrap(); + let r = RegexpNFA::new(&String::from("a*b*")).unwrap(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("aabb"))); @@ -43,7 +43,7 @@ fn test_eval_asterisk_nfa() { #[test] fn test_eval_asterisk_dfa() { - let r = Regexp::new(&String::from("a*b*")).unwrap().determinize().normalize().reduce(); + let r = RegexpNFA::new(&String::from("a*b*")).unwrap().determinize().normalize().reduce(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("ab"))); assert!(r.eval(String::from("aabb"))); @@ -52,7 +52,7 @@ 
fn test_eval_asterisk_dfa() { #[test] fn test_eval_alternative_nfa() { - let r = Regexp::new(&String::from("a|b|c")).unwrap(); + let r = RegexpNFA::new(&String::from("a|b|c")).unwrap(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("b"))); assert!(r.eval(String::from("c"))); @@ -62,7 +62,7 @@ fn test_eval_alternative_nfa() { #[test] fn test_eval_alternative_dfa() { - let r = Regexp::new(&String::from("a|b|c")).unwrap().determinize().reduce().normalize(); + let r = RegexpNFA::new(&String::from("a|b|c")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from("b"))); assert!(r.eval(String::from("c"))); @@ -72,12 +72,12 @@ fn test_eval_alternative_dfa() { #[test] fn test_eval_lambda_nfa() { - let r = Regexp::new(&String::from("a_")).unwrap(); + let r = RegexpNFA::new(&String::from("a_")).unwrap(); assert!(r.eval(String::from("a"))); assert!(!r.eval(String::from(""))); assert!(!r.eval(String::from("ab"))); - let r = Regexp::new(&String::from("a|_")).unwrap(); + let r = RegexpNFA::new(&String::from("a|_")).unwrap(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from(""))); assert!(!r.eval(String::from("b"))); @@ -85,12 +85,12 @@ fn test_eval_lambda_nfa() { #[test] fn test_eval_lambda_dfa() { - let r = Regexp::new(&String::from("a_")).unwrap().determinize().reduce().normalize(); + let r = RegexpNFA::new(&String::from("a_")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(!r.eval(String::from(""))); assert!(!r.eval(String::from("ab"))); - let r = Regexp::new(&String::from("a|_")).unwrap().determinize().reduce().normalize(); + let r = RegexpNFA::new(&String::from("a|_")).unwrap().determinize().reduce().normalize(); assert!(r.eval(String::from("a"))); assert!(r.eval(String::from(""))); assert!(!r.eval(String::from("b"))); @@ -98,36 +98,36 @@ fn test_eval_lambda_dfa() { #[test] fn test_invalid_asterisk() { - let x = Regexp::new(&String::from("*")); + 
let x = RegexpNFA::new(&String::from("*")); assert!(matches!(x, Err(ParsingError::Asterisk{s: _, pos: 0}))); } #[test] fn test_invalid_closing_parenthesis() { - let x = Regexp::new(&String::from("(a")); + let x = RegexpNFA::new(&String::from("(a")); assert!(matches!(x, Err(ParsingError::ClosingParenthesis{s: _, pos: 0}))); } #[test] fn test_invalid_opening_parenthesis() { - let x = Regexp::new(&String::from("a)")); + let x = RegexpNFA::new(&String::from("a)")); assert!(matches!(x, Err(ParsingError::OpeningParenthesis{s: _, pos: 1}))); } #[test] fn test_invalid_empty_variant_start() { - let x = Regexp::new(&String::from("a(|b)")); + let x = RegexpNFA::new(&String::from("a(|b)")); assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); } #[test] fn test_invalid_empty_variant_end() { - let x = Regexp::new(&String::from("a|")); + let x = RegexpNFA::new(&String::from("a|")); assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); } #[test] fn test_invalid_empty_variant_middle() { - let x = Regexp::new(&String::from("a||b")); + let x = RegexpNFA::new(&String::from("a||b")); assert!(matches!(x, Err(ParsingError::EmptyAlternativeVariant))); }