diff --git a/src/regexp.rs b/src/regexp.rs
--- a/src/regexp.rs
+++ b/src/regexp.rs
@@ -7,15 +7,16 @@ use token::parse;
 const START_NFA: usize = usize::MAX;
 const START_DFA: usize = 0;
 
+/// A regular expression implemented as a non-deterministic finite automaton.
 #[derive(Debug)]
-pub struct Regexp {
+pub struct RegexpNFA {
     rules: HashMap<(usize, char), HashSet<usize>>,
     end_states: HashSet<usize>,
     alphabet: Vec<char>
 }
 
-impl Regexp {
-    pub fn new(pattern: &String) -> Result<Regexp, ParsingError> {
+impl RegexpNFA {
+    pub fn new(pattern: &String) -> Result<RegexpNFA, ParsingError> {
         let r = parse(pattern, 0)?;
         let pattern_chars = Vec::from_iter(pattern.chars());
         let mut rules: HashMap<(usize, char), HashSet<usize>> = HashMap::new();
@@ -49,9 +50,10 @@ impl Regexp {
         let mut alphabet_vec = Vec::from_iter(alphabet.into_iter());
         alphabet_vec.sort();
 
-        return Ok(Regexp{rules, end_states, alphabet: alphabet_vec});
+        return Ok(RegexpNFA{rules, end_states, alphabet: alphabet_vec});
     }
 
+    /// Decide if a string matches the regexp.
     pub fn eval(&self, s: String) -> bool {
         let mut multistate = HashSet::from([START_NFA]);
 
@@ -71,6 +73,7 @@ impl Regexp {
         return multistate.iter().any(|x| self.end_states.contains(x));
     }
 
+    /// Convert the non-deterministic finite automaton into a deterministic one.
     pub fn determinize(&self) -> RegexpDFA {
         const FAIL: usize = usize::MAX;
         let alphabet_index: HashMap<char, usize> = self.alphabet.iter().enumerate().map(|(i, c)| (*c, i)).collect();
@@ -79,6 +82,7 @@ impl Regexp {
         let mut end_states: HashSet<usize> = HashSet::new();
         if self.end_states.contains(&START_NFA) {end_states.insert(START_DFA);}
 
+        // mapping the NFA state subsets -> DFA states
         let mut index = HashMap::from([(vec![START_NFA], START_DFA)]);
         let mut stack = vec![vec![START_NFA]];
 
@@ -86,6 +90,7 @@ impl Regexp {
             let multistate = stack.pop().unwrap();
             let mut new_rules: HashMap<char, HashSet<usize>> = HashMap::new();
 
+            // collect all possible destination states from the multistate
             for key in self.rules.keys().filter(|(st, _c)| multistate.binary_search(st).is_ok()) {
                 let (_st, c) = key;
                 if !new_rules.contains_key(c) {
@@ -96,6 +101,7 @@
                 }
             }
 
+            // build a row for the DFA transition function table
             for (c, target_set) in new_rules.into_iter() {
                 let mut target_vec = Vec::from_iter(target_set.into_iter());
                 target_vec.sort();
@@ -113,6 +119,7 @@
             }
         }
 
+        // add a fail state, so the transition function is complete
        let fail = index.len();
        compact_rules = compact_rules.into_iter().map(|st| if st != FAIL {st} else {fail}).collect();
        compact_rules.extend(iter::repeat(fail).take(n));
@@ -121,6 +128,8 @@
     }
 }
 
+/// A regular expression implemented as a deterministic finite automaton.
+/// This simplifies support for more features compared to the NFA Regexp.
 #[derive(Clone)]
 pub struct RegexpDFA {
     rules: Vec<usize>,
@@ -129,10 +138,12 @@ pub struct RegexpDFA {
 }
 
 impl RegexpDFA {
+    /// Construct a DFA with the provided parameters, or a minimal DFA if the parameters are empty.
     pub fn new(rules: Vec<usize>, end_states: HashSet<usize>, alphabet_index: HashMap<char, usize>) -> RegexpDFA {
         if rules.len() > 0 {
             return RegexpDFA{rules, end_states, alphabet_index};
         } else {
+            // this saves us checking for an empty `alphabet_index` in other methods.
             return RegexpDFA{
                 rules: vec![1, 1],
                 end_states,
@@ -141,6 +152,7 @@ impl RegexpDFA {
         }
     }
 
+    /// Decide if a string matches the regexp.
     pub fn eval(&self, s: String) -> bool {
         let n = self.alphabet_index.len();
         let mut state = START_DFA;
@@ -156,11 +168,13 @@ impl RegexpDFA {
         return self.end_states.contains(&state);
     }
 
+    /// Minimize the automaton by collapsing equivalent states.
     pub fn reduce(&self) -> RegexpDFA {
         let partition = self.find_equivalent_states();
         return self.collapse_states(partition);
     }
 
+    /// Normalize the automaton by relabeling the states in a breadth first search walkthrough order and remove unreachable states.
     pub fn normalize(&self) -> RegexpDFA {
         let n = self.alphabet_index.len();
         let m = self.rules.len()/n;
@@ -190,6 +204,8 @@ impl RegexpDFA {
         return RegexpDFA{rules, end_states, alphabet_index: self.alphabet_index.clone()};
     }
 
+    /// Find the shortest string that is accepted by self or `r`, but not both.
+    /// It is expected that the automatons are already reduced and normalized.
     pub fn find_distinguishing_string(&self, other: &RegexpDFA) -> Option<String> {
         if self.rules == other.rules && self.end_states == other.end_states {
             return None;
@@ -199,8 +215,10 @@ impl RegexpDFA {
         let r2 = other.expand_alphabet(&self.alphabet_index);
         let product = r1.build_product_automaton(&r2);
         let n = product.alphabet_index.len();
-        let reverse_alphabet_index: HashMap<usize, char> = HashMap::from_iter(product.alphabet_index.iter().map(|(&k, &v)| (v, k)));
+        // mapping letter keys -> letters
+        let inverse_alphabet_index: HashMap<usize, char> = HashMap::from_iter(product.alphabet_index.iter().map(|(&k, &v)| (v, k)));
 
+        // look for an accepting state with a BFS
         let mut queue = VecDeque::from([(0, "".to_string())]);
         let mut visited = HashSet::new();
         while !queue.is_empty() {
@@ -210,15 +228,16 @@ impl RegexpDFA {
             }
             for (i, target) in product.rules[state*n..(state+1)*n].iter().enumerate() {
                 if !visited.contains(target) {
-                    queue.push_back((*target, acc.clone()+&String::from(reverse_alphabet_index[&i])));
+                    queue.push_back((*target, acc.clone()+&String::from(inverse_alphabet_index[&i])));
                     visited.insert(target);
                 }
             }
         }
 
-        panic!();
+        panic!("Failed to detect the Regexps as equivalent and failed to find a distinguishing string.");
     }
 
+    /// Partition states into their equivalence classes with Hopcroft's algorithm.
     fn find_equivalent_states(&self) -> Vec<HashSet<usize>> {
         let n = self.alphabet_index.len();
         let m = self.rules.len() / n;
@@ -231,6 +250,7 @@ impl RegexpDFA {
             }
         }
 
+        // store state subsets here so we can just pass around their indices
         let mut set_bag = vec![
             self.end_states.clone(),
             HashSet::from_iter(0..m).difference(&self.end_states).copied().collect()
@@ -274,11 +294,13 @@ impl RegexpDFA {
         return Vec::from_iter(res.into_iter().map(|k| std::mem::take(&mut set_bag[k])));
     }
 
+    /// Collapse equivalent states from each partition class into a single state.
     fn collapse_states(&self, partition: Vec<HashSet<usize>>) -> RegexpDFA {
         let n = self.alphabet_index.len();
         let m = self.rules.len()/n;
         let mut rules = vec![];
 
+        // states mapping due to the equivalents collapse
         let mut eq_mapping: Vec<usize> = ((0..m)).collect();
         for eq_set in partition.into_iter() {
             let mut eq_vec = Vec::from_iter(eq_set.into_iter());
@@ -289,6 +311,7 @@
             }
         }
 
+        // states mapping to keep the rules list compact after the equivalents collapse
         let mut discard_mapping: Vec<usize> = ((0..m)).collect();
         let mut discard_count = 0;
 
@@ -307,6 +330,7 @@
         return RegexpDFA{rules, end_states, alphabet_index: self.alphabet_index.clone()};
     }
 
+    /// Expand the automaton to accommodate union of `self.alphabet_index` and the provided `alphabet_index`.
     fn expand_alphabet(&self, alphabet_index: &HashMap<char, usize>) -> RegexpDFA {
         if *alphabet_index == self.alphabet_index {
             return self.clone();
@@ -318,10 +342,13 @@ impl RegexpDFA {
         let combined_alphabet: HashSet<char> = HashSet::from_iter(self.alphabet_index.keys().chain(alphabet_index.keys()).copied());
         let mut combined_vec = Vec::from_iter(combined_alphabet.into_iter());
         combined_vec.sort();
+        // a new alphabet_index
         let combined_index = HashMap::from_iter(combined_vec.iter().enumerate().map(|(i, c)| (*c, i)));
+        // a new letter key -> the old letter key
         let conversion_index: HashMap<usize, usize> = HashMap::from_iter(self.alphabet_index.iter().map(|(k, v)| (combined_index[k], *v)));
         let n2 = combined_vec.len();
 
+        // rewrite the rules into a new table, filling blanks with a new fail state
         let rules: Vec<usize> = (0..m*n2).map(
             |i| {
                 let (j, k) = (i/n2, i%n2);
@@ -334,6 +361,7 @@
         return RegexpDFA{rules, end_states: self.end_states.clone(), alphabet_index: combined_index}.reduce().normalize();
     }
 
+    /// Create a new automaton, with a carthesian product of the arguments' states and corresponding transition rules.
     fn build_product_automaton(&self, other: &RegexpDFA) -> RegexpDFA {
         let n = self.alphabet_index.len();
         let m = other.rules.len() / n;
@@ -342,6 +370,7 @@
         let mut rules = vec![];
         let mut end_states = HashSet::new();
 
+        // expand each self state into m new states, one for each of the `other` states
         for s1 in 0..k {
             let row1 = &self.rules[s1*n..(s1+1)*n];
             for s2 in 0..m {