diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -5,6 +5,7 @@ pub use token::ParsingError; use token::parse; const START: usize = usize::MAX; +const FAIL: usize = START-1; fn encode_set(set: &HashSet) -> u64 { let mut res = 0; @@ -144,4 +145,58 @@ impl RegexpDFA { return self.end_states.contains(&state); } + + pub fn reduce(&self) -> RegexpDFA { + let equivalents = self.find_equivalent_states(); + return self.collapse_states(equivalents); + } + + fn find_equivalent_states(&self) -> Vec<(u64, u64)> { + let state_set: HashSet = HashSet::from_iter(self.rules.values().copied()); + let mut state_vec: Vec = Vec::from_iter(state_set.into_iter()); + state_vec.push(START as u64); + state_vec.push(FAIL as u64); + state_vec.sort(); + state_vec.reverse(); + let alphabet: HashSet = self.rules.keys().map(|(_st, c)| c).copied().collect(); + + let mut equivalents = HashSet::new(); + state_vec.iter().enumerate().for_each(|(i, s1)| { + equivalents.extend( + state_vec[i+1..].iter() + .filter(|s2| !(self.end_states.contains(s1)^self.end_states.contains(s2))) + .map(|s2| (*s1, *s2)) + ); + }); + + let mut n = usize::MAX; + while equivalents.len() < n { + n = equivalents.len(); + equivalents = equivalents.iter().filter(|(s1, s2)| { + !alphabet.iter().any(|c| { + let t1 = self.rules.get(&(*s1, *c)).unwrap_or(&(FAIL as u64)); + let t2 = self.rules.get(&(*s2, *c)).unwrap_or(&(FAIL as u64)); + let key = (*t1.min(t2), *t1.max(t2)); + return t1 != t2 && !equivalents.contains(&key); + }) + }).copied().collect(); + } + + return Vec::from_iter(equivalents.into_iter()); + } + + fn collapse_states(&self, mut equivalents: Vec<(u64, u64)>) -> RegexpDFA { + let mut rules = self.rules.clone(); + let mut end_states = self.end_states.clone(); + equivalents.sort(); + + for (s1, s2) in equivalents.into_iter() { + rules = rules.into_iter() + .filter(|((st, _c), _t)| *st != s2) + .map(|(key, t)| (key, if t==s2 {s1} else {t})).collect(); + end_states.remove(&s2); + } + + return RegexpDFA{rules, end_states}; + } }