Changeset - d2dfbcc8bb96
[Not reviewed]
default
0 1 0
Laman - 9 months ago 2024-07-03 21:33:39

refactoring: determinization indexing states with vecs instead of string hashes
1 file changed with 14 insertions and 25 deletions:
0 comments (0 inline, 0 general)
src/regexp.rs
Show inline comments
 
@@ -7,13 +7,6 @@ use token::parse;
 
const START_NFA: usize = usize::MAX;
 
const START_DFA: usize = 0;
 

	
 
fn encode_set(set: &HashSet<usize>) -> String {
 
	let mut v = Vec::from_iter(set.iter());
 
	v.sort();
 
	let res: Vec<String> = v.into_iter().map(|x| x.to_string()).collect();
 
	return res.join(",");
 
}
 

	
 
#[derive(Debug)]
 
pub struct Regexp {
 
	rules: HashMap<(usize, char), HashSet<usize>>,
 
@@ -86,18 +79,14 @@ impl Regexp {
 
		let mut end_states: HashSet<usize> = HashSet::new();
 
		if self.end_states.contains(&START_NFA) {end_states.insert(START_DFA);}
 

	
 
		// string hash -> single int DFA state
 
		let mut index_new = HashMap::from([(START_NFA.to_string(), START_DFA)]);
 
		// string hash -> HashSet NFA multistate
 
		let mut index_multi = HashMap::from([(START_NFA.to_string(), HashSet::from([START_NFA]))]);
 
		let mut stack = Vec::from([START_NFA.to_string()]);
 
		let mut index = HashMap::from([(vec![START_NFA], START_DFA)]);
 
		let mut stack = vec![vec![START_NFA]];
 

	
 
		while !stack.is_empty() {
 
			let state_hash = stack.pop().unwrap();
 
			let multistate = &index_multi[&state_hash];
 
			let multistate = stack.pop().unwrap();
 
			let mut new_rules: HashMap<char, HashSet<usize>> = HashMap::new();
 

	
 
			for key in self.rules.keys().filter(|(st, _c)| multistate.contains(st)) {
 
			for key in self.rules.keys().filter(|(st, _c)| multistate.binary_search(st).is_ok()) {
 
				let (_st, c) = key;
 
				if !new_rules.contains_key(c) {
 
					new_rules.insert(*c, HashSet::new());
 
@@ -108,23 +97,23 @@ impl Regexp {
 
			}
 

	
 
			for (c, target_set) in new_rules.into_iter() {
 
				let target_hash = encode_set(&target_set);
 
				let is_end = target_set.iter().any(|st| self.end_states.contains(st));
 
				if !index_new.contains_key(&target_hash) {
 
					let target_new = index_new.len();
 
					index_new.insert(target_hash.clone(), target_new);
 
					index_multi.insert(target_hash.clone(), target_set);
 
				let mut target_vec = Vec::from_iter(target_set.into_iter());
 
				target_vec.sort();
 
				let is_end = target_vec.iter().any(|st| self.end_states.contains(st));
 
				if !index.contains_key(&target_vec) {
 
					let target_new = index.len();
 
					index.insert(target_vec.clone(), target_new);
 
					compact_rules.extend(iter::repeat(FAIL).take(n));
 
					stack.push(target_hash.clone());
 
					stack.push(target_vec.clone());
 
				}
 
				compact_rules[index_new[&state_hash]*n + alphabet_index[&c]] = index_new[&target_hash];
 
				compact_rules[index[&multistate]*n + alphabet_index[&c]] = index[&target_vec];
 
				if is_end {
 
					end_states.insert(index_new[&target_hash]);
 
					end_states.insert(index[&target_vec]);
 
				}
 
			}
 
		}
 

	
 
		let fail = index_new.len();
 
		let fail = index.len();
 
		compact_rules = compact_rules.into_iter().map(|st| if st != FAIL {st} else {fail}).collect();
 
		compact_rules.extend(iter::repeat(fail).take(n));
 
		
0 comments (0 inline, 0 general)