Files @ 61a2b8f09823
Branch filter:

Location: Regular-Expresso/src/regexp.rs

61a2b8f09823 6.0 KiB application/rls-services+xml Show Annotation Show as Raw Download as Raw
Laman
refactoring: states represented by basic integers
use std::collections::{HashMap, HashSet, VecDeque};

mod token;
pub use token::ParsingError;
use token::parse;

const START: i32 = -1;
const FAIL: i32 = -2;

fn encode_set(set: &HashSet<i32>) -> String {
	let mut v = Vec::from_iter(set.iter());
	v.sort();
	let res: Vec<String> = v.into_iter().map(|x| x.to_string()).collect();
	return res.join(",");
}

#[derive(Debug)]
pub struct Regexp {
	rules: HashMap<(i32, char), HashSet<i32>>,
	end_states: HashSet<i32>
}

impl Regexp {
	pub fn new(pattern: &String) -> Result<Regexp, ParsingError> {
		let r = parse(pattern, 0)?;
		let pattern_chars = Vec::from_iter(pattern.chars());
		let mut rules: HashMap<(i32, char), HashSet<i32>> = HashMap::new();
		
		for i in r.list_first() {
			let c = pattern_chars[i];
			let key = (START, c);
			match rules.get_mut(&key) {
				Some(set) => {set.insert(i as i32);},
				None => {rules.insert(key, HashSet::from([i as i32]));}
			};
		}

		for (i, j) in r.list_neighbours() {
			let c = pattern_chars[j];
			let key = (i as i32, c);
			match rules.get_mut(&key) {
				Some(set) => {set.insert(j as i32);},
				None => {rules.insert(key, HashSet::from([j as i32]));}
			};
		}

		let mut end_states = HashSet::from_iter(r.list_last().into_iter().map(|i| i as i32));
		if r.is_skippable() {
			end_states.insert(START);
		}

		return Ok(Regexp{rules, end_states});
	}

	pub fn eval(&self, s: String) -> bool {
		let mut multistate = HashSet::from([START]);

		for c in s.chars() {
			let mut new_multistate = HashSet::new();

			for state in multistate {
				if let Some(x) = self.rules.get(&(state, c)) {
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
				} else if let Some(x) = self.rules.get(&(state, '.')) {
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
				}
			}
			multistate = new_multistate;
		}

		return multistate.iter().any(|x| self.end_states.contains(x));
	}

	pub fn determinize(&self) -> RegexpDFA {
		let mut rules: HashMap<(i32, char), i32> = HashMap::new();
		let mut end_states: HashSet<i32> = HashSet::new();
		if self.end_states.contains(&START) {end_states.insert(START);}

		let mut index_new = HashMap::from([(START.to_string(), START)]);
		let mut index_multi = HashMap::from([(START.to_string(), HashSet::from([START]))]);
		let mut stack = Vec::from([START.to_string()]);

		while !stack.is_empty() {
			let state_hash = stack.pop().unwrap();
			let multistate = &index_multi[&state_hash];
			let mut new_rules: HashMap<char, HashSet<i32>> = HashMap::new();

			for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) {
				let (_st, c) = key;
				if !new_rules.contains_key(c) {
					new_rules.insert(*c, HashSet::new());
				}
				for target in &self.rules[key] {
					new_rules.get_mut(c).unwrap().insert(*target);
				}
			}

			for (c, target_set) in new_rules.into_iter() {
				let target_hash = encode_set(&target_set);
				let is_end = target_set.iter().any(|st| self.end_states.contains(st));
				if !index_new.contains_key(&target_hash) {
					let target_new = index_new.len() as i32;
					index_new.insert(target_hash.clone(), target_new);
					index_multi.insert(target_hash.clone(), target_set);
					stack.push(target_hash.clone());
				}
				rules.insert((index_new[&state_hash], c), index_new[&target_hash]);
				if is_end {
					end_states.insert(index_new[&target_hash]);
				}
			}
		}

		return RegexpDFA{rules, end_states};
	}
}

pub struct RegexpDFA {
	rules: HashMap<(i32, char), i32>,
	end_states: HashSet<i32>
}

impl RegexpDFA {
	pub fn eval(&self, s: String) -> bool {
		let mut state = START;

		for c in s.chars() {
			if let Some(x) = self.rules.get(&(state, c)) {
				state = *x;
			} else {
				return false;
			}
		}

		return self.end_states.contains(&state);
	}

	pub fn reduce(&self) -> RegexpDFA {
		let equivalents = self.find_equivalent_states();
		return self.collapse_states(equivalents);
	}

	pub fn normalize(&self) -> RegexpDFA {
		let mut index = HashMap::from([(START, START)]);
		let mut queue = VecDeque::from([START]);

		while !queue.is_empty() {
			let state = queue.pop_front().unwrap();
			let mut edges: Vec<((i32, char), i32)> = self.rules.iter()
				.filter(|((st, c), t)| *st == state)
				.map(|((st, c), t)| ((*st, *c), *t)).collect();
			edges.sort();
			for ((_st, _c), t) in edges {
				if !index.contains_key(&t) {
					index.insert(t, index.len() as i32);
					queue.push_back(t);
				}
			}
		}

		let rules = self.rules.iter().map(|((st, c), t)| ((index[st], *c), index[t])).collect();
		let end_states = self.end_states.iter().map(|st| index[st]).collect();
		
		return RegexpDFA{rules, end_states};
	}

	fn find_equivalent_states(&self) -> Vec<(i32, i32)> {
		let state_set: HashSet<i32> = HashSet::from_iter(self.rules.values().copied());
		let mut state_vec: Vec<i32> = Vec::from_iter(state_set.into_iter());
		state_vec.push(START);
		state_vec.push(FAIL);
		state_vec.sort();
		let alphabet: HashSet<char> = self.rules.keys().map(|(_st, c)| c).copied().collect();

		let mut equivalents = HashSet::new();
		state_vec.iter().enumerate().for_each(|(i, s1)| {
			equivalents.extend(
				state_vec[i+1..].iter()
				.filter(|s2| !(self.end_states.contains(s1)^self.end_states.contains(s2)))
				.map(|s2| (*s1, *s2))
			);
		});

		let mut n = usize::MAX;
		while equivalents.len() < n {
			n = equivalents.len();
			equivalents = equivalents.iter().filter(|(s1, s2)| {
				!alphabet.iter().any(|c| {
					let t1 = self.rules.get(&(*s1, *c)).unwrap_or(&FAIL);
					let t2 = self.rules.get(&(*s2, *c)).unwrap_or(&FAIL);
					let key = (*t1.min(t2), *t1.max(t2));
					return t1 != t2 && !equivalents.contains(&key);
				})
			}).copied().collect();
		}

		return Vec::from_iter(equivalents.into_iter());
	}

	fn collapse_states(&self, mut equivalents: Vec<(i32, i32)>) -> RegexpDFA {
		let mut rules = self.rules.clone();
		let mut end_states = self.end_states.clone();
		equivalents.sort();

		for (s1, s2) in equivalents.into_iter() {
			rules = rules.into_iter()
				.filter(|((st, _c), _t)| *st != s2)
				.map(|(key, t)| (key, if t==s2 {s1} else {t})).collect();
			end_states.remove(&s2);
		}

		return RegexpDFA{rules, end_states};
	}
}