use std::collections::{HashMap, HashSet, VecDeque}; mod token; pub use token::ParsingError; use token::parse; const START: i32 = -1; const FAIL: i32 = -2; fn encode_set(set: &HashSet) -> i32 { let mut res = 0; for x in set.iter() { assert!(*x >= 0); res ^= 1< HashSet { if x == START {return HashSet::from([START]);} let mut x = x; let mut res: HashSet = HashSet::new(); while x > 0 { let y = x.trailing_zeros(); res.insert(y as i32); x ^= 1 << y; } return res; } #[derive(Debug)] pub struct Regexp { rules: HashMap<(i32, char), HashSet>, end_states: HashSet } impl Regexp { pub fn new(pattern: &String) -> Result { let r = parse(pattern, 0)?; let pattern_chars = Vec::from_iter(pattern.chars()); let mut rules: HashMap<(i32, char), HashSet> = HashMap::new(); for i in r.list_first() { let c = pattern_chars[i]; let key = (START, c); match rules.get_mut(&key) { Some(set) => {set.insert(i as i32);}, None => {rules.insert(key, HashSet::from([i as i32]));} }; } for (i, j) in r.list_neighbours() { let c = pattern_chars[j]; let key = (i as i32, c); match rules.get_mut(&key) { Some(set) => {set.insert(j as i32);}, None => {rules.insert(key, HashSet::from([j as i32]));} }; } let mut end_states = HashSet::from_iter(r.list_last().into_iter().map(|i| i as i32)); if r.is_skippable() { end_states.insert(START); } return Ok(Regexp{rules, end_states}); } pub fn eval(&self, s: String) -> bool { let mut multistate = HashSet::from([START]); for c in s.chars() { let mut new_multistate = HashSet::new(); for state in multistate { if let Some(x) = self.rules.get(&(state, c)) { new_multistate = new_multistate.union(&x).map(|&y| y).collect(); } else if let Some(x) = self.rules.get(&(state, '.')) { new_multistate = new_multistate.union(&x).map(|&y| y).collect(); } } multistate = new_multistate; } return multistate.iter().any(|x| self.end_states.contains(x)); } pub fn determinize(&self) -> RegexpDFA { let mut rules: HashMap<(i32, char), i32> = HashMap::new(); let mut end_states: HashSet = HashSet::new(); if self.end_states.contains(&START) {end_states.insert(START);} let mut stack = Vec::from([START]); let mut processed_states = HashSet::new(); while !stack.is_empty() { let state = stack.pop().unwrap(); let multistate = decode_set(state); let mut new_rules: HashMap> = HashMap::new(); for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) { let (_st, c) = key; if !new_rules.contains_key(c) { new_rules.insert(*c, HashSet::new()); } for target in &self.rules[key] { new_rules.get_mut(c).unwrap().insert(*target); } } for (c, target_set) in new_rules.into_iter() { let encoded_target = encode_set(&target_set); rules.insert((state, c), encoded_target); if target_set.iter().any(|st| self.end_states.contains(st)) { end_states.insert(encoded_target); } if !processed_states.contains(&encoded_target) { stack.push(encoded_target); processed_states.insert(encoded_target); } } } return RegexpDFA{rules, end_states}; } } pub struct RegexpDFA { rules: HashMap<(i32, char), i32>, end_states: HashSet } impl RegexpDFA { pub fn eval(&self, s: String) -> bool { let mut state = START; for c in s.chars() { if let Some(x) = self.rules.get(&(state, c)) { state = *x; } else { return false; } } return self.end_states.contains(&state); } pub fn reduce(&self) -> RegexpDFA { let equivalents = self.find_equivalent_states(); return self.collapse_states(equivalents); } pub fn normalize(&self) -> RegexpDFA { let mut index = HashMap::from([(START, START)]); let mut queue = VecDeque::from([START]); while !queue.is_empty() { let state = queue.pop_front().unwrap(); let mut edges: Vec<((i32, char), i32)> = self.rules.iter() .filter(|((st, c), t)| *st == state) .map(|((st, c), t)| ((*st, *c), *t)).collect(); edges.sort(); for ((_st, _c), t) in edges { if !index.contains_key(&t) { index.insert(t, index.len() as i32); queue.push_back(t); } } } let rules = self.rules.iter().map(|((st, c), t)| ((index[st], *c), index[t])).collect(); let end_states = self.end_states.iter().map(|st| index[st]).collect(); return RegexpDFA{rules, end_states}; } fn find_equivalent_states(&self) -> Vec<(i32, i32)> { let state_set: HashSet = HashSet::from_iter(self.rules.values().copied()); let mut state_vec: Vec = Vec::from_iter(state_set.into_iter()); state_vec.push(START); state_vec.push(FAIL); state_vec.sort(); let alphabet: HashSet = self.rules.keys().map(|(_st, c)| c).copied().collect(); let mut equivalents = HashSet::new(); state_vec.iter().enumerate().for_each(|(i, s1)| { equivalents.extend( state_vec[i+1..].iter() .filter(|s2| !(self.end_states.contains(s1)^self.end_states.contains(s2))) .map(|s2| (*s1, *s2)) ); }); let mut n = usize::MAX; while equivalents.len() < n { n = equivalents.len(); equivalents = equivalents.iter().filter(|(s1, s2)| { !alphabet.iter().any(|c| { let t1 = self.rules.get(&(*s1, *c)).unwrap_or(&FAIL); let t2 = self.rules.get(&(*s2, *c)).unwrap_or(&FAIL); let key = (*t1.min(t2), *t1.max(t2)); return t1 != t2 && !equivalents.contains(&key); }) }).copied().collect(); } return Vec::from_iter(equivalents.into_iter()); } fn collapse_states(&self, mut equivalents: Vec<(i32, i32)>) -> RegexpDFA { let mut rules = self.rules.clone(); let mut end_states = self.end_states.clone(); equivalents.sort(); for (s1, s2) in equivalents.into_iter() { rules = rules.into_iter() .filter(|((st, _c), _t)| *st != s2) .map(|(key, t)| (key, if t==s2 {s1} else {t})).collect(); end_states.remove(&s2); } return RegexpDFA{rules, end_states}; } }