# HG changeset patch # User Laman # Date 2024-06-16 16:04:44 # Node ID c827cb147cf5d073f50ad8ac88d8aa6d8f7d3eff # Parent a86ff65ec3210aaea7ba84221872d0f705778249 added the finite automaton determinization diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -141,8 +141,8 @@ def parse(pattern, offset=0): class Regexp: - def __init__(self, s): - (self.rules, self.end_states) = self._parse(s) + def __init__(self, pattern): + (self.rules, self.end_states) = self._parse(pattern) def _parse(self, s): r = parse(s) @@ -181,12 +181,56 @@ class Regexp: return any(st in self.end_states for st in current) + def determinize(self): + rules = dict() + end_states = {(-1,)} if -1 in self.end_states else set() + + stack = [(-1,)] + processed_states = set() + while stack: + multistate = stack.pop() + new_rules = dict() + + for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()): + if c not in new_rules: + new_rules[c] = set() + new_rules[c].update(target) + + for (c, target_set) in new_rules.items(): + new_target = tuple(sorted(target_set)) + rules[(multistate, c)] = new_target + if any(st in self.end_states for st in new_target): + end_states.add(new_target) + if new_target not in processed_states: + stack.append(new_target) + processed_states.add(new_target) + + return (rules, end_states) + + +class RegexpDFA: + def __init__(self, pattern): + r = Regexp(pattern) + (self.rules, self.end_states) = r.determinize() + + def match(self, s): + st = (-1,) + + for c in s: + key = (st, c) + if key in self.rules: + st = self.rules[key] + else: + return False + + return st in self.end_states + if __name__ == "__main__": tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"] for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"]: print("#", pattern) - r = Regexp(pattern) + r = RegexpDFA(pattern) for t in tests: print(t, r.match(t)) print() diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -181,6 +181,29 @@ fn parse(pattern: &String, offset: usize return Chain{content: res}; } +fn encode_set(set: &HashSet) -> u64 { + let mut res = 0; + for x in set.iter() { + res ^= 1<HashSet { + if x == START as u64 {return HashSet::from([START]);} + + let mut x = x; + let mut res: HashSet = HashSet::new(); + + while x > 0 { + let y = x.trailing_zeros(); + res.insert(y as usize); + x ^= 1 << y; + } + + return res; +} + struct Regexp { rules: HashMap<(usize, char), HashSet>, end_states: HashSet @@ -236,13 +259,72 @@ impl Regexp { return multistate.iter().any(|x| self.end_states.contains(x)); } + + fn determinize(&self) -> RegexpDFA { + let mut rules: HashMap<(u64, char), u64> = HashMap::new(); + let mut end_states: HashSet = HashSet::new(); + if self.end_states.contains(&START) {end_states.insert(START as u64);} + + let mut stack = Vec::from([START as u64]); + let mut processed_states = HashSet::new(); + while !stack.is_empty() { + let state = stack.pop().unwrap(); + let multistate = decode_set(state); + let mut new_rules: HashMap> = HashMap::new(); + + for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) { + let (_st, c) = key; + if !new_rules.contains_key(c) { + new_rules.insert(*c, HashSet::new()); + } + for target in &self.rules[key] { + new_rules.get_mut(c).unwrap().insert(*target); + } + } + + for (c, target_set) in new_rules.into_iter() { + let encoded_target = encode_set(&target_set); + rules.insert((state, c), encoded_target); + if target_set.iter().any(|st| self.end_states.contains(st)) { + end_states.insert(encoded_target); + } + if !processed_states.contains(&encoded_target) { + stack.push(encoded_target); + processed_states.insert(encoded_target); + } + } + } + + return RegexpDFA{rules, end_states}; + } +} + +struct RegexpDFA { + rules: HashMap<(u64, char), u64>, + end_states: HashSet +} + +impl RegexpDFA { + fn eval(&self, s: String) -> bool { + let mut state = START as u64; + + for c in s.chars() { + if let Some(x) = self.rules.get(&(state, c)) { + state = *x; + } else { + return false; + } + } + + return self.end_states.contains(&state); + } } fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] { println!("# {pattern}"); - let r = Regexp::new(&pattern.to_string()); + let r = Regexp::new(&pattern.to_string()).determinize(); for &t in tests.iter() { println!("{t} {}", r.eval(t.to_string())); }