# HG changeset patch # User Laman # Date 2024-06-30 16:15:19 # Node ID fd790596c35304cf2c2a24c50961e8ffe318c8f4 # Parent f0f9655e62ee1e688546dcbc3f8a51e3d88a65af implemented comparison of regexps with different alphabets diff --git a/regexp.py b/regexp.py --- a/regexp.py +++ b/regexp.py @@ -206,6 +206,14 @@ def parse(pattern, offset=0): return Chain(res) +def print_dfa(dfa, label=""): + n = len(dfa.alphabet_index) + print(label) + for i in range(0, len(dfa.rules), n): + print(i//n, dfa.rules[i:i+n]) + print(dfa.end_states) + + class Regexp: def __init__(self, pattern): r = parse(pattern) @@ -340,7 +348,9 @@ class RegexpDFA: if self.rules == r.rules and self.end_states == r.end_states: return None - product = self._build_product_automaton(r) + r1 = self._expand_alphabet(r.alphabet_index) + r2 = r._expand_alphabet(self.alphabet_index) + product = r1._build_product_automaton(r2) n = len(product.alphabet_index) reverse_alphabet_index = {v: k for (k, v) in product.alphabet_index.items()} @@ -401,6 +411,28 @@ class RegexpDFA: return (rules, end_states) + def _expand_alphabet(self, alphabet_index): + if alphabet_index == self.alphabet_index: + return self + + n1 = len(self.alphabet_index) + m = len(self.rules) // n1 + + combined_alphabet = set(self.alphabet_index.keys()) | set(alphabet_index.keys()) + combined_index = {c: i for (i, c) in enumerate(sorted(combined_alphabet))} + conversion_index = {v: combined_index[k] for (k, v) in self.alphabet_index.items()} + n2 = len(combined_alphabet) + + rules = [] + for i in range(0, len(self.rules), n1): + row = ([m]*n2) + for (j, st) in enumerate(self.rules[i:i+n1]): + row[conversion_index[j]] = st + rules.extend(row) + rules.extend([m]*n2) + + return RegexpDFA(rules, self.end_states, combined_index).reduce().normalize() + def _build_product_automaton(self, r): n = len(self.alphabet_index) m = len(r.rules) // n diff --git a/src/regexp.rs b/src/regexp.rs --- a/src/regexp.rs +++ b/src/regexp.rs @@ -132,6 +132,7 @@ impl Regexp { } } +#[derive(Clone)] pub struct RegexpDFA { rules: Vec, end_states: HashSet, @@ -206,7 +207,9 @@ impl RegexpDFA { return None; } - let product = self.build_product_automaton(other); + let r1 = self.expand_alphabet(&other.alphabet_index); + let r2 = other.expand_alphabet(&self.alphabet_index); + let product = r1.build_product_automaton(&r2); let n = product.alphabet_index.len(); let reverse_alphabet_index: HashMap = HashMap::from_iter(product.alphabet_index.iter().map(|(&k, &v)| (v, k))); @@ -284,6 +287,34 @@ impl RegexpDFA { return RegexpDFA{rules, end_states, alphabet_index: self.alphabet_index.clone()}; } + fn expand_alphabet(&self, alphabet_index: &HashMap) -> RegexpDFA { + if *alphabet_index == self.alphabet_index { + return self.clone(); + } + + let n1 = self.alphabet_index.len(); + let m = self.rules.len() / n1; + + let combined_alphabet: HashSet = HashSet::from_iter(self.alphabet_index.keys().chain(alphabet_index.keys()).copied()); + let mut combined_vec = Vec::from_iter(combined_alphabet.into_iter()); + combined_vec.sort(); + let combined_index = HashMap::from_iter(combined_vec.iter().enumerate().map(|(i, c)| (*c, i))); + let conversion_index: HashMap = HashMap::from_iter(self.alphabet_index.iter().map(|(k, v)| (*v, combined_index[k]))); + let n2 = combined_vec.len(); + + let mut rules = vec![]; + for i in 0..m { + let mut row = vec![m;n2]; + for (j, st) in self.rules[i*n1..(i+1)*n1].iter().enumerate() { + row[conversion_index[&j]] = *st; + } + rules.append(&mut row); + } + rules.append(&mut vec![m;n2]); + + return RegexpDFA{rules, end_states: self.end_states.clone(), alphabet_index: combined_index}.reduce().normalize(); + } + fn build_product_automaton(&self, other: &RegexpDFA) -> RegexpDFA { let n = self.alphabet_index.len(); let m = other.rules.len() / n;