Changeset - f03565ba6137
[Not reviewed]
default
0 3 0
Laman - 10 months ago 2024-07-02 23:16:28

find_equivalent_states implemented with Hopcroft's n log n algorithm
3 files changed with 111 insertions and 54 deletions:
0 comments (0 inline, 0 general)
regexp.py
Show inline comments
 
@@ -318,8 +318,8 @@ class RegexpDFA:
 
		return st in self.end_states
 

	
 
	def reduce(self):
 
		equivalents = self._find_equivalent_states()
 
		(rules, end_states) = self._collapse_states(equivalents)
 
		partition = self._find_equivalent_states()
 
		(rules, end_states) = self._collapse_states(partition)
 

	
 
		return RegexpDFA(rules, end_states, self.alphabet_index)
 

	
 
@@ -368,45 +368,70 @@ class RegexpDFA:
 

	
 
	def _find_equivalent_states(self):
 
		n = len(self.alphabet_index)
 
		state_list = list(range(len(self.rules) // n))
 
		equivalents = {(s1, s2) for (i, s1) in enumerate(state_list) for s2 in state_list[i+1:] if (s1 in self.end_states) == (s2 in self.end_states)}
 
		m = len(self.rules) // n
 
		inverse_rules = [set() for i in range(m*n)]
 

	
 
		for i in range(m):
 
			for j in range(n):
 
				target = self.rules[i*n + j]
 
				inverse_rules[target*n + j].add(i)
 

	
 
		set_bag = [self.end_states, set(range(m))-self.end_states]
 
		res = {0, 1}
 
		work = {0, 1}
 

	
 
		while work:
 
			key = work.pop()
 
			target_set = set_bag[key]
 
			for j in range(n):
 
				source_set = set(itertools.chain.from_iterable(inverse_rules[t*n + j] for t in target_set))
 
				for k in res.copy():
 
					part = set_bag[k]
 
					intersection = part & source_set
 
					diff = part - source_set
 
					if not intersection or not diff:
 
						continue
 
					res.remove(k)
 
					ki = len(set_bag)
 
					set_bag.append(intersection)
 
					res.add(ki)
 
					kd = len(set_bag)
 
					set_bag.append(diff)
 
					res.add(kd)
 
					if k in work:
 
						work.remove(k)
 
						work.add(ki)
 
						work.add(kd)
 
					elif len(intersection) < len(diff):
 
						work.add(ki)
 
					else:
 
						work.add(kd)
 
		
 
		ctrl = True
 
		while ctrl:
 
			ctrl = False
 
			for (s1, s2) in equivalents.copy():
 
				for ci in range(n):
 
					t1 = self.rules[s1*n + ci]
 
					t2 = self.rules[s2*n + ci]
 
					key = (min(t1, t2), max(t1, t2))
 
					if t1 != t2 and key not in equivalents:
 
						equivalents.remove((s1, s2))
 
						ctrl = True
 
						break
 
		
 
		return equivalents
 
		return [set_bag[k] for k in res]
 
	
 
	def _collapse_states(self, equivalents):
 
	def _collapse_states(self, partition):
 
		n = len(self.alphabet_index)
 
		rules = []
 

	
 
		eq_mapping = dict()
 
		for (s1, s2) in equivalents:
 
			eq_mapping[s2] = min(s1, eq_mapping.get(s2, math.inf))
 
		for eq_set in partition:
 
			states = sorted(eq_set)
 
			for st in states:
 
				eq_mapping[st] = states[0]
 

	
 
		discard_mapping = dict()
 
		discard_count = 0
 

	
 
		for i in range(0, len(self.rules), n):
 
			si = i//n
 
			if si in eq_mapping:
 
			if eq_mapping[si] != si:
 
				discard_count += 1
 
				continue
 
			discard_mapping[si] = si - discard_count
 
			rules.extend(map(lambda st: eq_mapping.get(st, st), self.rules[i:i+n]))
 
			rules.extend(map(lambda st: eq_mapping[st], self.rules[i:i+n]))
 
		
 
		rules = [discard_mapping[st] for st in rules]
 
		end_states = {discard_mapping[eq_mapping.get(st, st)] for st in self.end_states}
 
		end_states = {discard_mapping[eq_mapping[st]] for st in self.end_states}
 
		
 
		return (rules, end_states)
 

	
 
@@ -452,7 +477,7 @@ class RegexpDFA:
 

	
 
def test():
 
	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
 
	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"]:
 
	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"]:
 
		print("#", pattern)
 
		try:
 
			r = RegexpDFA.create(pattern).reduce().normalize()
src/main.rs
Show inline comments
 
@@ -3,7 +3,7 @@ use regexp::Regexp;
 

	
 
fn test() {
 
	let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"];
 
	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"] {
 
	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"] {
 
		println!("# {pattern}");
 
		let r = match Regexp::new(&pattern.to_string()) {
 
			Ok(r1) => r1.determinize().reduce().normalize(),
src/regexp.rs
Show inline comments
 
@@ -168,8 +168,8 @@ impl RegexpDFA {
 
	}
 

	
 
	pub fn reduce(&self) -> RegexpDFA {
 
		let equivalents = self.find_equivalent_states();
 
		return self.collapse_states(equivalents);
 
		let partition = self.find_equivalent_states();
 
		return self.collapse_states(partition);
 
	}
 

	
 
	pub fn normalize(&self) -> RegexpDFA {
 
@@ -230,42 +230,74 @@ impl RegexpDFA {
 
		panic!();
 
	}
 

	
 
	fn find_equivalent_states(&self) -> Vec<(usize, usize)> {
 
	fn find_equivalent_states(&self) -> Vec<HashSet<usize>> {
 
		let n = self.alphabet_index.len();
 
		let state_vec: Vec<usize> = (0..self.rules.len()/n).collect();
 
		let mut equivalents = HashSet::new();
 
		state_vec.iter().enumerate().for_each(|(i, s1)| {
 
			equivalents.extend(
 
				state_vec[i+1..].iter()
 
				.filter(|s2| !(self.end_states.contains(s1)^self.end_states.contains(s2)))
 
				.map(|s2| (*s1, *s2))
 
			);
 
		});
 
		let m = self.rules.len() / n;
 
		let mut inverse_rules = vec![HashSet::<usize>::new();m*n];
 

	
 
		let mut m = usize::MAX;
 
		while equivalents.len() < m {
 
			m = equivalents.len();
 
			equivalents = equivalents.iter().filter(|(s1, s2)| {
 
				!(0..n).any(|ci| {
 
					let t1 = self.rules[s1*n + ci];
 
					let t2 = self.rules[s2*n + ci];
 
					let key = (t1.min(t2), t2.max(t1));
 
					return t1 != t2 && !equivalents.contains(&key);
 
				})
 
			}).copied().collect();
 
		for i in 0..m {
 
			for j in 0..n {
 
				let target = self.rules[i*n + j];
 
				inverse_rules[target*n + j].insert(i);
 
			}
 
		}
 

	
 
		return Vec::from_iter(equivalents.into_iter());
 
		let mut set_bag = vec![
 
			self.end_states.clone(),
 
			HashSet::from_iter(0..m).difference(&self.end_states).copied().collect()
 
		];
 
		let mut res = HashSet::<usize>::from([0, 1]);
 
		let mut work = HashSet::<usize>::from([0, 1]);
 

	
 
		while !work.is_empty() {
 
			let key = *work.iter().next().unwrap();
 
			work.remove(&key);
 
			let target_set = set_bag[key].clone();
 
			for j in 0..n {
 
				let source_set = HashSet::<usize>::from_iter(target_set.iter().flat_map(|&t| inverse_rules[t*n+j].iter()).copied());
 
				for k in res.clone().into_iter() {
 
					let part = &set_bag[k];
 
					let intersection = part.intersection(&source_set).copied().collect::<HashSet<usize>>();
 
					let diff = part.difference(&source_set).copied().collect::<HashSet<usize>>();
 
					if intersection.is_empty() || diff.is_empty() {
 
						continue;
 
					}
 
					res.remove(&k);
 
					let ki = set_bag.len();
 
					set_bag.push(intersection);
 
					res.insert(ki);
 
					let kd = set_bag.len();
 
					set_bag.push(diff);
 
					res.insert(kd);
 
					if work.contains(&k) {
 
						work.remove(&k);
 
						work.insert(ki);
 
						work.insert(kd);
 
					} else if set_bag[ki].len() < set_bag[kd].len() {
 
						work.insert(ki);
 
					} else {
 
						work.insert(kd);
 
					}
 
				}
 
			}
 
		}
 

	
 
		return Vec::from_iter(res.into_iter().map(|k| std::mem::take(&mut set_bag[k])));
 
	}
 

	
 
	fn collapse_states(&self, equivalents: Vec<(usize, usize)>) -> RegexpDFA {
 
	fn collapse_states(&self, partition: Vec<HashSet<usize>>) -> RegexpDFA {
 
		let n = self.alphabet_index.len();
 
		let m = self.rules.len()/n;
 
		let mut rules = Vec::new();
 
		let mut rules = vec![];
 

	
 
		let mut eq_mapping: Vec<usize> = ((0..m)).collect();
 
		for (s1, s2) in equivalents.into_iter() {
 
			eq_mapping[s2] = eq_mapping[s2].min(s1);
 
		for eq_set in partition.into_iter() {
 
			let mut eq_vec = Vec::from_iter(eq_set.into_iter());
 
			eq_vec.sort();
 
			let label = eq_vec[0];
 
			for st in eq_vec.into_iter() {
 
				eq_mapping[st] = label;
 
			}
 
		}
 

	
 
		let mut discard_mapping: Vec<usize> = ((0..m)).collect();
0 comments (0 inline, 0 general)