Files @ c827cb147cf5
Branch filter:

Location: Regular-Expresso/src/main.rs

c827cb147cf5 7.2 KiB application/rls-services+xml Show Annotation Show as Raw Download as Raw
Laman
added the finite automaton determinization
use std::collections::{HashMap, HashSet};

const START: usize = usize::MAX;

trait Token {
	fn is_skippable(&self) -> bool {false}
	fn list_first(&self) -> Vec<usize>;
	fn list_last(&self) -> Vec<usize>;
	fn list_neighbours(&self) -> Vec<(usize, usize)>;
}

struct Symbol {
	position: usize,
	value: char
}

struct Asterisk {
	content: Box<dyn Token>
}

struct Plus {
	content: Box<dyn Token>
}

struct Chain {
	content: Vec<Box<dyn Token>>
}

impl Token for Symbol {
	fn list_first(&self) -> Vec<usize> {
		return vec![self.position];
	}

	fn list_last(&self) -> Vec<usize> {
		return vec![self.position];
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		return vec![];
	}
}

impl Token for Asterisk {
	fn is_skippable(&self) -> bool {true}

	fn list_first(&self) -> Vec<usize> {
		return self.content.list_first();
	}

	fn list_last(&self) -> Vec<usize> {
		return self.content.list_last();
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = self.content.list_neighbours();

		for x in self.list_last() {
			for y in self.list_first() {
				res.push((x, y));
			}
		}

		return res;
	}
}

impl Token for Plus {
	fn list_first(&self) -> Vec<usize> {
		return self.content.list_first();
	}

	fn list_last(&self) -> Vec<usize> {
		return self.content.list_last();
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = self.content.list_neighbours();

		for x in self.list_last() {
			for y in self.list_first() {
				res.push((x, y));
			}
		}

		return res;
	}
}

impl Token for Chain {
	fn is_skippable(&self) -> bool {
		return self.content.iter().all(|x| x.is_skippable());
	}

	fn list_first(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter() {
			res.append(&mut token.list_first());
			if !token.is_skippable() {break;}
		}

		return res;
	}

	fn list_last(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter().rev() {
			res.append(&mut token.list_last());
			if !token.is_skippable() {break;}
		}

		return res;
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = Vec::new();
		let mut previous: Vec<&Box<dyn Token>> = Vec::new();
		for token in self.content.iter() {
			for t in previous.iter() {
				for x in t.list_last() {
					for y in token.list_first() {
						res.push((x, y));
					}
				}
			}
			res.append(&mut token.list_neighbours());

			if token.is_skippable() {
				previous.push(token);
			} else {
				previous = vec![token];
			}
		}

		return res;
	}
}

fn find_closing_parenthesis(s: &String) -> Option<usize> {
	let chars: Vec<char> = s.chars().collect();
	let mut counter = 0;

	for (i, c) in chars.iter().enumerate() {
		if *c == '(' {counter += 1;}
		else if *c == ')' {counter -= 1;}
		if counter == 0 {return Some(i);}
	}

	return None;
}

fn parse(pattern: &String, offset: usize) -> Chain {
	let chars: Vec<char> = pattern.chars().collect();
	let mut res: Vec<Box<dyn Token>> = Vec::new();
	let mut i = 0;
	while i < pattern.len() {
		let c = chars[i];
		match c {
			'(' => {
				let j = find_closing_parenthesis(&pattern[i..].to_string()).unwrap() + i;
				let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1);
				res.push(Box::new(inner_content));
				i = j+1;
			}
			'*' => {
				let token = res.pop().unwrap();
				res.push(Box::new(Asterisk{content: token}));
				i += 1;
			}
			'+' => {
				let token = res.pop().unwrap();
				res.push(Box::new(Plus{content: token}));
				i += 1;
			}
			c => {
				res.push(Box::new(Symbol{position: i+offset, value: c}));
				i += 1;
			}
		}
	}

	return Chain{content: res};
}

fn encode_set(set: &HashSet<usize>) -> u64 {
	let mut res = 0;
	for x in set.iter() {
		res ^= 1<<x;
	}
	return res;
}

fn decode_set(x: u64) ->HashSet<usize> {
	if x == START as u64 {return HashSet::from([START]);}

	let mut x = x;
	let mut res: HashSet<usize> = HashSet::new();
	
	while x > 0 {
		let y = x.trailing_zeros();
		res.insert(y as usize);
		x ^= 1 << y;
	}

	return res;
}

struct Regexp {
	rules: HashMap<(usize, char), HashSet<usize>>,
	end_states: HashSet<usize>
}

impl Regexp {
	fn new(pattern: &String) -> Regexp {
		let r = parse(pattern, 0);
		let pattern_chars = Vec::from_iter(pattern.chars());
		let mut rules: HashMap<(usize, char), HashSet<usize>> = HashMap::new();
		
		for i in r.list_first() {
			let c = pattern_chars[i];
			let key = (START, c);
			match rules.get_mut(&key) {
				Some(set) => {set.insert(i);},
				None => {rules.insert(key, HashSet::from([i]));}
			};
		}

		for (i, j) in r.list_neighbours() {
			let c = pattern_chars[j];
			let key = (i, c);
			match rules.get_mut(&key) {
				Some(set) => {set.insert(j);},
				None => {rules.insert(key, HashSet::from([j]));}
			};
		}

		let mut end_states = HashSet::from_iter(r.list_last().into_iter());
		if r.is_skippable() {
			end_states.insert(START);
		}

		return Regexp{rules, end_states};
	}

	pub fn eval(&self, s: String) -> bool {
		let mut multistate = HashSet::from([START]);

		for c in s.chars() {
			let mut new_multistate = HashSet::new();

			for state in multistate {
				if let Some(x) = self.rules.get(&(state, c)) {
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
				} else if let Some(x) = self.rules.get(&(state, '.')) {
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
				}
			}
			multistate = new_multistate;
		}

		return multistate.iter().any(|x| self.end_states.contains(x));
	}

	fn determinize(&self) -> RegexpDFA {
		let mut rules: HashMap<(u64, char), u64> = HashMap::new();
		let mut end_states: HashSet<u64> = HashSet::new();
		if self.end_states.contains(&START) {end_states.insert(START as u64);}

		let mut stack = Vec::from([START as u64]);
		let mut processed_states = HashSet::new();
		while !stack.is_empty() {
			let state = stack.pop().unwrap();
			let multistate = decode_set(state);
			let mut new_rules: HashMap<char, HashSet<usize>> = HashMap::new();

			for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) {
				let (_st, c) = key;
				if !new_rules.contains_key(c) {
					new_rules.insert(*c, HashSet::new());
				}
				for target in &self.rules[key] {
					new_rules.get_mut(c).unwrap().insert(*target);
				}
			}

			for (c, target_set) in new_rules.into_iter() {
				let encoded_target = encode_set(&target_set);
				rules.insert((state, c), encoded_target);
				if target_set.iter().any(|st| self.end_states.contains(st)) {
					end_states.insert(encoded_target);
				}
				if !processed_states.contains(&encoded_target) {
					stack.push(encoded_target);
					processed_states.insert(encoded_target);
				}
			}
		}

		return RegexpDFA{rules, end_states};
	}
}

struct RegexpDFA {
	rules: HashMap<(u64, char), u64>,
	end_states: HashSet<u64>
}

impl RegexpDFA {
	fn eval(&self, s: String) -> bool {
		let mut state = START as u64;

		for c in s.chars() {
			if let Some(x) = self.rules.get(&(state, c)) {
				state = *x;
			} else {
				return false;
			}
		}

		return self.end_states.contains(&state);
	}
}

fn main() {
	let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"];
	for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] {
		println!("# {pattern}");
		let r = Regexp::new(&pattern.to_string()).determinize();
		for &t in tests.iter() {
			println!("{t} {}", r.eval(t.to_string()));
		}
		println!();
	}
}