Changeset - a86ff65ec321
[Not reviewed]
default
0 2 0
Laman - 10 months ago 2024-06-12 23:27:53

fixed the negative match for an empty string
2 files changed with 16 insertions and 3 deletions:
0 comments (0 inline, 0 general)
regexp.py
Show inline comments
 
@@ -47,140 +47,146 @@ class Plus(Token):
 

	
 
	def list_neighbours(self):
 
		yield from self.content.list_neighbours()
 
		for x in self.list_last():
 
			for y in self.list_first():
 
				yield (x, y)
 

	
 
	def __str__(self):
 
		return str(self.content) + "+"
 

	
 

	
 
class Asterisk(Plus):
 
	is_skippable = True
 
	
 
	def __str__(self):
 
		return str(self.content) + "*"
 

	
 

	
 
class Chain(Token):
 
	def __init__(self, content: list):
 
		self.content = content
 

	
 
	def list_first(self):
 
		for token in self.content:
 
			yield from token.list_first()
 
			if not token.is_skippable:
 
				break
 

	
 
	def list_last(self):
 
		for token in reversed(self.content):
 
			yield from token.list_last()
 
			if not token.is_skippable:
 
				break
 

	
 
	def list_neighbours(self):
 
		previous = []
 
		for token in self.content:
 
			for t in previous:
 
				for x in t.list_last():
 
					for y in token.list_first():
 
						yield (x, y)
 
			yield from token.list_neighbours()
 

	
 
			if token.is_skippable:
 
				previous.append(token)
 
			else:
 
				previous = [token]
 

	
 
	@property
 
	def is_skippable(self):
 
		return all(x.is_skippable for x in self.content)
 

	
 
	def __str__(self):
 
		return "(" + "".join(str(x) for x in self.content) + ")"
 

	
 

	
 
def find_closing_parenthesis(pattern, k):
 
	counter = 0
 

	
 
	for (i, c) in enumerate(pattern[k:]):
 
		if c == "(":
 
			counter += 1
 
		elif c == ")":
 
			counter -= 1
 
		if counter == 0:
 
			return k+i
 

	
 
	return None
 

	
 

	
 
def parse(pattern, offset=0):
 
	res = []
 

	
 
	i = 0
 
	while i < len(pattern):
 
		c = pattern[i]
 
		if c == "(":
 
			j = find_closing_parenthesis(pattern, i)
 
			inner_content = parse(pattern[i+1:j], offset+i+1)
 
			res.append(inner_content)
 
			i = j+1
 
		elif c == "*":
 
			token = res.pop()
 
			res.append(Asterisk(token))
 
			i += 1
 
		elif c == "+":
 
			token = res.pop()
 
			res.append(Plus(token))
 
			i += 1
 
		else:
 
			res.append(Symbol(i+offset, c))
 
			i += 1
 

	
 
	return Chain(res)
 

	
 

	
 
class Regexp:
 
	def __init__(self, s):
 
		(self.rules, self.end_states) = self._parse(s)
 

	
 
	def _parse(self, s):
 
		r = parse(s)
 
		rules = dict()
 

	
 
		for i in r.list_first():
 
			c = s[i]
 
			key = (-1, c)
 
			if key not in rules:
 
				rules[key] = set()
 
			rules[key].add(i)
 

	
 
		for (i, j) in r.list_neighbours():
 
			c = s[j]
 
			key = (i, c)
 
			if key not in rules:
 
				rules[key] = set()
 
			rules[key].add(j)
 

	
 
		end_states = set(r.list_last())
 
		if r.is_skippable:
 
			end_states.add(-1)
 

	
 
		return rules, end_states
 

	
 
	def match(self, s):
 
		current = {-1}
 

	
 
		for c in s:
 
			new_state = set()
 
			for st in current:
 
				key = (st, c)
 
				if key in self.rules:
 
					new_state.update(self.rules[key])
 
			current = new_state
 

	
 
		return any(st in self.end_states for st in current)
 

	
 

	
 
if __name__ == "__main__":
 
	tests = ["a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
 
	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
 
	for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"]:
 
		print("#", pattern)
 
		r = Regexp(pattern)
 
		for t in tests:
 
			print(t, r.match(t))
 
		print()
src/main.rs
Show inline comments
 
@@ -42,96 +42,100 @@ impl Token for Symbol {
 

	
 
impl Token for Asterisk {
 
	fn is_skippable(&self) -> bool {true}
 

	
 
	fn list_first(&self) -> Vec<usize> {
 
		return self.content.list_first();
 
	}
 

	
 
	fn list_last(&self) -> Vec<usize> {
 
		return self.content.list_last();
 
	}
 

	
 
	fn list_neighbours(&self) -> Vec<(usize, usize)> {
 
		let mut res = self.content.list_neighbours();
 

	
 
		for x in self.list_last() {
 
			for y in self.list_first() {
 
				res.push((x, y));
 
			}
 
		}
 

	
 
		return res;
 
	}
 
}
 

	
 
impl Token for Plus {
 
	fn list_first(&self) -> Vec<usize> {
 
		return self.content.list_first();
 
	}
 

	
 
	fn list_last(&self) -> Vec<usize> {
 
		return self.content.list_last();
 
	}
 

	
 
	fn list_neighbours(&self) -> Vec<(usize, usize)> {
 
		let mut res = self.content.list_neighbours();
 

	
 
		for x in self.list_last() {
 
			for y in self.list_first() {
 
				res.push((x, y));
 
			}
 
		}
 

	
 
		return res;
 
	}
 
}
 

	
 
impl Token for Chain {
 
	fn is_skippable(&self) -> bool {
 
		return self.content.iter().all(|x| x.is_skippable());
 
	}
 

	
 
	fn list_first(&self) -> Vec<usize> {
 
		let mut res = Vec::new();
 
		for token in self.content.iter() {
 
			res.append(&mut token.list_first());
 
			if !token.is_skippable() {break;}
 
		}
 

	
 
		return res;
 
	}
 

	
 
	fn list_last(&self) -> Vec<usize> {
 
		let mut res = Vec::new();
 
		for token in self.content.iter().rev() {
 
			res.append(&mut token.list_last());
 
			if !token.is_skippable() {break;}
 
		}
 

	
 
		return res;
 
	}
 

	
 
	fn list_neighbours(&self) -> Vec<(usize, usize)> {
 
		let mut res = Vec::new();
 
		let mut previous: Vec<&Box<dyn Token>> = Vec::new();
 
		for token in self.content.iter() {
 
			for t in previous.iter() {
 
				for x in t.list_last() {
 
					for y in token.list_first() {
 
						res.push((x, y));
 
					}
 
				}
 
			}
 
			res.append(&mut token.list_neighbours());
 

	
 
			if token.is_skippable() {
 
				previous.push(token);
 
			} else {
 
				previous = vec![token];
 
			}
 
		}
 

	
 
		return res;
 
	}
 
}
 

	
 
fn find_closing_parenthesis(s: &String) -> Option<usize> {
 
	let chars: Vec<char> = s.chars().collect();
 
	let mut counter = 0;
 

	
 
@@ -161,84 +165,87 @@ fn parse(pattern: &String, offset: usize
 
				let token = res.pop().unwrap();
 
				res.push(Box::new(Asterisk{content: token}));
 
				i += 1;
 
			}
 
			'+' => {
 
				let token = res.pop().unwrap();
 
				res.push(Box::new(Plus{content: token}));
 
				i += 1;
 
			}
 
			c => {
 
				res.push(Box::new(Symbol{position: i+offset, value: c}));
 
				i += 1;
 
			}
 
		}
 
	}
 

	
 
	return Chain{content: res};
 
}
 

	
 
struct Regexp {
 
	rules: HashMap<(usize, char), HashSet<usize>>,
 
	end_states: HashSet<usize>
 
}
 

	
 
impl Regexp {
 
	fn new(pattern: &String) -> Regexp {
 
		let r = parse(pattern, 0);
 
		let pattern_chars = Vec::from_iter(pattern.chars());
 
		let mut rules: HashMap<(usize, char), HashSet<usize>> = HashMap::new();
 
		
 
		for i in r.list_first() {
 
			let c = pattern_chars[i];
 
			let key = (START, c);
 
			match rules.get_mut(&key) {
 
				Some(set) => {set.insert(i);},
 
				None => {rules.insert(key, HashSet::from([i]));}
 
			};
 
		}
 

	
 
		for (i, j) in r.list_neighbours() {
 
			let c = pattern_chars[j];
 
			let key = (i, c);
 
			match rules.get_mut(&key) {
 
				Some(set) => {set.insert(j);},
 
				None => {rules.insert(key, HashSet::from([j]));}
 
			};
 
		}
 

	
 
		let end_states = HashSet::from_iter(r.list_last().into_iter());
 
		let mut end_states = HashSet::from_iter(r.list_last().into_iter());
 
		if r.is_skippable() {
 
			end_states.insert(START);
 
		}
 

	
 
		return Regexp{rules, end_states};
 
	}
 

	
 
	pub fn eval(&self, s: String) -> bool {
 
		let mut multistate = HashSet::from([START]);
 

	
 
		for c in s.chars() {
 
			let mut new_multistate = HashSet::new();
 

	
 
			for state in multistate {
 
				if let Some(x) = self.rules.get(&(state, c)) {
 
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
 
				} else if let Some(x) = self.rules.get(&(state, '.')) {
 
					new_multistate = new_multistate.union(&x).map(|&y| y).collect();
 
				}
 
			}
 
			multistate = new_multistate;
 
		}
 

	
 
		return multistate.iter().any(|x| self.end_states.contains(x));
 
	}
 
}
 

	
 
fn main() {
 
	let tests = ["a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"];
 
	let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"];
 
	for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] {
 
		println!("# {pattern}");
 
		let r = Regexp::new(&pattern.to_string());
 
		for &t in tests.iter() {
 
			println!("{t} {}", r.eval(t.to_string()));
 
		}
 
		println!();
 
	}
 
}
0 comments (0 inline, 0 general)