Files @ c827cb147cf5
Branch filter:

Location: Regular-Expresso/regexp.py - annotation

Laman
added the finite automaton determinization
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a522e81d4f37
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a86ff65ec321
a86ff65ec321
a86ff65ec321
a86ff65ec321
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
c316bec3c0cd
c827cb147cf5
c827cb147cf5
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
a86ff65ec321
a86ff65ec321
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c316bec3c0cd
171b6c89df1d
a86ff65ec321
a522e81d4f37
171b6c89df1d
c827cb147cf5
c316bec3c0cd
a522e81d4f37
171b6c89df1d
from abc import abstractmethod


class Token:
	is_skippable = False

	@abstractmethod
	def list_first(self):
		pass

	@abstractmethod
	def list_last(self):
		pass

	@abstractmethod
	def list_neighbours(self):
		pass


class Symbol(Token):
	def __init__(self, position, value):
		self.position = position
		self.value = value

	def list_first(self):
		yield self.position

	def list_last(self):
		yield self.position

	def list_neighbours(self):
		yield from []

	def __str__(self):
		return self.value


class Plus(Token):
	def __init__(self, content: Token):
		self.content = content

	def list_first(self):
		yield from self.content.list_first()

	def list_last(self):
		yield from self.content.list_last()

	def list_neighbours(self):
		yield from self.content.list_neighbours()
		for x in self.list_last():
			for y in self.list_first():
				yield (x, y)

	def __str__(self):
		return str(self.content) + "+"


class Asterisk(Plus):
	is_skippable = True
	
	def __str__(self):
		return str(self.content) + "*"


class Chain(Token):
	def __init__(self, content: list):
		self.content = content

	def list_first(self):
		for token in self.content:
			yield from token.list_first()
			if not token.is_skippable:
				break

	def list_last(self):
		for token in reversed(self.content):
			yield from token.list_last()
			if not token.is_skippable:
				break

	def list_neighbours(self):
		previous = []
		for token in self.content:
			for t in previous:
				for x in t.list_last():
					for y in token.list_first():
						yield (x, y)
			yield from token.list_neighbours()

			if token.is_skippable:
				previous.append(token)
			else:
				previous = [token]

	@property
	def is_skippable(self):
		return all(x.is_skippable for x in self.content)

	def __str__(self):
		return "(" + "".join(str(x) for x in self.content) + ")"


def find_closing_parenthesis(pattern, k):
	counter = 0

	for (i, c) in enumerate(pattern[k:]):
		if c == "(":
			counter += 1
		elif c == ")":
			counter -= 1
		if counter == 0:
			return k+i

	return None


def parse(pattern, offset=0):
	res = []

	i = 0
	while i < len(pattern):
		c = pattern[i]
		if c == "(":
			j = find_closing_parenthesis(pattern, i)
			inner_content = parse(pattern[i+1:j], offset+i+1)
			res.append(inner_content)
			i = j+1
		elif c == "*":
			token = res.pop()
			res.append(Asterisk(token))
			i += 1
		elif c == "+":
			token = res.pop()
			res.append(Plus(token))
			i += 1
		else:
			res.append(Symbol(i+offset, c))
			i += 1

	return Chain(res)


class Regexp:
	def __init__(self, pattern):
		(self.rules, self.end_states) = self._parse(pattern)

	def _parse(self, s):
		r = parse(s)
		rules = dict()

		for i in r.list_first():
			c = s[i]
			key = (-1, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(i)

		for (i, j) in r.list_neighbours():
			c = s[j]
			key = (i, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(j)

		end_states = set(r.list_last())
		if r.is_skippable:
			end_states.add(-1)

		return rules, end_states

	def match(self, s):
		current = {-1}

		for c in s:
			new_state = set()
			for st in current:
				key = (st, c)
				if key in self.rules:
					new_state.update(self.rules[key])
			current = new_state

		return any(st in self.end_states for st in current)

	def determinize(self):
		rules = dict()
		end_states = {(-1,)} if -1 in self.end_states else set()

		stack = [(-1,)]
		processed_states = set()
		while stack:
			multistate = stack.pop()
			new_rules = dict()
			
			for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
				if c not in new_rules:
					new_rules[c] = set()
				new_rules[c].update(target)
			
			for (c, target_set) in new_rules.items():
				new_target = tuple(sorted(target_set))
				rules[(multistate, c)] = new_target
				if any(st in self.end_states for st in new_target):
					end_states.add(new_target)
				if new_target not in processed_states:
					stack.append(new_target)
					processed_states.add(new_target)
		
		return (rules, end_states)


class RegexpDFA:
	def __init__(self, pattern):
		r = Regexp(pattern)
		(self.rules, self.end_states) = r.determinize()

	def match(self, s):
		st = (-1,)

		for c in s:
			key = (st, c)
			if key in self.rules:
				st = self.rules[key]
			else:
				return False

		return st in self.end_states


if __name__ == "__main__":
	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
	for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"]:
		print("#", pattern)
		r = RegexpDFA(pattern)
		for t in tests:
			print(t, r.match(t))
		print()