Regular-Expresso/regexp.py @ 95db38ce6846

from abc import ABC, abstractmethod
from collections import deque
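
# A small regular expression engine. A pattern is parsed into a token tree and
# converted into a nondeterministic automaton whose states are the positions of
# the symbols in the pattern; the automaton can then be determinized, reduced
# and normalized. Supported syntax: literal characters, "_" for the empty
# string, "*" for repetition, parentheses, and "|" or "+" for alternation.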


class ParsingError(Exception):
	pass


class Token(ABC):
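	"""A node of the parsed pattern tree.

	Subclasses report which symbol positions can start a match (list_first),
	which can end it (list_last), and which pairs of positions can directly
	follow each other (list_neighbours)."""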
	is_skippable = False

	@abstractmethod
	def list_first(self):
		pass

	@abstractmethod
	def list_last(self):
		pass

	@abstractmethod
	def list_neighbours(self):
		pass


class Lambda(Token):
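	"""The empty string, written as "_" in a pattern."""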
	is_skippable = True

	def list_first(self):
		yield from []

	def list_last(self):
		yield from []

	def list_neighbours(self):
		yield from []


class Symbol(Token):
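	"""A single literal character, identified by its position in the pattern."""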
	def __init__(self, position, value):
		self.position = position
		self.value = value

	def list_first(self):
		yield self.position

	def list_last(self):
		yield self.position

	def list_neighbours(self):
		yield from []

	def __str__(self):
		return self.value


class Asterisk(Token):
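	"""Zero or more repetitions of the wrapped token (the Kleene star)."""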
	is_skippable = True

	def __init__(self, content: Token):
		self.content = content

	def list_first(self):
		yield from self.content.list_first()

	def list_last(self):
		yield from self.content.list_last()

	def list_neighbours(self):
		yield from self.content.list_neighbours()
		for x in self.list_last():
			for y in self.list_first():
				yield (x, y)

	def __str__(self):
		return str(self.content) + "*"


class Alternative(Token):
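	"""A choice between several variants, separated by "|" or "+" in the pattern."""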
	def __init__(self, content: list):
		self.variants = []
		subsequence = []

		for token in content:
			if isinstance(token, AlternativeSeparator):
				if not subsequence:
					raise ParsingError("Found an empty Alternative variant.")
				self.variants.append(Chain(subsequence))
				subsequence = []
			else:
				subsequence.append(token)
		
		if not subsequence:
			raise ParsingError("Found an empty Alternative variant.")
		self.variants.append(Chain(subsequence))
		

	def list_first(self):
		for x in self.variants:
			yield from x.list_first()

	def list_last(self):
		for x in self.variants:
			yield from x.list_last()
	
	def list_neighbours(self):
		for x in self.variants:
			yield from x.list_neighbours()

	@property
	def is_skippable(self):
		return any(x.is_skippable for x in self.variants)

class AlternativeSeparator:
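	"""A marker used during parsing to split an Alternative into its variants."""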
	pass

class Chain(Token):
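	"""A sequence of tokens matched one after another (concatenation)."""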
	def __init__(self, content: list):
		self.content = content

	def list_first(self):
		for token in self.content:
			yield from token.list_first()
			if not token.is_skippable:
				break

	def list_last(self):
		for token in reversed(self.content):
			yield from token.list_last()
			if not token.is_skippable:
				break

	def list_neighbours(self):
		previous = []
		for token in self.content:
			for t in previous:
				for x in t.list_last():
					for y in token.list_first():
						yield (x, y)
			yield from token.list_neighbours()

			if token.is_skippable:
				previous.append(token)
			else:
				previous = [token]

	@property
	def is_skippable(self):
		return all(x.is_skippable for x in self.content)

	def __str__(self):
		return "(" + "".join(str(x) for x in self.content) + ")"


def find_closing_parenthesis(pattern, k):
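	"""Return the index of the parenthesis closing the one opened at position k."""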
	counter = 0

	for (i, c) in enumerate(pattern[k:]):
		if c == "(":
			counter += 1
		elif c == ")":
			counter -= 1
		if counter == 0:
			return k+i

	raise ParsingError(f'A closing parenthesis was not found. Pattern: "{pattern}", position: {k}')


def parse(pattern, offset=0):
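	"""Parse the pattern into a token tree.

	The offset shifts the reported symbol positions when a parenthesised
	sub-pattern is parsed recursively."""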
	res = []
	is_alternative = False

	i = 0
	while i < len(pattern):
		c = pattern[i]
		if c == "(":
			j = find_closing_parenthesis(pattern, i)
			inner_content = parse(pattern[i+1:j], offset+i+1)
			res.append(inner_content)
			i = j+1
		elif c == "*":
			try:
				token = res.pop()
			except IndexError as e:
				raise ParsingError(f'The asterisk operator is missing an argument. Pattern: "{pattern}", position: {i}') from e
			res.append(Asterisk(token))
			i += 1
		elif c == ")":
			raise ParsingError(f'An opening parenthesis was not found. Pattern: "{pattern}", position: {i}')
		elif c == "|" or c == "+":
			is_alternative = True
			res.append(AlternativeSeparator())
			i += 1
		elif c == "_":
			res.append(Lambda())
			i += 1
		else:
			res.append(Symbol(i+offset, c))
			i += 1

	if is_alternative:
		return Alternative(res)
	else:
		return Chain(res)


class Regexp:
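	"""A nondeterministic finite automaton built from the pattern.

	Its states are the positions of the symbols in the pattern,
	plus the initial state -1."""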
	def __init__(self, pattern):
		(self.rules, self.end_states) = self._parse(pattern)

	def _parse(self, s):
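		"""Build the transition rules {(state, character): set of target states}
		and the set of accepting states."""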
		r = parse(s)
		rules = dict()

		for i in r.list_first():
			c = s[i]
			key = (-1, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(i)

		for (i, j) in r.list_neighbours():
			c = s[j]
			key = (i, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(j)

		end_states = set(r.list_last())
		if r.is_skippable:
			end_states.add(-1)

		return rules, end_states

	def match(self, s):
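		"""Simulate the automaton on s, keeping track of the set of reachable states."""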
		current = {-1}

		for c in s:
			new_state = set()
			for st in current:
				key = (st, c)
				if key in self.rules:
					new_state.update(self.rules[key])
			current = new_state

		return any(st in self.end_states for st in current)

	def determinize(self):
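		"""Turn the automaton into a deterministic one by the subset construction.

		The deterministic states are sorted tuples of the original states;
		(-1,) is the initial state."""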
		rules = dict()
		end_states = {(-1,)} if -1 in self.end_states else set()

		stack = [(-1,)]
		processed_states = set()
		while stack:
			multistate = stack.pop()
			new_rules = dict()
			
			for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
				if c not in new_rules:
					new_rules[c] = set()
				new_rules[c].update(target)
			
			for (c, target_set) in new_rules.items():
				new_target = tuple(sorted(target_set))
				rules[(multistate, c)] = new_target
				if any(st in self.end_states for st in new_target):
					end_states.add(new_target)
				if new_target not in processed_states:
					stack.append(new_target)
					processed_states.add(new_target)
		
		return (rules, end_states)


class RegexpDFA:
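	"""A deterministic finite automaton equivalent to a Regexp."""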
	def __init__(self, rules, end_states):
		self.rules = rules
		self.end_states = end_states

	@classmethod
	def create(cls, pattern):
		r = Regexp(pattern)
		(rules, end_states) = r.determinize()

		return cls(rules, end_states)

	def match(self, s):
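		"""Run the automaton on s.

		Expects a normalized automaton whose states are integers
		and whose initial state is 0."""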
		st = 0

		for c in s:
			key = (st, c)
			if key in self.rules:
				st = self.rules[key]
			else:
				return False

		return st in self.end_states

	def reduce(self):
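		"""Return an equivalent automaton with all pairs of equivalent states merged."""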
		equivalents = self._find_equivalent_states()
		(rules, end_states) = self._collapse_states(equivalents)

		return RegexpDFA(rules, end_states)

	def normalize(self):
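		"""Relabel the states as consecutive integers in breadth-first order,
		with the initial state becoming 0."""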
		index = {(-1,): 0}
		queue = deque([(-1,)])

		while queue:
			state = queue.popleft()
			edges = [((st, c), t) for ((st, c), t) in self.rules.items() if st == state]
			edges.sort()
			for ((st, c), t) in edges:
				if t not in index:
					index[t] = len(index)
					queue.append(t)
		
		rules = dict(((index[st], c), index[t]) for ((st, c), t) in self.rules.items())
		end_states = {index[st] for st in self.end_states}

		return RegexpDFA(rules, end_states)

	def _find_equivalent_states(self):
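		"""Find all pairs of states that cannot be distinguished by any input.

		The tuple (-2,) stands for the implicit failure (sink) state."""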
		state_set = [(-2,), (-1,)] + sorted(set(self.rules.values()))
		alphabet = {c for (st, c) in self.rules.keys()}
		equivalents = {(s1, s2) for (i, s1) in enumerate(state_set) for s2 in state_set[i+1:]}

		for (s1, s2) in equivalents.copy():
			if (s1 in self.end_states) != (s2 in self.end_states):
				equivalents.remove((s1, s2))
		
		ctrl = True
		while ctrl:
			ctrl = False
			for (s1, s2) in equivalents.copy():
				for c in alphabet:
					t1 = self.rules.get((s1, c), (-2,))
					t2 = self.rules.get((s2, c), (-2,))
					key = (min(t1, t2), max(t1, t2))
					if t1 != t2 and key not in equivalents:
						equivalents.remove((s1, s2))
						ctrl = True
						break
		
		return equivalents
	
	def _collapse_states(self, equivalents):
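		"""Merge every pair of equivalent states, redirecting transitions
		into the smaller state and dropping the larger one."""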
		rules = self.rules.items()
		end_states = self.end_states.copy()

		for (s1, s2) in sorted(equivalents):
			rules = map(
				lambda item: (item[0], s1 if item[1] == s2 else item[1]),
				filter(lambda item: item[0][0] != s2, rules)
			)
			end_states.discard(s2)
		
		return (dict(rules), end_states)


if __name__ == "__main__":
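	# Demonstration: build, reduce and normalize a few automata and test them on sample strings.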
	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
	for pattern in ["a(b|c)", "a*b*", "(ab)*", "a((bc)*d)*", "(a|b)*a(a|b)(a|b)(a|b)"]:
		print("#", pattern)
		try:
			r = RegexpDFA.create(pattern).reduce().normalize()
		except ParsingError as e:
			print("Failed to parse the regexp:")
			print(e)
			continue
		for t in tests:
			print(t, r.match(t))
		print()