Files @ 4eed8a486a3b
Branch filter:

Location: Regular-Expresso/regexp.py - annotation

Laman
removed Symbol.value
171b6c89df1d
171b6c89df1d
171b6c89df1d
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a522e81d4f37
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
a522e81d4f37
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
a86ff65ec321
a86ff65ec321
a86ff65ec321
a86ff65ec321
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
7e640b0cffa7
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
171b6c89df1d
171b6c89df1d
a522e81d4f37
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
a522e81d4f37
a522e81d4f37
7e640b0cffa7
7e640b0cffa7
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
171b6c89df1d
c316bec3c0cd
c827cb147cf5
c827cb147cf5
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
a86ff65ec321
a86ff65ec321
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c316bec3c0cd
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c827cb147cf5
c316bec3c0cd
171b6c89df1d
a86ff65ec321
7e640b0cffa7
171b6c89df1d
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
c316bec3c0cd
a522e81d4f37
171b6c89df1d
from abc import abstractmethod


class ParsingError(Exception):
	pass


class Token:
	is_skippable = False

	@abstractmethod
	def list_first(self):
		pass

	@abstractmethod
	def list_last(self):
		pass

	@abstractmethod
	def list_neighbours(self):
		pass


class Symbol(Token):
	def __init__(self, position, value):
		self.position = position
		self.value = value

	def list_first(self):
		yield self.position

	def list_last(self):
		yield self.position

	def list_neighbours(self):
		yield from []

	def __str__(self):
		return self.value


class Plus(Token):
	def __init__(self, content: Token):
		self.content = content

	def list_first(self):
		yield from self.content.list_first()

	def list_last(self):
		yield from self.content.list_last()

	def list_neighbours(self):
		yield from self.content.list_neighbours()
		for x in self.list_last():
			for y in self.list_first():
				yield (x, y)

	def __str__(self):
		return str(self.content) + "+"


class Asterisk(Plus):
	is_skippable = True
	
	def __str__(self):
		return str(self.content) + "*"


class Chain(Token):
	def __init__(self, content: list):
		self.content = content

	def list_first(self):
		for token in self.content:
			yield from token.list_first()
			if not token.is_skippable:
				break

	def list_last(self):
		for token in reversed(self.content):
			yield from token.list_last()
			if not token.is_skippable:
				break

	def list_neighbours(self):
		previous = []
		for token in self.content:
			for t in previous:
				for x in t.list_last():
					for y in token.list_first():
						yield (x, y)
			yield from token.list_neighbours()

			if token.is_skippable:
				previous.append(token)
			else:
				previous = [token]

	@property
	def is_skippable(self):
		return all(x.is_skippable for x in self.content)

	def __str__(self):
		return "(" + "".join(str(x) for x in self.content) + ")"


def find_closing_parenthesis(pattern, k):
	counter = 0

	for (i, c) in enumerate(pattern[k:]):
		if c == "(":
			counter += 1
		elif c == ")":
			counter -= 1
		if counter == 0:
			return k+i

	raise ParsingError(f'A closing parenthesis not found. Pattern: "{pattern}", position: {k}')


def parse(pattern, offset=0):
	res = []

	i = 0
	while i < len(pattern):
		c = pattern[i]
		if c == "(":
			j = find_closing_parenthesis(pattern, i)
			inner_content = parse(pattern[i+1:j], offset+i+1)
			res.append(inner_content)
			i = j+1
		elif c == "*":
			try:
				token = res.pop()
			except IndexError as e:
				raise ParsingError(f'The asterisk operator is missing an argument. Pattern: "{pattern}", position {i}')
			res.append(Asterisk(token))
			i += 1
		elif c == "+":
			try:
				token = res.pop()
			except IndexError as e:
				raise ParsingError(f'The plus operator is missing an argument. Pattern: "{pattern}", position {i}')
			res.append(Plus(token))
			i += 1
		elif c == ")":
			raise ParsingError(f'An opening parenthesis not found. Pattern: "{pattern}", position: {i}')
		else:
			res.append(Symbol(i+offset, c))
			i += 1

	return Chain(res)


class Regexp:
	def __init__(self, pattern):
		(self.rules, self.end_states) = self._parse(pattern)

	def _parse(self, s):
		r = parse(s)
		rules = dict()

		for i in r.list_first():
			c = s[i]
			key = (-1, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(i)

		for (i, j) in r.list_neighbours():
			c = s[j]
			key = (i, c)
			if key not in rules:
				rules[key] = set()
			rules[key].add(j)

		end_states = set(r.list_last())
		if r.is_skippable:
			end_states.add(-1)

		return rules, end_states

	def match(self, s):
		current = {-1}

		for c in s:
			new_state = set()
			for st in current:
				key = (st, c)
				if key in self.rules:
					new_state.update(self.rules[key])
			current = new_state

		return any(st in self.end_states for st in current)

	def determinize(self):
		rules = dict()
		end_states = {(-1,)} if -1 in self.end_states else set()

		stack = [(-1,)]
		processed_states = set()
		while stack:
			multistate = stack.pop()
			new_rules = dict()
			
			for ((st, c), target) in filter(lambda item: item[0][0] in multistate, self.rules.items()):
				if c not in new_rules:
					new_rules[c] = set()
				new_rules[c].update(target)
			
			for (c, target_set) in new_rules.items():
				new_target = tuple(sorted(target_set))
				rules[(multistate, c)] = new_target
				if any(st in self.end_states for st in new_target):
					end_states.add(new_target)
				if new_target not in processed_states:
					stack.append(new_target)
					processed_states.add(new_target)
		
		return (rules, end_states)


class RegexpDFA:
	def __init__(self, pattern):
		r = Regexp(pattern)
		(self.rules, self.end_states) = r.determinize()

	def match(self, s):
		st = (-1,)

		for c in s:
			key = (st, c)
			if key in self.rules:
				st = self.rules[key]
			else:
				return False

		return st in self.end_states


if __name__ == "__main__":
	tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]
	for pattern in ["*", "((a)", "a)"]:
		print("#", pattern)
		try:
			r = RegexpDFA(pattern)
		except ParsingError as e:
			print("Failed to parse the regexp:")
			print(e)
			continue
		for t in tests:
			print(t, r.match(t))
		print()