diff --git a/regexp.py b/regexp.py new file mode 100644 --- /dev/null +++ b/regexp.py @@ -0,0 +1,136 @@ +from abc import abstractmethod + + +class Token: + is_skippable = False + + @abstractmethod + def list_first(self): + pass + + @abstractmethod + def list_last(self): + pass + + @abstractmethod + def list_neighbours(self): + pass + + +class Symbol(Token): + def __init__(self, position, value): + self.position = position + self.value = value + + def list_first(self): + yield self.position + + def list_last(self): + yield self.position + + def list_neighbours(self): + yield from [] + + def __str__(self): + return self.value + + +class Asterisk(Token): + is_skippable = True + + def __init__(self, content: Token): + self.content = content + + def list_first(self): + yield from self.content.list_first() + + def list_last(self): + yield from self.content.list_last() + + def list_neighbours(self): + yield from self.content.list_neighbours() + for x in self.list_last(): + for y in self.list_first(): + yield (x, y) + + def __str__(self): + return str(self.content) + "*" + + +class Chain(Token): + def __init__(self, content: list): + self.content = content + + def list_first(self): + for token in self.content: + yield from token.list_first() + if not token.is_skippable: + break + + def list_last(self): + for token in reversed(self.content): + yield from token.list_last() + if not token.is_skippable: + break + + def list_neighbours(self): + previous = [] + for token in self.content: + for t in previous: + for x in t.list_last(): + for y in token.list_first(): + yield (x, y) + yield from token.list_neighbours() + + if token.is_skippable: + previous.append(token) + else: + previous = [token] + + def __str__(self): + return "(" + "".join(str(x) for x in self.content) + ")" + + +def find_closing_parenthesis(pattern, k): + counter = 0 + + for (i, c) in enumerate(pattern[k:]): + if c == "(": + counter += 1 + elif c == ")": + counter -= 1 + if counter == 0: + return k+i + + return None + + +def parse(pattern, offset=0): + res = [] + + i = 0 + while i < len(pattern): + c = pattern[i] + if c == "(": + j = find_closing_parenthesis(pattern, i) + inner_content = parse(pattern[i+1:j], offset+i+1) + res.append(inner_content) + i = j+1 + elif c == "*": + token = res.pop() + res.append(Asterisk(token)) + i += 1 + else: + res.append(Symbol(i+offset, c)) + i += 1 + + return Chain(res) + + +if __name__ == "__main__": + for pattern in ["a*b*", "(ab)*", "a((bc)*d)*"]: + print("#", pattern) + p = parse(pattern) + print(p, list(p.list_first()), list(p.list_last())) + print(list(p.list_neighbours())) + print()