Files @ 1e7b35645ea4
Branch filter:

Location: Regular-Expresso/src/regexp/token.rs - annotation

1e7b35645ea4 8.3 KiB application/rls-services+xml Show Source Show as Raw Download as Raw
Laman
renamed the package to regular-expresso
e9496b21cf64
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
e9496b21cf64
e9496b21cf64
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
08f1519c859e
e9496b21cf64
e9496b21cf64
e9496b21cf64
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
08f1519c859e
e93b264ec5cc
08f1519c859e
4eed8a486a3b
e93b264ec5cc
e93b264ec5cc
08f1519c859e
e93b264ec5cc
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
08f1519c859e
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
08f1519c859e
e93b264ec5cc
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
08f1519c859e
3cdbf505e6f8
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
e9496b21cf64
08f1519c859e
e9496b21cf64
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e9496b21cf64
08f1519c859e
08f1519c859e
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
08f1519c859e
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
3cdbf505e6f8
08f1519c859e
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
68c16b6d84f3
3cdbf505e6f8
3cdbf505e6f8
e9496b21cf64
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
68c16b6d84f3
3cdbf505e6f8
3cdbf505e6f8
e9496b21cf64
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
68c16b6d84f3
3cdbf505e6f8
3cdbf505e6f8
e9496b21cf64
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
68c16b6d84f3
3cdbf505e6f8
3cdbf505e6f8
e9496b21cf64
08f1519c859e
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
3cdbf505e6f8
08f1519c859e
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
08f1519c859e
08f1519c859e
08f1519c859e
08f1519c859e
3cdbf505e6f8
e93b264ec5cc
3cdbf505e6f8
e9496b21cf64
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
7e640b0cffa7
7e640b0cffa7
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
7e640b0cffa7
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
7e640b0cffa7
7e640b0cffa7
7e640b0cffa7
9e303d5ff83e
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
68c16b6d84f3
68c16b6d84f3
68c16b6d84f3
68c16b6d84f3
3cdbf505e6f8
3cdbf505e6f8
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e9496b21cf64
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
e93b264ec5cc
use std::{borrow::Borrow, fmt};

#[derive(Debug, Clone)]
pub enum ParsingError {
	Asterisk {s: String, pos: usize},
	OpeningParenthesis {s: String, pos: usize},
	ClosingParenthesis {s: String, pos: usize},
	EmptyAlternativeVariant
}

impl fmt::Display for ParsingError {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
		match self {
			ParsingError::Asterisk {s, pos} => {
				write!(f, "The asterisk operator is missing an argument. Pattern \"{s}\", position {pos}")
			},
			ParsingError::OpeningParenthesis {s, pos} => {
				write!(f, "An opening parenthesis not found. Pattern \"{s}\", position {pos}")
			},
			ParsingError::ClosingParenthesis {s, pos} => {
				write!(f, "A closing parenthesis not found. Pattern \"{s}\", position {pos}")
			},
			ParsingError::EmptyAlternativeVariant => {
				write!(f, "Found an empty Alternative variant.")
			}
		}
	}
}

/// A single letter or other alphabet symbol.
pub struct Symbol {
	/// Symbol position in the regular expression.
	position: usize
}

/// A unary operator specifying its content to occur zero or more times.
pub struct Asterisk {
	content: Box<Token>
}

/// An operator with a variable number of arguments, specifying exchangeable alternatives.
pub struct Alternative {
	content: Vec<Box<Token>>
}

/// An operator expressing a concatenation of its content Tokens.
pub struct Chain {
	content: Vec<Box<Token>>
}

/// Enum encapsulating possible items of a regular expression.
pub enum Token {
	Lambda, // An empty string, useful as an `Alternative`.
	Symbol(Symbol),
	Asterisk(Asterisk),
	Alternative(Alternative),
	// A special token to temporarily separate Alternative variants. Removed in the Alternative constructor.
	AlternativeSeparator,
	Chain(Chain)
}

impl Symbol {
	fn list_first(&self) -> Vec<usize> {
		return vec![self.position];
	}

	fn list_last(&self) -> Vec<usize> {
		return vec![self.position];
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		return vec![];
	}
}

impl Asterisk {
	fn list_first(&self) -> Vec<usize> {
		return self.content.list_first();
	}

	fn list_last(&self) -> Vec<usize> {
		return self.content.list_last();
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = self.content.list_neighbours();

		for x in self.list_last() {
			for y in self.list_first() {
				res.push((x, y));
			}
		}

		return res;
	}
}

impl Alternative {
	/// Split a sequence of `Tokens` by `AlternativeSeparator` into alternative variants and return the result.
	/// If any variant is empty, return an `Err``.
	fn new(content: Vec<Box<Token>>) -> Result<Alternative, ParsingError> {
		let mut variants: Vec<Vec<Box<Token>>> = vec![Vec::new()];

		content.into_iter().for_each(|x| {
			if matches!(x.borrow(), Token::AlternativeSeparator) {
				variants.push(Vec::new());
			} else {
				variants.last_mut().unwrap().push(x);
			}
		});

		if variants.iter().any(|x| x.is_empty()) {
			return Err(ParsingError::EmptyAlternativeVariant);
		}

		return Ok(Alternative{
			content: variants.into_iter().map(
				|x| Box::new(Token::Chain(Chain{content: x}))
			).collect()
		});
	}

	fn is_skippable(&self) -> bool {
		return self.content.iter().any(|x| x.is_skippable());
	}

	fn list_first(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter() {
			res.append(&mut token.list_first());
		}

		return res;
	}

	fn list_last(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter() {
			res.append(&mut token.list_last());
		}

		return res;
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = Vec::new();
		for token in self.content.iter() {
			res.append(&mut token.list_neighbours());
		}

		return res;
	}
}

impl Chain {
	fn is_skippable(&self) -> bool {
		return self.content.iter().all(|x| x.is_skippable());
	}

	fn list_first(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter() {
			res.append(&mut token.list_first());
			if !token.is_skippable() {break;}
		}

		return res;
	}

	fn list_last(&self) -> Vec<usize> {
		let mut res = Vec::new();
		for token in self.content.iter().rev() {
			res.append(&mut token.list_last());
			if !token.is_skippable() {break;}
		}

		return res;
	}

	fn list_neighbours(&self) -> Vec<(usize, usize)> {
		let mut res = Vec::new();
		let mut previous: Vec<&Box<Token>> = Vec::new();
		for token in self.content.iter() {
			for t in previous.iter() {
				for x in t.list_last() {
					for y in token.list_first() {
						res.push((x, y));
					}
				}
			}
			res.append(&mut token.list_neighbours());

			if token.is_skippable() {
				previous.push(token);
			} else {
				previous = vec![token];
			}
		}

		return res;
	}
}

impl Token {
	/// Decide whether the `Token` has to contain some `Symbols`,
	/// or whether it can be stepped over when looking for first, last and neighbouring items.
	pub fn is_skippable(&self) -> bool {
		match self {
			Token::Lambda => true,
			Token::Symbol(_) => false,
			Token::Asterisk(_) => true,
			Token::Alternative(t) => t.is_skippable(),
			Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"),
			Token::Chain(t) => t.is_skippable()
		}
	}

	/// List all possible string positions the token can start with.
	pub fn list_first(&self) -> Vec<usize> {
		match self {
			Token::Lambda => vec![],
			Token::Symbol(t) => t.list_first(),
			Token::Asterisk(t) => t.list_first(),
			Token::Alternative(t) => t.list_first(),
			Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"),
			Token::Chain(t) => t.list_first()
		}
	}

	/// List all possible string positions the token can end with.
	pub fn list_last(&self) -> Vec<usize> {
		match self {
			Token::Lambda => vec![],
			Token::Symbol(t) => t.list_last(),
			Token::Asterisk(t) => t.list_last(),
			Token::Alternative(t) => t.list_last(),
			Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"),
			Token::Chain(t) => t.list_last()
		}
	}

	/// List positions of all possibly neighbouring subtokens.
	pub fn list_neighbours(&self) -> Vec<(usize, usize)> {
		match self {
			Token::Lambda => vec![],
			Token::Symbol(t) => t.list_neighbours(),
			Token::Asterisk(t) => t.list_neighbours(),
			Token::Alternative(t) => t.list_neighbours(),
			Token::AlternativeSeparator => panic!("Separators must be already removed at this stage"),
			Token::Chain(t) => t.list_neighbours()
		}
	}
}

/// For a string starting with a parenthesis, find its matching closing parenthesis, or return None.
fn find_closing_parenthesis(s: &String) -> Option<usize> {
	let chars: Vec<char> = s.chars().collect();
	let mut counter = 0;

	for (i, c) in chars.iter().enumerate() {
		if *c == '(' {counter += 1;}
		else if *c == ')' {counter -= 1;}
		if counter == 0 {return Some(i);}
	}

	return None;
}

/// Recursively parse the pattern into a `Token` tree.
/// 
/// The `offset` defines where the `pattern` starts relative to the original pattern,
/// to record correct global token positions in the subcalls
pub fn parse(pattern: &String, offset: usize) -> Result<Token, ParsingError> {
	let chars: Vec<char> = pattern.chars().collect();
	let mut res: Vec<Box<Token>> = Vec::new();
	let mut is_alternative = false;
	let mut i = 0;
	while i < pattern.len() {
		let c = chars[i];
		match c {
			'(' => {
				let j = find_closing_parenthesis(&pattern[i..].to_string()).ok_or(ParsingError::ClosingParenthesis {s: pattern.clone(), pos: i})? + i;
				let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1)?;
				res.push(Box::new(inner_content));
				i = j+1;
			}
			'*' => {
				let token = res.pop().ok_or(ParsingError::Asterisk{s: pattern.clone(), pos: i})?;
				res.push(Box::new(Token::Asterisk(Asterisk{content: token})));
				i += 1;
			}
			')' => {
				return Err(ParsingError::OpeningParenthesis {s: pattern.clone(), pos: i});
			}
			'|' | '+' => {
				is_alternative = true;
				res.push(Box::new(Token::AlternativeSeparator));
				i += 1;
			}
			'_' => {
				res.push(Box::new(Token::Lambda));
				i += 1;
			}
			_c => {
				res.push(Box::new(Token::Symbol(Symbol{position: i+offset})));
				i += 1;
			}
		}
	}

	if is_alternative {
		return Ok(Token::Alternative(Alternative::new(res)?));
	} else {
		return Ok(Token::Chain(Chain{content: res}));
	}
}

mod test {
	use super::*;

	#[test]
	fn test_closing_parenthesis() {
		let s = "()";
		assert_eq!(find_closing_parenthesis(&s.to_string()), Some(1));
	}
}