diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +mod regexp; +pub use regexp::Regexp; diff --git a/src/main.rs b/src/main.rs --- a/src/main.rs +++ b/src/main.rs @@ -1,324 +1,4 @@ -use std::collections::{HashMap, HashSet}; - -const START: usize = usize::MAX; - -trait Token { - fn is_skippable(&self) -> bool {false} - fn list_first(&self) -> Vec; - fn list_last(&self) -> Vec; - fn list_neighbours(&self) -> Vec<(usize, usize)>; -} - -struct Symbol { - position: usize, - value: char -} - -struct Asterisk { - content: Box -} - -struct Plus { - content: Box -} - -struct Chain { - content: Vec> -} - -impl Token for Symbol { - fn list_first(&self) -> Vec { - return vec![self.position]; - } - - fn list_last(&self) -> Vec { - return vec![self.position]; - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - return vec![]; - } -} - -impl Token for Asterisk { - fn is_skippable(&self) -> bool {true} - - fn list_first(&self) -> Vec { - return self.content.list_first(); - } - - fn list_last(&self) -> Vec { - return self.content.list_last(); - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = self.content.list_neighbours(); - - for x in self.list_last() { - for y in self.list_first() { - res.push((x, y)); - } - } - - return res; - } -} - -impl Token for Plus { - fn list_first(&self) -> Vec { - return self.content.list_first(); - } - - fn list_last(&self) -> Vec { - return self.content.list_last(); - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = self.content.list_neighbours(); - - for x in self.list_last() { - for y in self.list_first() { - res.push((x, y)); - } - } - - return res; - } -} - -impl Token for Chain { - fn is_skippable(&self) -> bool { - return self.content.iter().all(|x| x.is_skippable()); - } - - fn list_first(&self) -> Vec { - let mut res = Vec::new(); - for token in self.content.iter() { - res.append(&mut token.list_first()); - if !token.is_skippable() {break;} - } - - return res; - } - - fn list_last(&self) -> Vec { - let mut res = Vec::new(); - for token in self.content.iter().rev() { - res.append(&mut token.list_last()); - if !token.is_skippable() {break;} - } - - return res; - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = Vec::new(); - let mut previous: Vec<&Box> = Vec::new(); - for token in self.content.iter() { - for t in previous.iter() { - for x in t.list_last() { - for y in token.list_first() { - res.push((x, y)); - } - } - } - res.append(&mut token.list_neighbours()); - - if token.is_skippable() { - previous.push(token); - } else { - previous = vec![token]; - } - } - - return res; - } -} - -fn find_closing_parenthesis(s: &String) -> Option { - let chars: Vec = s.chars().collect(); - let mut counter = 0; - - for (i, c) in chars.iter().enumerate() { - if *c == '(' {counter += 1;} - else if *c == ')' {counter -= 1;} - if counter == 0 {return Some(i);} - } - - return None; -} - -fn parse(pattern: &String, offset: usize) -> Chain { - let chars: Vec = pattern.chars().collect(); - let mut res: Vec> = Vec::new(); - let mut i = 0; - while i < pattern.len() { - let c = chars[i]; - match c { - '(' => { - let j = find_closing_parenthesis(&pattern[i..].to_string()).unwrap() + i; - let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1); - res.push(Box::new(inner_content)); - i = j+1; - } - '*' => { - let token = res.pop().unwrap(); - res.push(Box::new(Asterisk{content: token})); - i += 1; - } - '+' => { - let token = res.pop().unwrap(); - res.push(Box::new(Plus{content: token})); - i += 1; - } - c => { - res.push(Box::new(Symbol{position: i+offset, value: c})); - i += 1; - } - } - } - - return Chain{content: res}; -} - -fn encode_set(set: &HashSet) -> u64 { - let mut res = 0; - for x in set.iter() { - res ^= 1<HashSet { - if x == START as u64 {return HashSet::from([START]);} - - let mut x = x; - let mut res: HashSet = HashSet::new(); - - while x > 0 { - let y = x.trailing_zeros(); - res.insert(y as usize); - x ^= 1 << y; - } - - return res; -} - -struct Regexp { - rules: HashMap<(usize, char), HashSet>, - end_states: HashSet -} - -impl Regexp { - fn new(pattern: &String) -> Regexp { - let r = parse(pattern, 0); - let pattern_chars = Vec::from_iter(pattern.chars()); - let mut rules: HashMap<(usize, char), HashSet> = HashMap::new(); - - for i in r.list_first() { - let c = pattern_chars[i]; - let key = (START, c); - match rules.get_mut(&key) { - Some(set) => {set.insert(i);}, - None => {rules.insert(key, HashSet::from([i]));} - }; - } - - for (i, j) in r.list_neighbours() { - let c = pattern_chars[j]; - let key = (i, c); - match rules.get_mut(&key) { - Some(set) => {set.insert(j);}, - None => {rules.insert(key, HashSet::from([j]));} - }; - } - - let mut end_states = HashSet::from_iter(r.list_last().into_iter()); - if r.is_skippable() { - end_states.insert(START); - } - - return Regexp{rules, end_states}; - } - - pub fn eval(&self, s: String) -> bool { - let mut multistate = HashSet::from([START]); - - for c in s.chars() { - let mut new_multistate = HashSet::new(); - - for state in multistate { - if let Some(x) = self.rules.get(&(state, c)) { - new_multistate = new_multistate.union(&x).map(|&y| y).collect(); - } else if let Some(x) = self.rules.get(&(state, '.')) { - new_multistate = new_multistate.union(&x).map(|&y| y).collect(); - } - } - multistate = new_multistate; - } - - return multistate.iter().any(|x| self.end_states.contains(x)); - } - - fn determinize(&self) -> RegexpDFA { - let mut rules: HashMap<(u64, char), u64> = HashMap::new(); - let mut end_states: HashSet = HashSet::new(); - if self.end_states.contains(&START) {end_states.insert(START as u64);} - - let mut stack = Vec::from([START as u64]); - let mut processed_states = HashSet::new(); - while !stack.is_empty() { - let state = stack.pop().unwrap(); - let multistate = decode_set(state); - let mut new_rules: HashMap> = HashMap::new(); - - for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) { - let (_st, c) = key; - if !new_rules.contains_key(c) { - new_rules.insert(*c, HashSet::new()); - } - for target in &self.rules[key] { - new_rules.get_mut(c).unwrap().insert(*target); - } - } - - for (c, target_set) in new_rules.into_iter() { - let encoded_target = encode_set(&target_set); - rules.insert((state, c), encoded_target); - if target_set.iter().any(|st| self.end_states.contains(st)) { - end_states.insert(encoded_target); - } - if !processed_states.contains(&encoded_target) { - stack.push(encoded_target); - processed_states.insert(encoded_target); - } - } - } - - return RegexpDFA{rules, end_states}; - } -} - -struct RegexpDFA { - rules: HashMap<(u64, char), u64>, - end_states: HashSet -} - -impl RegexpDFA { - fn eval(&self, s: String) -> bool { - let mut state = START as u64; - - for c in s.chars() { - if let Some(x) = self.rules.get(&(state, c)) { - state = *x; - } else { - return false; - } - } - - return self.end_states.contains(&state); - } -} +use regexp::Regexp; fn main() { let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; diff --git a/src/main.rs b/src/regexp.rs copy from src/main.rs copy to src/regexp.rs --- a/src/main.rs +++ b/src/regexp.rs @@ -1,186 +1,10 @@ use std::collections::{HashMap, HashSet}; +mod token; +use token::{Token, parse}; + const START: usize = usize::MAX; -trait Token { - fn is_skippable(&self) -> bool {false} - fn list_first(&self) -> Vec; - fn list_last(&self) -> Vec; - fn list_neighbours(&self) -> Vec<(usize, usize)>; -} - -struct Symbol { - position: usize, - value: char -} - -struct Asterisk { - content: Box -} - -struct Plus { - content: Box -} - -struct Chain { - content: Vec> -} - -impl Token for Symbol { - fn list_first(&self) -> Vec { - return vec![self.position]; - } - - fn list_last(&self) -> Vec { - return vec![self.position]; - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - return vec![]; - } -} - -impl Token for Asterisk { - fn is_skippable(&self) -> bool {true} - - fn list_first(&self) -> Vec { - return self.content.list_first(); - } - - fn list_last(&self) -> Vec { - return self.content.list_last(); - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = self.content.list_neighbours(); - - for x in self.list_last() { - for y in self.list_first() { - res.push((x, y)); - } - } - - return res; - } -} - -impl Token for Plus { - fn list_first(&self) -> Vec { - return self.content.list_first(); - } - - fn list_last(&self) -> Vec { - return self.content.list_last(); - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = self.content.list_neighbours(); - - for x in self.list_last() { - for y in self.list_first() { - res.push((x, y)); - } - } - - return res; - } -} - -impl Token for Chain { - fn is_skippable(&self) -> bool { - return self.content.iter().all(|x| x.is_skippable()); - } - - fn list_first(&self) -> Vec { - let mut res = Vec::new(); - for token in self.content.iter() { - res.append(&mut token.list_first()); - if !token.is_skippable() {break;} - } - - return res; - } - - fn list_last(&self) -> Vec { - let mut res = Vec::new(); - for token in self.content.iter().rev() { - res.append(&mut token.list_last()); - if !token.is_skippable() {break;} - } - - return res; - } - - fn list_neighbours(&self) -> Vec<(usize, usize)> { - let mut res = Vec::new(); - let mut previous: Vec<&Box> = Vec::new(); - for token in self.content.iter() { - for t in previous.iter() { - for x in t.list_last() { - for y in token.list_first() { - res.push((x, y)); - } - } - } - res.append(&mut token.list_neighbours()); - - if token.is_skippable() { - previous.push(token); - } else { - previous = vec![token]; - } - } - - return res; - } -} - -fn find_closing_parenthesis(s: &String) -> Option { - let chars: Vec = s.chars().collect(); - let mut counter = 0; - - for (i, c) in chars.iter().enumerate() { - if *c == '(' {counter += 1;} - else if *c == ')' {counter -= 1;} - if counter == 0 {return Some(i);} - } - - return None; -} - -fn parse(pattern: &String, offset: usize) -> Chain { - let chars: Vec = pattern.chars().collect(); - let mut res: Vec> = Vec::new(); - let mut i = 0; - while i < pattern.len() { - let c = chars[i]; - match c { - '(' => { - let j = find_closing_parenthesis(&pattern[i..].to_string()).unwrap() + i; - let inner_content = parse(&pattern[i+1..j].to_string(), offset+i+1); - res.push(Box::new(inner_content)); - i = j+1; - } - '*' => { - let token = res.pop().unwrap(); - res.push(Box::new(Asterisk{content: token})); - i += 1; - } - '+' => { - let token = res.pop().unwrap(); - res.push(Box::new(Plus{content: token})); - i += 1; - } - c => { - res.push(Box::new(Symbol{position: i+offset, value: c})); - i += 1; - } - } - } - - return Chain{content: res}; -} - fn encode_set(set: &HashSet) -> u64 { let mut res = 0; for x in set.iter() { @@ -204,13 +28,13 @@ fn decode_set(x: u64) ->HashSet { return res; } -struct Regexp { +pub struct Regexp { rules: HashMap<(usize, char), HashSet>, end_states: HashSet } impl Regexp { - fn new(pattern: &String) -> Regexp { + pub fn new(pattern: &String) -> Regexp { let r = parse(pattern, 0); let pattern_chars = Vec::from_iter(pattern.chars()); let mut rules: HashMap<(usize, char), HashSet> = HashMap::new(); @@ -260,7 +84,7 @@ impl Regexp { return multistate.iter().any(|x| self.end_states.contains(x)); } - fn determinize(&self) -> RegexpDFA { + pub fn determinize(&self) -> RegexpDFA { let mut rules: HashMap<(u64, char), u64> = HashMap::new(); let mut end_states: HashSet = HashSet::new(); if self.end_states.contains(&START) {end_states.insert(START as u64);} @@ -299,13 +123,13 @@ impl Regexp { } } -struct RegexpDFA { +pub struct RegexpDFA { rules: HashMap<(u64, char), u64>, end_states: HashSet } impl RegexpDFA { - fn eval(&self, s: String) -> bool { + pub fn eval(&self, s: String) -> bool { let mut state = START as u64; for c in s.chars() { @@ -319,15 +143,3 @@ impl RegexpDFA { return self.end_states.contains(&state); } } - -fn main() { - let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] { - println!("# {pattern}"); - let r = Regexp::new(&pattern.to_string()).determinize(); - for &t in tests.iter() { - println!("{t} {}", r.eval(t.to_string())); - } - println!(); - } -} diff --git a/src/main.rs b/src/regexp/token.rs copy from src/main.rs copy to src/regexp/token.rs --- a/src/main.rs +++ b/src/regexp/token.rs @@ -1,28 +1,24 @@ -use std::collections::{HashMap, HashSet}; - -const START: usize = usize::MAX; - -trait Token { +pub trait Token { fn is_skippable(&self) -> bool {false} fn list_first(&self) -> Vec; fn list_last(&self) -> Vec; fn list_neighbours(&self) -> Vec<(usize, usize)>; } -struct Symbol { +pub struct Symbol { position: usize, value: char } -struct Asterisk { +pub struct Asterisk { content: Box } -struct Plus { +pub struct Plus { content: Box } -struct Chain { +pub struct Chain { content: Vec> } @@ -148,7 +144,7 @@ fn find_closing_parenthesis(s: &String) return None; } -fn parse(pattern: &String, offset: usize) -> Chain { +pub fn parse(pattern: &String, offset: usize) -> Chain { let chars: Vec = pattern.chars().collect(); let mut res: Vec> = Vec::new(); let mut i = 0; @@ -181,153 +177,12 @@ fn parse(pattern: &String, offset: usize return Chain{content: res}; } -fn encode_set(set: &HashSet) -> u64 { - let mut res = 0; - for x in set.iter() { - res ^= 1<HashSet { - if x == START as u64 {return HashSet::from([START]);} - - let mut x = x; - let mut res: HashSet = HashSet::new(); - - while x > 0 { - let y = x.trailing_zeros(); - res.insert(y as usize); - x ^= 1 << y; - } - - return res; -} - -struct Regexp { - rules: HashMap<(usize, char), HashSet>, - end_states: HashSet -} - -impl Regexp { - fn new(pattern: &String) -> Regexp { - let r = parse(pattern, 0); - let pattern_chars = Vec::from_iter(pattern.chars()); - let mut rules: HashMap<(usize, char), HashSet> = HashMap::new(); - - for i in r.list_first() { - let c = pattern_chars[i]; - let key = (START, c); - match rules.get_mut(&key) { - Some(set) => {set.insert(i);}, - None => {rules.insert(key, HashSet::from([i]));} - }; - } - - for (i, j) in r.list_neighbours() { - let c = pattern_chars[j]; - let key = (i, c); - match rules.get_mut(&key) { - Some(set) => {set.insert(j);}, - None => {rules.insert(key, HashSet::from([j]));} - }; - } - - let mut end_states = HashSet::from_iter(r.list_last().into_iter()); - if r.is_skippable() { - end_states.insert(START); - } +mod test { + use super::*; - return Regexp{rules, end_states}; - } - - pub fn eval(&self, s: String) -> bool { - let mut multistate = HashSet::from([START]); - - for c in s.chars() { - let mut new_multistate = HashSet::new(); - - for state in multistate { - if let Some(x) = self.rules.get(&(state, c)) { - new_multistate = new_multistate.union(&x).map(|&y| y).collect(); - } else if let Some(x) = self.rules.get(&(state, '.')) { - new_multistate = new_multistate.union(&x).map(|&y| y).collect(); - } - } - multistate = new_multistate; - } - - return multistate.iter().any(|x| self.end_states.contains(x)); - } - - fn determinize(&self) -> RegexpDFA { - let mut rules: HashMap<(u64, char), u64> = HashMap::new(); - let mut end_states: HashSet = HashSet::new(); - if self.end_states.contains(&START) {end_states.insert(START as u64);} - - let mut stack = Vec::from([START as u64]); - let mut processed_states = HashSet::new(); - while !stack.is_empty() { - let state = stack.pop().unwrap(); - let multistate = decode_set(state); - let mut new_rules: HashMap> = HashMap::new(); - - for key in self.rules.keys().filter(|key| multistate.contains(&key.0)) { - let (_st, c) = key; - if !new_rules.contains_key(c) { - new_rules.insert(*c, HashSet::new()); - } - for target in &self.rules[key] { - new_rules.get_mut(c).unwrap().insert(*target); - } - } - - for (c, target_set) in new_rules.into_iter() { - let encoded_target = encode_set(&target_set); - rules.insert((state, c), encoded_target); - if target_set.iter().any(|st| self.end_states.contains(st)) { - end_states.insert(encoded_target); - } - if !processed_states.contains(&encoded_target) { - stack.push(encoded_target); - processed_states.insert(encoded_target); - } - } - } - - return RegexpDFA{rules, end_states}; + #[test] + fn test_closing_parenthesis() { + let s = "()"; + assert_eq!(find_closing_parenthesis(&s.to_string()), Some(1)); } } - -struct RegexpDFA { - rules: HashMap<(u64, char), u64>, - end_states: HashSet -} - -impl RegexpDFA { - fn eval(&self, s: String) -> bool { - let mut state = START as u64; - - for c in s.chars() { - if let Some(x) = self.rules.get(&(state, c)) { - state = *x; - } else { - return false; - } - } - - return self.end_states.contains(&state); - } -} - -fn main() { - let tests = ["", "a", "ab", "aabb", "abab", "abcd", "abcbcdbcd"]; - for pattern in ["a*b*", "a+b+", "(ab)*", "(ab)+", "a((bc)*d)*"] { - println!("# {pattern}"); - let r = Regexp::new(&pattern.to_string()).determinize(); - for &t in tests.iter() { - println!("{t} {}", r.eval(t.to_string())); - } - println!(); - } -} diff --git a/tests/test_regexp.rs b/tests/test_regexp.rs new file mode 100644 --- /dev/null +++ b/tests/test_regexp.rs @@ -0,0 +1,7 @@ +use regexp::Regexp; + +#[test] +fn test_eval() { + let r = Regexp::new(&String::from("(ab)*")); + assert!(r.eval(String::from("ab"))); +}