diff --git a/benches/tokenize.rs b/benches/tokenize.rs
index 8317856..df432df 100644
--- a/benches/tokenize.rs
+++ b/benches/tokenize.rs
@@ -6,30 +6,30 @@ mod data;
 
 #[bench]
 fn tokenize_empty(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_no_tokens(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_single_token(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_sherlock(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_code(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).collect::<Vec<_>>());
 }
 
 #[bench]
 fn tokenize_corpus(b: &mut test::Bencher) {
-    b.iter(|| defenestrate::tokens::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>());
+    b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).collect::<Vec<_>>());
 }
diff --git a/src/lib.rs b/src/lib.rs
index edad366..b044eff 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,7 +16,7 @@ pub fn process_file(path: &std::path::Path, dictionary: &Dictionary, report: rep
     File::open(path)?.read_to_end(&mut buffer)?;
     for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
         let line_num = line_idx + 1;
-        for token in tokens::tokenize(line) {
+        for token in tokens::Symbol::parse(line) {
             // Correct tokens as-is
             if let Some(correction) = dictionary.correct_bytes(token.token) {
                 let word = String::from_utf8_lossy(token.token);
diff --git a/src/tokens.rs b/src/tokens.rs
index 2bd4574..412ef39 100644
--- a/src/tokens.rs
+++ b/src/tokens.rs
@@ -1,23 +1,23 @@
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Token<'t> {
+pub struct Symbol<'t> {
     pub token: &'t [u8],
     pub offset: usize,
 }
 
-impl<'t> Token<'t> {
+impl<'t> Symbol<'t> {
     pub fn new(token: &'t [u8], offset: usize) -> Self {
         Self {
             token,
             offset,
         }
     }
-}
 
-pub fn tokenize(content: &[u8]) -> impl Iterator<Item = Token<'_>> {
-    lazy_static::lazy_static! {
-        static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
+    pub fn parse<'s>(content: &'s [u8]) -> impl Iterator<Item = Symbol<'s>> {
+        lazy_static::lazy_static! {
+            static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
+        }
+        SPLIT.find_iter(content).map(|m| Symbol::new(m.as_bytes(), m.start()))
     }
-    SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
 }
 
 #[cfg(test)]
@@ -27,48 +27,48 @@ mod test {
     #[test]
     fn tokenize_empty_is_empty() {
        let input = b"";
-        let expected: Vec<Token> = vec![];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_word_is_word() {
         let input = b"word";
-        let expected: Vec<Token> = vec![Token::new(b"word", 0)];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_space_separated_words() {
         let input = b"A B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_dot_separated_words() {
         let input = b"A.B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_namespace_separated_words() {
         let input = b"A::B";
-        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 
     #[test]
     fn tokenize_underscore_doesnt_separate() {
         let input = b"A_B";
-        let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
-        let actual: Vec<_> = tokenize(input).collect();
+        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
+        let actual: Vec<_> = Symbol::parse(input).collect();
         assert_eq!(expected, actual);
     }
 }
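
For reference, a minimal sketch of how the renamed API reads at a call site. Symbol::parse, the token and offset fields, and the defenestrate::tokens path are all taken from the diff above; the input literal and the printing loop are illustrative only, not part of the change.

    // Hypothetical caller, mirroring the per-line loop in src/lib.rs above.
    // Symbol::parse yields Symbols borrowing the input: each carries the
    // matched byte slice (token) and its byte offset into the input.
    for symbol in defenestrate::tokens::Symbol::parse(b"A::B c_d") {
        // Prints: A at 0, B at 3, c_d at 5
        println!(
            "{} at {}",
            String::from_utf8_lossy(symbol.token),
            symbol.offset
        );
    }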