From c0c99ef3adbeeda4424e86d07e8b907f37a2418a Mon Sep 17 00:00:00 2001
From: Ed Page <eopage@gmail.com>
Date: Wed, 23 Jan 2019 07:44:01 -0700
Subject: [PATCH] test: Basic tokenization testing

---
 src/lib.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index c0867d2..b04d263 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,22 +6,26 @@ use std::io::Read;
 
 include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
 
-#[derive(Debug)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token<'t> {
     pub token: &'t [u8],
     pub offset: usize,
 }
 
+impl<'t> Token<'t> {
+    pub fn new(token: &'t [u8], offset: usize) -> Self {
+        Self {
+            token,
+            offset,
+        }
+    }
+}
+
 pub fn tokenize(content: &[u8]) -> impl Iterator<Item = Token> {
     lazy_static::lazy_static! {
         static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b\w+\b"#).unwrap();
     }
-    SPLIT.find_iter(content).map(|m| {
-        Token {
-            token: m.as_bytes(),
-            offset: m.start(),
-        }
-    })
+    SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
 }
 
 #[derive(Debug, Serialize)]
@@ -106,3 +110,55 @@ impl Corrections {
     }
 }
 
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn tokenize_empty_is_empty() {
+        let input = b"";
+        let expected: Vec<Token> = vec![];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_word_is_word() {
+        let input = b"word";
+        let expected: Vec<Token> = vec![Token::new(b"word", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_space_separated_words() {
+        let input = b"A B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_dot_separated_words() {
+        let input = b"A.B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_namespace_separated_words() {
+        let input = b"A::B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_underscore_doesnt_separate() {
+        let input = b"A_B";
+        let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+}