test: Basic tokenization testing

Ed Page 2019-01-23 07:44:01 -07:00
parent 98b6c075f9
commit c0c99ef3ad


@@ -6,22 +6,26 @@ use std::io::Read;
 include!(concat!(env!("OUT_DIR"), "/codegen.rs"));
 
-#[derive(Debug)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token<'t> {
     pub token: &'t [u8],
     pub offset: usize,
 }
 
+impl<'t> Token<'t> {
+    pub fn new(token: &'t [u8], offset: usize) -> Self {
+        Self {
+            token,
+            offset,
+        }
+    }
+}
+
 pub fn tokenize(content: &[u8]) -> impl Iterator<Item=Token> {
     lazy_static::lazy_static! {
         static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b\w+\b"#).unwrap();
     }
-    SPLIT.find_iter(content).map(|m| {
-        Token {
-            token: m.as_bytes(),
-            offset: m.start(),
-        }
-    })
+    SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
 }
 
 #[derive(Debug, Serialize)]
@@ -106,3 +110,55 @@ impl Corrections {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn tokenize_empty_is_empty() {
+        let input = b"";
+        let expected: Vec<Token> = vec![];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_word_is_word() {
+        let input = b"word";
+        let expected: Vec<Token> = vec![Token::new(b"word", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_space_separated_words() {
+        let input = b"A B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_dot_separated_words() {
+        let input = b"A.B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_namespace_separated_words() {
+        let input = b"A::B";
+        let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_underscore_doesnt_separate() {
+        let input = b"A_B";
+        let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
+        let actual: Vec<_> = tokenize(input).collect();
+        assert_eq!(expected, actual);
+    }
+}
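
For reference, a minimal standalone sketch of the tokenizer as it stands after this commit. The types, regex, and crates (`regex`, `lazy_static`) come straight from the diff; `main` and its sample input are illustrative additions, not part of the commit.

    // Standalone sketch of the tokenizer above; assumes the `regex` and
    // `lazy_static` crates are available. `main` is illustrative only.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct Token<'t> {
        pub token: &'t [u8],
        pub offset: usize,
    }

    impl<'t> Token<'t> {
        pub fn new(token: &'t [u8], offset: usize) -> Self {
            Self { token, offset }
        }
    }

    pub fn tokenize(content: &[u8]) -> impl Iterator<Item = Token> {
        lazy_static::lazy_static! {
            // `\w` counts `_` as a word character, so `A_B` stays one
            // token, while `.` and `::` act as separators.
            static ref SPLIT: regex::bytes::Regex =
                regex::bytes::Regex::new(r#"\b\w+\b"#).unwrap();
        }
        SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
    }

    fn main() {
        for t in tokenize(b"A::B_C") {
            // Prints `A` at 0 and `B_C` at 3.
            println!("{} at {}", String::from_utf8_lossy(t.token), t.offset);
        }
    }

Note that deriving `Clone`, `PartialEq`, and `Eq` on `Token` is what lets the new tests compare whole `Vec<Token>` values with `assert_eq!`, and the added `Token::new` constructor keeps both the tests and the `find_iter` closure to one line per token.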