mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-25 10:31:02 -05:00
refactor: Clarify intent of Token
This commit is contained in:
parent
f8d42116da
commit
5992ba110d
3 changed files with 26 additions and 26 deletions
|
@ -6,30 +6,30 @@ mod data;
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_empty(b: &mut test::Bencher) {
|
fn tokenize_empty(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::EMPTY.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::EMPTY.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_no_tokens(b: &mut test::Bencher) {
|
fn tokenize_no_tokens(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::NO_TOKENS.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_single_token(b: &mut test::Bencher) {
|
fn tokenize_single_token(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::SINGLE_TOKEN.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_sherlock(b: &mut test::Bencher) {
|
fn tokenize_sherlock(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::SHERLOCK.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_code(b: &mut test::Bencher) {
|
fn tokenize_code(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::CODE.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::CODE.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn tokenize_corpus(b: &mut test::Bencher) {
|
fn tokenize_corpus(b: &mut test::Bencher) {
|
||||||
b.iter(|| defenestrate::tokens::tokenize(data::CORPUS.as_bytes()).collect::<Vec<_>>());
|
b.iter(|| defenestrate::tokens::Symbol::parse(data::CORPUS.as_bytes()).collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,7 @@ pub fn process_file(path: &std::path::Path, dictionary: &Dictionary, report: rep
|
||||||
File::open(path)?.read_to_end(&mut buffer)?;
|
File::open(path)?.read_to_end(&mut buffer)?;
|
||||||
for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
|
for (line_idx, line) in grep_searcher::LineIter::new(b'\n', &buffer).enumerate() {
|
||||||
let line_num = line_idx + 1;
|
let line_num = line_idx + 1;
|
||||||
for token in tokens::tokenize(line) {
|
for token in tokens::Symbol::parse(line) {
|
||||||
// Correct tokens as-is
|
// Correct tokens as-is
|
||||||
if let Some(correction) = dictionary.correct_bytes(token.token) {
|
if let Some(correction) = dictionary.correct_bytes(token.token) {
|
||||||
let word = String::from_utf8_lossy(token.token);
|
let word = String::from_utf8_lossy(token.token);
|
||||||
|
|
|
@ -1,23 +1,23 @@
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct Token<'t> {
|
pub struct Symbol<'t> {
|
||||||
pub token: &'t [u8],
|
pub token: &'t [u8],
|
||||||
pub offset: usize,
|
pub offset: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Token<'t> {
|
impl<'t> Symbol<'t> {
|
||||||
pub fn new(token: &'t [u8], offset: usize) -> Self {
|
pub fn new(token: &'t [u8], offset: usize) -> Self {
|
||||||
Self {
|
Self {
|
||||||
token,
|
token,
|
||||||
offset,
|
offset,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
pub fn tokenize(content: &[u8]) -> impl Iterator<Item=Token> {
|
pub fn parse<'s>(content: &'s [u8]) -> impl Iterator<Item=Symbol<'s>> {
|
||||||
lazy_static::lazy_static! {
|
lazy_static::lazy_static! {
|
||||||
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
|
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
|
||||||
}
|
}
|
||||||
SPLIT.find_iter(content).map(|m| Token::new(m.as_bytes(), m.start()))
|
SPLIT.find_iter(content).map(|m| Symbol::new(m.as_bytes(), m.start()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -27,48 +27,48 @@ mod test {
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_empty_is_empty() {
|
fn tokenize_empty_is_empty() {
|
||||||
let input = b"";
|
let input = b"";
|
||||||
let expected: Vec<Token> = vec![];
|
let expected: Vec<Symbol> = vec![];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_word_is_word() {
|
fn tokenize_word_is_word() {
|
||||||
let input = b"word";
|
let input = b"word";
|
||||||
let expected: Vec<Token> = vec![Token::new(b"word", 0)];
|
let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_space_separated_words() {
|
fn tokenize_space_separated_words() {
|
||||||
let input = b"A B";
|
let input = b"A B";
|
||||||
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
|
let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_dot_separated_words() {
|
fn tokenize_dot_separated_words() {
|
||||||
let input = b"A.B";
|
let input = b"A.B";
|
||||||
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 2)];
|
let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_namespace_separated_words() {
|
fn tokenize_namespace_separated_words() {
|
||||||
let input = b"A::B";
|
let input = b"A::B";
|
||||||
let expected: Vec<Token> = vec![Token::new(b"A", 0), Token::new(b"B", 3)];
|
let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_underscore_doesnt_separate() {
|
fn tokenize_underscore_doesnt_separate() {
|
||||||
let input = b"A_B";
|
let input = b"A_B";
|
||||||
let expected: Vec<Token> = vec![Token::new(b"A_B", 0)];
|
let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
|
||||||
let actual: Vec<_> = tokenize(input).collect();
|
let actual: Vec<_> = Symbol::parse(input).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue