From fce11d6c3527673ae8c0759e899be41eb8130e17 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Sat, 24 Apr 2021 12:33:35 -0500 Subject: [PATCH] refactor(parser)!: Allow short-circuiting word splitting This is prep for experiments with getting this information ahead of time. See #224 --- benches/tokenize.rs | 3 ++- crates/typos/src/tokens.rs | 50 +++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/benches/tokenize.rs b/benches/tokenize.rs index 9cd0ce9..1426ac8 100644 --- a/benches/tokenize.rs +++ b/benches/tokenize.rs @@ -15,7 +15,8 @@ fn bench_tokenize(c: &mut Criterion) { b.iter(|| parser.parse_str(sample).last()); }); group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| { - let symbol = typos::tokens::Identifier::new_unchecked(sample, 0); + let symbol = + typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0); b.iter(|| symbol.split().last()); }); group.bench_with_input( diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 3d0947f..d94265d 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -105,7 +105,7 @@ impl Tokenizer { self.words_str .find_iter(content) .filter(move |m| self.accept(m.as_str().as_bytes())) - .map(|m| Identifier::new_unchecked(m.as_str(), m.start())) + .map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start())) } pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator> { @@ -114,7 +114,7 @@ impl Tokenizer { .filter(move |m| self.accept(m.as_bytes())) .filter_map(|m| { let s = std::str::from_utf8(m.as_bytes()).ok(); - s.map(|s| Identifier::new_unchecked(s, m.start())) + s.map(|s| Identifier::new_unchecked(s, Case::None, m.start())) }) } @@ -159,12 +159,17 @@ fn is_hex(ident: &[u8]) -> bool { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Identifier<'t> { token: &'t str, + case: Case, offset: usize, } impl<'t> Identifier<'t> { - pub fn new_unchecked(token: &'t str, offset: usize) -> Self { - Self { token, offset } + pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self { + Self { + token, + case, + offset, + } } pub fn token(&self) -> &'t str { @@ -172,7 +177,7 @@ impl<'t> Identifier<'t> { } pub fn case(&self) -> Case { - Case::None + self.case } pub fn offset(&self) -> usize { @@ -181,7 +186,12 @@ impl<'t> Identifier<'t> { /// Split into individual Words. pub fn split(&self) -> impl Iterator> { - SplitIdent::new(self.token, self.offset) + match self.case { + Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)), + _ => itertools::Either::Right( + Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(), + ), + } } } @@ -409,7 +419,7 @@ mod test { let parser = Tokenizer::new(); let input = "word"; - let expected: Vec = vec![Identifier::new_unchecked("word", 0)]; + let expected: Vec = vec![Identifier::new_unchecked("word", Case::None, 0)]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); let actual: Vec<_> = parser.parse_str(input).collect(); @@ -422,8 +432,8 @@ mod test { let input = "A B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 2), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 2), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -437,8 +447,8 @@ mod test { let input = "A.B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 2), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 2), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -452,8 +462,8 @@ mod test { let input = "A::B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 3), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 3), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -466,7 +476,7 @@ mod test { let parser = Tokenizer::new(); let input = "A_B"; - let expected: Vec = vec![Identifier::new_unchecked("A_B", 0)]; + let expected: Vec = vec![Identifier::new_unchecked("A_B", Case::None, 0)]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); let actual: Vec<_> = parser.parse_str(input).collect(); @@ -479,8 +489,8 @@ mod test { let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ - Identifier::new_unchecked("Hello", 0), - Identifier::new_unchecked("World", 17), + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 17), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -497,9 +507,9 @@ mod test { let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ - Identifier::new_unchecked("Hello", 0), - Identifier::new_unchecked("0xDEADBEEF", 6), - Identifier::new_unchecked("World", 17), + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), + Identifier::new_unchecked("World", Case::None, 17), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -564,7 +574,7 @@ mod test { ), ]; for (input, expected) in cases.iter() { - let ident = Identifier::new_unchecked(input, 0); + let ident = Identifier::new_unchecked(input, Case::None, 0); let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect(); assert_eq!(&result, expected); }