mirror of
https://github.com/crate-ci/typos.git
synced 2024-12-23 08:02:15 -05:00
refactor(parser)!: Allow short-circuiting word splitting
This is prep for experiments with getting this information ahead of time. See #224
This commit is contained in:
parent
47d4f77cb5
commit
fce11d6c35
2 changed files with 32 additions and 21 deletions
|
@ -15,7 +15,8 @@ fn bench_tokenize(c: &mut Criterion) {
|
||||||
b.iter(|| parser.parse_str(sample).last());
|
b.iter(|| parser.parse_str(sample).last());
|
||||||
});
|
});
|
||||||
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
|
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
|
||||||
let symbol = typos::tokens::Identifier::new_unchecked(sample, 0);
|
let symbol =
|
||||||
|
typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
|
||||||
b.iter(|| symbol.split().last());
|
b.iter(|| symbol.split().last());
|
||||||
});
|
});
|
||||||
group.bench_with_input(
|
group.bench_with_input(
|
||||||
|
|
|
@ -105,7 +105,7 @@ impl Tokenizer {
|
||||||
self.words_str
|
self.words_str
|
||||||
.find_iter(content)
|
.find_iter(content)
|
||||||
.filter(move |m| self.accept(m.as_str().as_bytes()))
|
.filter(move |m| self.accept(m.as_str().as_bytes()))
|
||||||
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
|
.map(|m| Identifier::new_unchecked(m.as_str(), Case::None, m.start()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
|
@ -114,7 +114,7 @@ impl Tokenizer {
|
||||||
.filter(move |m| self.accept(m.as_bytes()))
|
.filter(move |m| self.accept(m.as_bytes()))
|
||||||
.filter_map(|m| {
|
.filter_map(|m| {
|
||||||
let s = std::str::from_utf8(m.as_bytes()).ok();
|
let s = std::str::from_utf8(m.as_bytes()).ok();
|
||||||
s.map(|s| Identifier::new_unchecked(s, m.start()))
|
s.map(|s| Identifier::new_unchecked(s, Case::None, m.start()))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,12 +159,17 @@ fn is_hex(ident: &[u8]) -> bool {
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub struct Identifier<'t> {
|
pub struct Identifier<'t> {
|
||||||
token: &'t str,
|
token: &'t str,
|
||||||
|
case: Case,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Identifier<'t> {
|
impl<'t> Identifier<'t> {
|
||||||
pub fn new_unchecked(token: &'t str, offset: usize) -> Self {
|
pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
|
||||||
Self { token, offset }
|
Self {
|
||||||
|
token,
|
||||||
|
case,
|
||||||
|
offset,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn token(&self) -> &'t str {
|
pub fn token(&self) -> &'t str {
|
||||||
|
@ -172,7 +177,7 @@ impl<'t> Identifier<'t> {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn case(&self) -> Case {
|
pub fn case(&self) -> Case {
|
||||||
Case::None
|
self.case
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn offset(&self) -> usize {
|
pub fn offset(&self) -> usize {
|
||||||
|
@ -181,7 +186,12 @@ impl<'t> Identifier<'t> {
|
||||||
|
|
||||||
/// Split into individual Words.
|
/// Split into individual Words.
|
||||||
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
|
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
|
||||||
SplitIdent::new(self.token, self.offset)
|
match self.case {
|
||||||
|
Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)),
|
||||||
|
_ => itertools::Either::Right(
|
||||||
|
Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(),
|
||||||
|
),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -409,7 +419,7 @@ mod test {
|
||||||
let parser = Tokenizer::new();
|
let parser = Tokenizer::new();
|
||||||
|
|
||||||
let input = "word";
|
let input = "word";
|
||||||
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
|
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", Case::None, 0)];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
let actual: Vec<_> = parser.parse_str(input).collect();
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
@ -422,8 +432,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A B";
|
let input = "A B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 2),
|
Identifier::new_unchecked("B", Case::None, 2),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -437,8 +447,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A.B";
|
let input = "A.B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 2),
|
Identifier::new_unchecked("B", Case::None, 2),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -452,8 +462,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A::B";
|
let input = "A::B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 3),
|
Identifier::new_unchecked("B", Case::None, 3),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -466,7 +476,7 @@ mod test {
|
||||||
let parser = Tokenizer::new();
|
let parser = Tokenizer::new();
|
||||||
|
|
||||||
let input = "A_B";
|
let input = "A_B";
|
||||||
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
|
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", Case::None, 0)];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
let actual: Vec<_> = parser.parse_str(input).collect();
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
@ -479,8 +489,8 @@ mod test {
|
||||||
|
|
||||||
let input = "Hello 0xDEADBEEF World";
|
let input = "Hello 0xDEADBEEF World";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("Hello", 0),
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
Identifier::new_unchecked("World", 17),
|
Identifier::new_unchecked("World", Case::None, 17),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -497,9 +507,9 @@ mod test {
|
||||||
|
|
||||||
let input = "Hello 0xDEADBEEF World";
|
let input = "Hello 0xDEADBEEF World";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("Hello", 0),
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
Identifier::new_unchecked("0xDEADBEEF", 6),
|
Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
|
||||||
Identifier::new_unchecked("World", 17),
|
Identifier::new_unchecked("World", Case::None, 17),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -564,7 +574,7 @@ mod test {
|
||||||
),
|
),
|
||||||
];
|
];
|
||||||
for (input, expected) in cases.iter() {
|
for (input, expected) in cases.iter() {
|
||||||
let ident = Identifier::new_unchecked(input, 0);
|
let ident = Identifier::new_unchecked(input, Case::None, 0);
|
||||||
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
|
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
|
||||||
assert_eq!(&result, expected);
|
assert_eq!(&result, expected);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue