Merge pull request #194 from epage/arch2

fix(fix): Update as we go
Ed Page 2021-01-02 18:22:43 -06:00 committed by GitHub
commit 1c392c2606
17 changed files with 1161 additions and 1207 deletions

Cargo.lock generated

@@ -2,9 +2,9 @@
# It is not intended for manual editing.
[[package]]
name = "addr2line"
version = "0.14.0"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c0929d69e78dd9bf5408269919fcbcaeb2e35e5d43e5815517cdc6a8e11a423"
checksum = "a55f82cfe485775d02112886f4169bde0c5894d75e79ead7eafe7e40a25e45f7"
dependencies = [
"gimli",
]
@@ -17,9 +17,9 @@ checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e"
[[package]]
name = "ahash"
version = "0.6.1"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "865f8b0b3fced577b7df82e9b0eb7609595d7209c0b39e78d0646672e244b1b1"
checksum = "a75b7e6a93ecd6dbd2c225154d0fa7f86205574ecaa6c87429fb5f66ee677c44"
dependencies = [
"getrandom 0.2.0",
"lazy_static",
@@ -46,9 +46,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.34"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7"
checksum = "ee67c11feeac938fae061b232e38e0b6d94f97a9df10e6271319325ac4c56a86"
[[package]]
name = "arrayvec"
@@ -207,12 +207,6 @@ dependencies = [
"unicase",
]
[[package]]
name = "const_fn"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab"
[[package]]
name = "content_inspector"
version = "0.2.4"
@@ -224,13 +218,12 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.0"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5"
checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d"
dependencies = [
"autocfg",
"cfg-if 1.0.0",
"const_fn",
"lazy_static",
]
@@ -275,9 +268,9 @@ dependencies = [
"fnv",
"ident_case",
"proc-macro2 1.0.24",
"quote 1.0.7",
"quote 1.0.8",
"strsim 0.9.3",
"syn 1.0.50",
"syn 1.0.57",
]
[[package]]
@@ -287,8 +280,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -312,8 +305,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -324,8 +317,8 @@ checksum = "6604612c19dd3bb353650b715b61f09bcb089dd17bdca1a9a42637079bf5e428"
dependencies = [
"darling",
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -374,8 +367,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -421,17 +414,17 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "funty"
version = "1.0.1"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ba62103ce691c2fd80fbae2213dfdda9ce60804973ac6b6e97de818ea7f52c8"
checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
[[package]]
name = "getrandom"
version = "0.1.15"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if 0.1.10",
"cfg-if 1.0.0",
"libc",
"wasi",
]
@@ -478,9 +471,9 @@ dependencies = [
[[package]]
name = "heck"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205"
checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac"
dependencies = [
"unicode-segmentation",
]
@@ -559,9 +552,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "0.4.6"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6"
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "lazy_static"
@@ -584,9 +577,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.80"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614"
checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb"
[[package]]
name = "log"
@@ -730,9 +723,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
[[package]]
name = "predicates"
version = "1.0.5"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bfead12e90dccead362d62bb2c90a5f6fc4584963645bc7f71a735e0b0735a"
checksum = "73dd9b7b200044694dfede9edf907c1ca19630908443e9447e624993700c6932"
dependencies = [
"difference",
"float-cmp",
@@ -743,15 +736,15 @@ dependencies = [
[[package]]
name = "predicates-core"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06075c3a3e92559ff8929e7a280684489ea27fe44805174c3ebd9328dcb37178"
checksum = "fb3dbeaaf793584e29c58c7e3a82bbb3c7c06b63cea68d13b0e3cddc124104dc"
[[package]]
name = "predicates-tree"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e63c4859013b38a76eca2414c64911fba30def9e3202ac461a2d22831220124"
checksum = "aee95d988ee893cb35c06b148c80ed2cd52c8eea927f50ba7a0be1a786aeab73"
dependencies = [
"predicates-core",
"treeline",
@@ -771,8 +764,8 @@ checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
"version_check",
]
@@ -783,7 +776,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"quote 1.0.8",
"version_check",
]
@@ -822,9 +815,9 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.7"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df"
dependencies = [
"proc-macro2 1.0.24",
]
@@ -841,7 +834,7 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.15",
"getrandom 0.1.16",
"libc",
"rand_chacha",
"rand_core",
@@ -865,7 +858,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.15",
"getrandom 0.1.16",
]
[[package]]
@@ -975,29 +968,29 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "serde"
version = "1.0.117"
version = "1.0.118"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b88fa983de7720629c9387e9f517353ed404164b1e482c970a90c1a4aaf7dc1a"
checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.117"
version = "1.0.118"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbd1ae72adb44aab48f325a02444a5fc079349a8d804c1fc922aed3f7454c74e"
checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
name = "serde_json"
version = "1.0.59"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcac07dbffa1c65e7f816ab9eba78eb142c6d44410f4eeba1e26e4f5dfa56b95"
checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a"
dependencies = [
"itoa",
"ryu",
@@ -1048,8 +1041,8 @@ dependencies = [
"heck",
"proc-macro-error",
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -1065,12 +1058,12 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.50"
version = "1.0.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "443b4178719c5a851e1bde36ce12da21d74a0e60b4d982ec3385a933c812f0f6"
checksum = "4211ce9909eb971f111059df92c45640aad50a619cf55cd76476be803c4c68e6"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"quote 1.0.8",
"unicode-xid 0.2.1",
]
@@ -1114,22 +1107,22 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.22"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e"
checksum = "76cc616c6abf8c8928e2fdcc0dbfab37175edd8fb49a4641066ad1364fdab146"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.22"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56"
checksum = "9be73a2caec27583d0046ef3796c3794f868a5bc813db689eed00c7631275cd1"
dependencies = [
"proc-macro2 1.0.24",
"quote 1.0.7",
"syn 1.0.50",
"quote 1.0.8",
"syn 1.0.57",
]
[[package]]
@@ -1143,9 +1136,9 @@ dependencies = [
[[package]]
name = "toml"
version = "0.5.7"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75cf45bb0bef80604d001caaec0d09da99611b3c0fd39d3080468875cdb65645"
checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa"
dependencies = [
"serde",
]
@@ -1161,16 +1154,11 @@ name = "typos"
version = "0.3.0"
dependencies = [
"anyhow",
"bstr",
"content_inspector",
"derive_more 0.99.11",
"derive_setters",
"itertools",
"log",
"once_cell",
"regex",
"serde",
"serde_json",
"thiserror",
"unicode-segmentation",
]
@@ -1185,21 +1173,27 @@ dependencies = [
"bstr",
"clap",
"clap-verbosity-flag",
"content_inspector",
"derive_more 0.99.11",
"derive_setters",
"difflib",
"env_logger 0.8.2",
"human-panic",
"ignore",
"itertools",
"log",
"phf",
"predicates",
"proc-exit",
"serde",
"serde_json",
"structopt",
"toml",
"typos",
"typos-dict",
"typos-vars",
"unicase",
"unicode-segmentation",
]
[[package]]


@@ -50,6 +50,12 @@ ahash = "0.6.1"
difflib = "0.4"
proc-exit = "1.0"
human-panic = "1.0.3"
content_inspector = "0.2.4"
unicode-segmentation = "1.6.0"
derive_more = "0.99.11"
derive_setters = "0.1"
itertools = "0.9"
serde_json = "1.0"
[dev-dependencies]
assert_fs = "1.0"


@@ -5,131 +5,180 @@ extern crate test;
mod data;
use assert_fs::prelude::*;
use typos::checks::Check;
use typos_cli::checks::FileChecker;
fn bench_files(data: &str, b: &mut test::Bencher) {
let temp = assert_fs::TempDir::new().unwrap();
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
fn bench_parse_ident_str(data: &str, b: &mut test::Bencher) {
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_identifier_parser();
b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent));
}
#[bench]
fn parse_idents_empty_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::EMPTY, b);
}
#[bench]
fn parse_idents_no_tokens_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::NO_TOKENS, b);
}
#[bench]
fn parse_idents_single_token_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::SINGLE_TOKEN, b);
}
#[bench]
fn parse_idents_sherlock_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::SHERLOCK, b);
}
#[bench]
fn parse_idents_code_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::CODE, b);
}
#[bench]
fn parse_idents_corpus_str(b: &mut test::Bencher) {
bench_parse_ident_str(data::CORPUS, b);
}
fn bench_parse_ident_bytes(data: &str, b: &mut test::Bencher) {
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_identifier_parser();
let parser = typos::tokens::Tokenizer::new();
let checks = typos_cli::checks::TyposSettings::new().build_files();
b.iter(|| {
checks.check_bytes(
data.as_bytes(),
checks.check_file(
sample_path.path(),
true,
&parser,
&corrections,
&typos::report::PrintSilent,
&typos_cli::report::PrintSilent,
)
});
temp.close().unwrap();
}
#[bench]
fn parse_idents_empty_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::EMPTY, b);
fn files_empty(b: &mut test::Bencher) {
bench_files(data::EMPTY, b);
}
#[bench]
fn parse_idents_no_tokens_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::NO_TOKENS, b);
fn files_no_tokens(b: &mut test::Bencher) {
bench_files(data::NO_TOKENS, b);
}
#[bench]
fn parse_idents_single_token_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::SINGLE_TOKEN, b);
fn files_single_token(b: &mut test::Bencher) {
bench_files(data::SINGLE_TOKEN, b);
}
#[bench]
fn parse_idents_sherlock_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::SHERLOCK, b);
fn files_sherlock(b: &mut test::Bencher) {
bench_files(data::SHERLOCK, b);
}
#[bench]
fn parse_idents_code_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::CODE, b);
fn files_code(b: &mut test::Bencher) {
bench_files(data::CODE, b);
}
#[bench]
fn parse_idents_corpus_bytes(b: &mut test::Bencher) {
bench_parse_ident_bytes(data::CORPUS, b);
fn files_corpus(b: &mut test::Bencher) {
bench_files(data::CORPUS, b);
}
fn bench_parse_word_str(data: &str, b: &mut test::Bencher) {
fn bench_identifiers(data: &str, b: &mut test::Bencher) {
let temp = assert_fs::TempDir::new().unwrap();
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_word_parser();
b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent));
let parser = typos::tokens::Tokenizer::new();
let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser();
b.iter(|| {
checks.check_file(
sample_path.path(),
true,
&parser,
&corrections,
&typos_cli::report::PrintSilent,
)
});
temp.close().unwrap();
}
#[bench]
fn parse_words_empty(b: &mut test::Bencher) {
bench_parse_word_str(data::EMPTY, b);
fn identifiers_empty(b: &mut test::Bencher) {
bench_identifiers(data::EMPTY, b);
}
#[bench]
fn parse_words_no_tokens(b: &mut test::Bencher) {
bench_parse_word_str(data::NO_TOKENS, b);
fn identifiers_no_tokens(b: &mut test::Bencher) {
bench_identifiers(data::NO_TOKENS, b);
}
#[bench]
fn parse_words_single_token(b: &mut test::Bencher) {
bench_parse_word_str(data::SINGLE_TOKEN, b);
fn identifiers_single_token(b: &mut test::Bencher) {
bench_identifiers(data::SINGLE_TOKEN, b);
}
#[bench]
fn parse_words_sherlock(b: &mut test::Bencher) {
bench_parse_word_str(data::SHERLOCK, b);
fn identifiers_sherlock(b: &mut test::Bencher) {
bench_identifiers(data::SHERLOCK, b);
}
#[bench]
fn parse_words_code(b: &mut test::Bencher) {
bench_parse_word_str(data::CODE, b);
fn identifiers_code(b: &mut test::Bencher) {
bench_identifiers(data::CODE, b);
}
#[bench]
fn parse_words_corpus(b: &mut test::Bencher) {
bench_parse_word_str(data::CORPUS, b);
fn identifiers_corpus(b: &mut test::Bencher) {
bench_identifiers(data::CORPUS, b);
}
fn bench_words(data: &str, b: &mut test::Bencher) {
let temp = assert_fs::TempDir::new().unwrap();
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Tokenizer::new();
let checks = typos_cli::checks::TyposSettings::new().build_word_parser();
b.iter(|| {
checks.check_file(
sample_path.path(),
true,
&parser,
&corrections,
&typos_cli::report::PrintSilent,
)
});
temp.close().unwrap();
}
#[bench]
fn words_empty(b: &mut test::Bencher) {
bench_words(data::EMPTY, b);
}
#[bench]
fn words_no_tokens(b: &mut test::Bencher) {
bench_words(data::NO_TOKENS, b);
}
#[bench]
fn words_single_token(b: &mut test::Bencher) {
bench_words(data::SINGLE_TOKEN, b);
}
#[bench]
fn words_sherlock(b: &mut test::Bencher) {
bench_words(data::SHERLOCK, b);
}
#[bench]
fn words_code(b: &mut test::Bencher) {
bench_words(data::CODE, b);
}
#[bench]
fn words_corpus(b: &mut test::Bencher) {
bench_words(data::CORPUS, b);
}
fn bench_typos(data: &str, b: &mut test::Bencher) {
let temp = assert_fs::TempDir::new().unwrap();
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_typos();
b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent));
let parser = typos::tokens::Tokenizer::new();
let checks = typos_cli::checks::TyposSettings::new().build_typos();
b.iter(|| {
checks.check_file(
sample_path.path(),
true,
&parser,
&corrections,
&typos_cli::report::PrintSilent,
)
});
temp.close().unwrap();
}
#[bench]
@@ -161,54 +210,3 @@ fn typos_code(b: &mut test::Bencher) {
fn typos_corpus(b: &mut test::Bencher) {
bench_typos(data::CORPUS, b);
}
fn bench_check_file(data: &str, b: &mut test::Bencher) {
let temp = assert_fs::TempDir::new().unwrap();
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_typos();
b.iter(|| {
checks.check_file(
sample_path.path(),
true,
&parser,
&corrections,
&typos::report::PrintSilent,
)
});
temp.close().unwrap();
}
#[bench]
fn check_file_empty(b: &mut test::Bencher) {
bench_check_file(data::EMPTY, b);
}
#[bench]
fn check_file_no_tokens(b: &mut test::Bencher) {
bench_check_file(data::NO_TOKENS, b);
}
#[bench]
fn check_file_single_token(b: &mut test::Bencher) {
bench_check_file(data::SINGLE_TOKEN, b);
}
#[bench]
fn check_file_sherlock(b: &mut test::Bencher) {
bench_check_file(data::SHERLOCK, b);
}
#[bench]
fn check_file_code(b: &mut test::Bencher) {
bench_check_file(data::CODE, b);
}
#[bench]
fn check_file_corpus(b: &mut test::Bencher) {
bench_check_file(data::CORPUS, b);
}


@@ -6,19 +6,19 @@ mod data;
#[bench]
fn ident_parse_empty(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(data::EMPTY.as_bytes()).last());
}
#[bench]
fn ident_parse_no_tokens(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(data::NO_TOKENS.as_bytes()).last());
}
#[bench]
fn ident_parse_single_token(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| {
parser.parse_bytes(data::SINGLE_TOKEN.as_bytes()).last();
});
@@ -26,19 +26,19 @@ fn ident_parse_single_token(b: &mut test::Bencher) {
#[bench]
fn ident_parse_sherlock(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(data::SHERLOCK.as_bytes()).last());
}
#[bench]
fn ident_parse_code(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(data::CODE.as_bytes()).last());
}
#[bench]
fn ident_parse_corpus(b: &mut test::Bencher) {
let parser = typos::tokens::Parser::new();
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(data::CORPUS.as_bytes()).last());
}


@@ -20,11 +20,6 @@ thiserror = "1.0"
regex = "1.3"
once_cell = "1.2.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
itertools = "0.9"
bstr = "0.2"
log = "0.4"
unicode-segmentation = "1.7.1"
derive_more = "0.99.11"
derive_setters = "0.1"
content_inspector = "0.2.4"


@@ -1,489 +0,0 @@
use bstr::ByteSlice;
use crate::report;
use crate::tokens;
use crate::Dictionary;
use crate::Status;
pub trait Check: Send + Sync {
fn check_str(
&self,
buffer: &str,
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error>;
fn check_bytes(
&self,
buffer: &[u8],
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error>;
fn check_filenames(&self) -> bool;
fn check_files(&self) -> bool;
fn binary(&self) -> bool;
fn check_filename(
&self,
path: &std::path::Path,
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
if !self.check_filenames() {
return Ok(());
}
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
let context_reporter = ReportContext {
reporter,
context: report::PathContext { path }.into(),
};
self.check_str(file_name, parser, dictionary, &context_reporter)?;
}
Ok(())
}
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
if !self.check_files() {
return Ok(());
}
let buffer = read_file(path, reporter)?;
let (buffer, content_type) = massage_data(buffer)?;
if !explicit && !self.binary() && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
return Ok(());
}
for (line_idx, line) in buffer.lines().enumerate() {
let line_num = line_idx + 1;
let context_reporter = ReportContext {
reporter,
context: report::FileContext { path, line_num }.into(),
};
self.check_bytes(line, parser, dictionary, &context_reporter)?;
}
Ok(())
}
}
struct ReportContext<'m, 'r> {
reporter: &'r dyn report::Report,
context: report::Context<'m>,
}
impl<'m, 'r> report::Report for ReportContext<'m, 'r> {
fn report(&self, msg: report::Message) -> Result<(), std::io::Error> {
let msg = msg.context(Some(self.context.clone()));
self.reporter.report(msg)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TyposSettings {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl TyposSettings {
pub fn new() -> Self {
Default::default()
}
pub fn check_filenames(&mut self, yes: bool) -> &mut Self {
self.check_filenames = yes;
self
}
pub fn check_files(&mut self, yes: bool) -> &mut Self {
self.check_files = yes;
self
}
pub fn binary(&mut self, yes: bool) -> &mut Self {
self.binary = yes;
self
}
pub fn build_typos(&self) -> Typos {
Typos {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_identifier_parser(&self) -> ParseIdentifiers {
ParseIdentifiers {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_word_parser(&self) -> ParseWords {
ParseWords {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_files(&self) -> Files {
Files {}
}
}
impl Default for TyposSettings {
fn default() -> Self {
Self {
check_filenames: true,
check_files: true,
binary: false,
}
}
}
#[derive(Debug, Clone)]
pub struct Typos {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for Typos {
fn check_str(
&self,
buffer: &str,
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
for ident in parser.parse_str(buffer) {
match dictionary.correct_ident(ident) {
Some(Status::Valid) => {}
Some(corrections) => {
let byte_offset = ident.offset();
let msg = report::Typo {
context: None,
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
byte_offset,
typo: ident.token(),
corrections,
};
reporter.report(msg.into())?;
}
None => {
for word in ident.split() {
match dictionary.correct_word(word) {
Some(Status::Valid) => {}
Some(corrections) => {
let byte_offset = word.offset();
let msg = report::Typo {
context: None,
buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()),
byte_offset,
typo: word.token(),
corrections,
};
reporter.report(msg.into())?;
}
None => {}
}
}
}
}
}
Ok(())
}
fn check_bytes(
&self,
buffer: &[u8],
parser: &tokens::Parser,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
for ident in parser.parse_bytes(buffer) {
match dictionary.correct_ident(ident) {
Some(Status::Valid) => {}
Some(corrections) => {
let byte_offset = ident.offset();
let msg = report::Typo {
context: None,
buffer: std::borrow::Cow::Borrowed(buffer),
byte_offset,
typo: ident.token(),
corrections,
};
reporter.report(msg.into())?;
}
None => {
for word in ident.split() {
match dictionary.correct_word(word) {
Some(Status::Valid) => {}
Some(corrections) => {
let byte_offset = word.offset();
let msg = report::Typo {
context: None,
buffer: std::borrow::Cow::Borrowed(buffer),
byte_offset,
typo: word.token(),
corrections,
};
reporter.report(msg.into())?;
}
None => {}
}
}
}
}
}
Ok(())
}
fn check_filenames(&self) -> bool {
self.check_filenames
}
fn check_files(&self) -> bool {
self.check_files
}
fn binary(&self) -> bool {
self.binary
}
}
#[derive(Debug, Clone)]
pub struct ParseIdentifiers {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for ParseIdentifiers {
fn check_str(
&self,
buffer: &str,
parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let msg = report::Parse {
context: None,
kind: report::ParseKind::Identifier,
data: parser.parse_str(buffer).map(|i| i.token()).collect(),
};
if !msg.data.is_empty() {
reporter.report(msg.into())?;
}
Ok(())
}
fn check_bytes(
&self,
buffer: &[u8],
parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let msg = report::Parse {
context: None,
kind: report::ParseKind::Identifier,
data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
};
if !msg.data.is_empty() {
reporter.report(msg.into())?;
}
Ok(())
}
fn check_filenames(&self) -> bool {
self.check_filenames
}
fn check_files(&self) -> bool {
self.check_files
}
fn binary(&self) -> bool {
self.binary
}
}
#[derive(Debug, Clone)]
pub struct ParseWords {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl Check for ParseWords {
fn check_str(
&self,
buffer: &str,
parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let msg = report::Parse {
context: None,
kind: report::ParseKind::Word,
data: parser
.parse_str(buffer)
.flat_map(|ident| ident.split().map(|i| i.token()))
.collect(),
};
if !msg.data.is_empty() {
reporter.report(msg.into())?;
}
Ok(())
}
fn check_bytes(
&self,
buffer: &[u8],
parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let msg = report::Parse {
context: None,
kind: report::ParseKind::Word,
data: parser
.parse_bytes(buffer)
.flat_map(|ident| ident.split().map(|i| i.token()))
.collect(),
};
if !msg.data.is_empty() {
reporter.report(msg.into())?;
}
Ok(())
}
fn check_filenames(&self) -> bool {
self.check_filenames
}
fn check_files(&self) -> bool {
self.check_files
}
fn binary(&self) -> bool {
self.binary
}
}
#[derive(Debug, Clone)]
pub struct Files {}
impl Check for Files {
fn check_str(
&self,
_buffer: &str,
_parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
_reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
Ok(())
}
fn check_bytes(
&self,
_buffer: &[u8],
_parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
_reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
Ok(())
}
fn check_filenames(&self) -> bool {
true
}
fn check_files(&self) -> bool {
true
}
fn binary(&self) -> bool {
true
}
fn check_filename(
&self,
_path: &std::path::Path,
_parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
_reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
Ok(())
}
fn check_file(
&self,
path: &std::path::Path,
_explicit: bool,
_parser: &tokens::Parser,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let msg = report::File::new(path);
reporter.report(msg.into())?;
Ok(())
}
}
fn read_file(
path: &std::path::Path,
reporter: &dyn report::Report,
) -> Result<Vec<u8>, std::io::Error> {
let buffer = match std::fs::read(path) {
Ok(buffer) => buffer,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
Vec::new()
}
};
Ok(buffer)
}
fn massage_data(
buffer: Vec<u8>,
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
let mut content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8
{
content_type = content_inspector::ContentType::BINARY;
}
Ok((buffer, content_type))
}


@@ -1,6 +1,35 @@
use std::borrow::Cow;
#[derive(Clone, PartialEq, Eq, Debug, serde::Serialize, derive_more::From)]
/// Look up the validity of a term.
pub trait Dictionary: Send + Sync {
/// Look up the validity of an Identifier.
///
/// `None` if the status is unknown.
fn correct_ident<'s, 'w>(&'s self, ident: crate::tokens::Identifier<'w>) -> Option<Status<'s>>;
/// Look up the validity of a Word.
///
/// `None` if the status is unknown.
fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
}
pub(crate) struct NullDictionary;
impl Dictionary for NullDictionary {
fn correct_ident<'s, 'w>(
&'s self,
_ident: crate::tokens::Identifier<'w>,
) -> Option<Status<'s>> {
None
}
fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
None
}
}
/// Validity of a term in a Dictionary.
#[derive(Clone, PartialEq, Eq, Debug, serde::Serialize)]
#[serde(rename_all = "snake_case")]
#[serde(untagged)]
pub enum Status<'c> {
@@ -27,6 +56,20 @@ impl<'c> Status<'c> {
}
}
pub fn into_owned(self) -> Status<'static> {
match self {
Status::Valid => Status::Valid,
Status::Invalid => Status::Invalid,
Status::Corrections(corrections) => {
let corrections = corrections
.into_iter()
.map(|c| Cow::Owned(c.into_owned()))
.collect();
Status::Corrections(corrections)
}
}
}
pub fn borrow(&self) -> Status<'_> {
match self {
Status::Corrections(corrections) => {
@@ -40,10 +83,3 @@ impl<'c> Status<'c> {
}
}
}
pub trait Dictionary: Send + Sync {
fn correct_ident<'s, 'w>(&'s self, _ident: crate::tokens::Identifier<'w>)
-> Option<Status<'s>>;
fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
}


@@ -1,7 +1,7 @@
mod dict;
mod parser;
pub mod checks;
pub mod report;
pub mod tokens;
pub use crate::dict::*;
pub use dict::*;
pub use parser::*;

crates/typos/src/parser.rs Normal file

@@ -0,0 +1,147 @@
use crate::tokens;
use crate::Dictionary;
use std::borrow::Cow;
/// Extract typos from the buffer.
#[derive(Clone)]
pub struct ParserBuilder<'p, 'd> {
tokenizer: Option<&'p tokens::Tokenizer>,
dictionary: &'d dyn Dictionary,
}
impl<'p> ParserBuilder<'p, 'static> {
pub fn new() -> Self {
Default::default()
}
}
impl<'p, 'd> ParserBuilder<'p, 'd> {
/// Set the Tokenizer used when parsing.
pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
self.tokenizer = Some(tokenizer);
self
}
/// Set the dictionary used when parsing.
pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
ParserBuilder {
tokenizer: self.tokenizer,
dictionary,
}
}
/// Extract typos from the buffer.
pub fn build(&self) -> TyposParser<'p, 'd> {
TyposParser {
tokenizer: self.tokenizer.unwrap_or(&DEFAULT_TOKENIZER),
dictionary: self.dictionary,
}
}
}
impl<'p> Default for ParserBuilder<'p, 'static> {
fn default() -> Self {
Self {
tokenizer: None,
dictionary: &crate::NullDictionary,
}
}
}
static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
once_cell::sync::Lazy::new(tokens::Tokenizer::new);
/// Extract typos from the buffer.
#[derive(Clone)]
pub struct TyposParser<'p, 'd> {
tokenizer: &'p tokens::Tokenizer,
dictionary: &'d dyn Dictionary,
}
impl<'p, 'd> TyposParser<'p, 'd> {
pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
self.tokenizer
.parse_str(buffer)
.flat_map(move |ident| self.process_ident(ident))
}
pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
self.tokenizer
.parse_bytes(buffer)
.flat_map(move |ident| self.process_ident(ident))
}
fn process_ident<'i, 's: 'i>(
&'s self,
ident: tokens::Identifier<'i>,
) -> impl Iterator<Item = Typo<'i>> {
match self.dictionary.correct_ident(ident) {
Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
Some(corrections) => {
let typo = Typo {
byte_offset: ident.offset(),
typo: ident.token().into(),
corrections,
};
itertools::Either::Left(Some(typo).into_iter())
}
None => itertools::Either::Right(
ident
.split()
.filter_map(move |word| self.process_word(word)),
),
}
}
fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
match self.dictionary.correct_word(word) {
Some(crate::Status::Valid) => None,
Some(corrections) => {
let typo = Typo {
byte_offset: word.offset(),
typo: word.token().into(),
corrections,
};
Some(typo)
}
None => None,
}
}
}
/// An invalid term found in the buffer.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct Typo<'m> {
pub byte_offset: usize,
pub typo: Cow<'m, str>,
pub corrections: crate::Status<'m>,
}
impl<'m> Typo<'m> {
pub fn into_owned(self) -> Typo<'static> {
Typo {
byte_offset: self.byte_offset,
typo: Cow::Owned(self.typo.into_owned()),
corrections: self.corrections.into_owned(),
}
}
pub fn borrow(&self) -> Typo<'_> {
Typo {
byte_offset: self.byte_offset,
typo: Cow::Borrowed(self.typo.as_ref()),
corrections: self.corrections.borrow(),
}
}
}
impl<'m> Default for Typo<'m> {
fn default() -> Self {
Self {
byte_offset: 0,
typo: "".into(),
corrections: crate::Status::Invalid,
}
}
}
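A usage sketch of this new front-end, pairing it with the built-in dictionary that the updated benches use (the input string is made up):

fn main() {
    let dictionary = typos_cli::dict::BuiltIn::new(Default::default());
    let tokenizer = typos::tokens::Tokenizer::new();
    let parser = typos::ParserBuilder::new()
        .tokenizer(&tokenizer)
        .dictionary(&dictionary)
        .build();
    // Each yielded Typo carries its byte offset, the offending token,
    // and the Status returned by the dictionary.
    for typo in parser.parse_str("teh quick brown fox") {
        println!("{}: {:?} -> {:?}", typo.byte_offset, typo.typo, typo.corrections);
    }
}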


@@ -1,13 +1,6 @@
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Case {
Title,
Lower,
Scream,
None,
}
/// Define rules for tokenizing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ParserBuilder {
pub struct TokenizerBuilder {
ignore_hex: bool,
leading_digits: bool,
leading_chars: String,
@@ -15,37 +8,42 @@ pub struct ParserBuilder {
include_chars: String,
}
impl ParserBuilder {
impl TokenizerBuilder {
pub fn new() -> Self {
Default::default()
}
/// Specify that hexadecimal numbers should be ignored.
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
self.ignore_hex = yes;
self
}
/// Specify that leading digits are allowed for Identifiers.
pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
self.leading_digits = yes;
self
}
/// Extend accepted leading characters for Identifiers.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
/// Specify that digits can be included in Identifiers.
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
}
/// Extend accepted characters for Identifiers.
pub fn include_chars(&mut self, chars: String) -> &mut Self {
self.include_chars = chars;
self
}
pub fn build(&self) -> Parser {
pub fn build(&self) -> Tokenizer {
let mut pattern = r#"\b("#.to_owned();
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
@@ -54,7 +52,7 @@ impl ParserBuilder {
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
Parser {
Tokenizer {
words_str,
words_bytes,
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
@@ -77,7 +75,7 @@ impl ParserBuilder {
}
}
impl Default for ParserBuilder {
impl Default for TokenizerBuilder {
fn default() -> Self {
Self {
ignore_hex: true,
@@ -89,17 +87,18 @@ impl Default for ParserBuilder {
}
}
/// Extract Identifiers from a buffer.
#[derive(Debug, Clone)]
pub struct Parser {
pub struct Tokenizer {
words_str: regex::Regex,
words_bytes: regex::bytes::Regex,
ignore_numbers: bool,
ignore_hex: bool,
}
impl Parser {
impl Tokenizer {
pub fn new() -> Self {
ParserBuilder::default().build()
TokenizerBuilder::default().build()
}
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
@@ -132,7 +131,7 @@ impl Parser {
}
}
impl Default for Parser {
impl Default for Tokenizer {
fn default() -> Self {
Self::new()
}
@@ -156,6 +155,7 @@ fn is_hex(ident: &[u8]) -> bool {
HEX.is_match(ident)
}
/// A term composed of Words.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Identifier<'t> {
token: &'t str,
@@ -179,11 +179,13 @@ impl<'t> Identifier<'t> {
self.offset
}
/// Split into individual Words.
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
split_ident(self.token, self.offset)
}
}
/// An indivisible term.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Word<'t> {
token: &'t str,
@@ -237,52 +239,8 @@ impl<'t> Word<'t> {
}
}
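A short sketch of the renamed API end to end: tokenize a buffer and split each Identifier into its case-delimited Words (the input identifier is made up, echoing the test data below):

fn main() {
    let tokenizer = typos::tokens::Tokenizer::new();
    for ident in tokenizer.parse_str("SimpleXMLTokenizer") {
        for word in ident.split() {
            println!("{}", word.token()); // prints Simple, XML, Tokenizer
        }
    }
}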
/// Tracks the current 'mode' of the transformation algorithm as it scans the input string.
///
/// The mode is a tri-state which tracks the case of the last cased character of the current
/// word. If there is no cased character (either lowercase or uppercase) since the previous
word boundary, then the mode is `Boundary`. If the last cased character is lowercase, then
the mode is `Lowercase`. Otherwise, the mode is `Uppercase`.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordMode {
/// There have been no lowercase or uppercase characters in the current word.
Boundary,
/// The previous cased character in the current word is lowercase.
Lowercase,
/// The previous cased character in the current word is uppercase.
Uppercase,
Number,
}
impl WordMode {
fn classify(c: char) -> Self {
if c.is_lowercase() {
WordMode::Lowercase
} else if c.is_uppercase() {
WordMode::Uppercase
} else if c.is_ascii_digit() {
WordMode::Number
} else {
// This assumes all characters are either lower or upper case.
WordMode::Boundary
}
}
fn case(self, last: WordMode) -> Case {
match (self, last) {
(WordMode::Uppercase, WordMode::Uppercase) => Case::Scream,
(WordMode::Uppercase, WordMode::Lowercase) => Case::Title,
(WordMode::Lowercase, WordMode::Lowercase) => Case::Lower,
(WordMode::Number, WordMode::Number) => Case::None,
(WordMode::Number, _)
| (_, WordMode::Number)
| (WordMode::Boundary, _)
| (_, WordMode::Boundary)
| (WordMode::Lowercase, WordMode::Uppercase) => {
unreachable!("Invalid case combination: ({:?}, {:?})", self, last)
}
}
}
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
SplitIdent::new(ident, offset)
}
struct SplitIdent<'s> {
@@ -377,8 +335,61 @@ impl<'s> Iterator for SplitIdent<'s> {
}
}
fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
SplitIdent::new(ident, offset)
/// Format of the term.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Case {
Title,
Lower,
Scream,
None,
}
/// Tracks the current 'mode' of the transformation algorithm as it scans the input string.
///
/// The mode is a tri-state which tracks the case of the last cased character of the current
/// word. If there is no cased character (either lowercase or uppercase) since the previous
word boundary, then the mode is `Boundary`. If the last cased character is lowercase, then
the mode is `Lowercase`. Otherwise, the mode is `Uppercase`.
#[derive(Clone, Copy, PartialEq, Debug)]
enum WordMode {
/// There have been no lowercase or uppercase characters in the current word.
Boundary,
/// The previous cased character in the current word is lowercase.
Lowercase,
/// The previous cased character in the current word is uppercase.
Uppercase,
Number,
}
impl WordMode {
fn classify(c: char) -> Self {
if c.is_lowercase() {
WordMode::Lowercase
} else if c.is_uppercase() {
WordMode::Uppercase
} else if c.is_ascii_digit() {
WordMode::Number
} else {
// This assumes all characters are either lower or upper case.
WordMode::Boundary
}
}
fn case(self, last: WordMode) -> Case {
match (self, last) {
(WordMode::Uppercase, WordMode::Uppercase) => Case::Scream,
(WordMode::Uppercase, WordMode::Lowercase) => Case::Title,
(WordMode::Lowercase, WordMode::Lowercase) => Case::Lower,
(WordMode::Number, WordMode::Number) => Case::None,
(WordMode::Number, _)
| (_, WordMode::Number)
| (WordMode::Boundary, _)
| (_, WordMode::Boundary)
| (WordMode::Lowercase, WordMode::Uppercase) => {
unreachable!("Invalid case combination: ({:?}, {:?})", self, last)
}
}
}
}
#[cfg(test)]
@@ -387,7 +398,7 @@ mod test {
#[test]
fn tokenize_empty_is_empty() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "";
let expected: Vec<Identifier> = vec![];
@@ -399,7 +410,7 @@ mod test {
#[test]
fn tokenize_word_is_word() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "word";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
@@ -411,7 +422,7 @@ mod test {
#[test]
fn tokenize_space_separated_words() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "A B";
let expected: Vec<Identifier> = vec![
@@ -426,7 +437,7 @@ mod test {
#[test]
fn tokenize_dot_separated_words() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "A.B";
let expected: Vec<Identifier> = vec![
@@ -441,7 +452,7 @@ mod test {
#[test]
fn tokenize_namespace_separated_words() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "A::B";
let expected: Vec<Identifier> = vec![
@@ -456,7 +467,7 @@ mod test {
#[test]
fn tokenize_underscore_doesnt_separate() {
let parser = Parser::new();
let parser = Tokenizer::new();
let input = "A_B";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
@@ -468,7 +479,7 @@ mod test {
#[test]
fn tokenize_ignore_hex_enabled() {
let parser = ParserBuilder::new().ignore_hex(true).build();
let parser = TokenizerBuilder::new().ignore_hex(true).build();
let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
@@ -483,7 +494,7 @@ mod test {
#[test]
fn tokenize_ignore_hex_disabled() {
let parser = ParserBuilder::new()
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(true)
.build();
@@ -523,11 +534,11 @@ mod test {
&[("A", Case::Scream, 0), ("String", Case::Title, 1)],
),
(
"SimpleXMLParser",
"SimpleXMLTokenizer",
&[
("Simple", Case::Title, 0),
("XML", Case::Scream, 6),
("Parser", Case::Title, 9),
("Tokenizer", Case::Title, 9),
],
),
(


@@ -12,13 +12,13 @@ arg_enum! {
}
}
pub const PRINT_SILENT: typos::report::PrintSilent = typos::report::PrintSilent;
pub const PRINT_BRIEF: typos::report::PrintBrief = typos::report::PrintBrief;
pub const PRINT_LONG: typos::report::PrintLong = typos::report::PrintLong;
pub const PRINT_JSON: typos::report::PrintJson = typos::report::PrintJson;
pub const PRINT_SILENT: typos_cli::report::PrintSilent = typos_cli::report::PrintSilent;
pub const PRINT_BRIEF: typos_cli::report::PrintBrief = typos_cli::report::PrintBrief;
pub const PRINT_LONG: typos_cli::report::PrintLong = typos_cli::report::PrintLong;
pub const PRINT_JSON: typos_cli::report::PrintJson = typos_cli::report::PrintJson;
impl Format {
pub(crate) fn reporter(self) -> &'static dyn typos::report::Report {
pub(crate) fn reporter(self) -> &'static dyn typos_cli::report::Report {
match self {
Format::Silent => &PRINT_SILENT,
Format::Brief => &PRINT_BRIEF,


@@ -1,27 +1,637 @@
pub(crate) fn check_path(
use bstr::ByteSlice;
use crate::report;
use typos::tokens;
use typos::Dictionary;
pub trait FileChecker: Send + Sync {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
parser: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error>;
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TyposSettings {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl TyposSettings {
pub fn new() -> Self {
Default::default()
}
pub fn check_filenames(&mut self, yes: bool) -> &mut Self {
self.check_filenames = yes;
self
}
pub fn check_files(&mut self, yes: bool) -> &mut Self {
self.check_files = yes;
self
}
pub fn binary(&mut self, yes: bool) -> &mut Self {
self.binary = yes;
self
}
pub fn build_typos(&self) -> Typos {
Typos {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_fix_typos(&self) -> FixTypos {
FixTypos {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_diff_typos(&self) -> DiffTypos {
DiffTypos {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_identifier_parser(&self) -> Identifiers {
Identifiers {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_word_parser(&self) -> Words {
Words {
check_filenames: self.check_filenames,
check_files: self.check_files,
binary: self.binary,
}
}
pub fn build_files(&self) -> FoundFiles {
FoundFiles {
binary: self.binary,
}
}
}
impl Default for TyposSettings {
fn default() -> Self {
Self {
check_filenames: true,
check_files: true,
binary: false,
}
}
}
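A hedged sketch of wiring these settings to a concrete check, mirroring the updated benches above (sample.txt is a made-up path):

use typos_cli::checks::FileChecker;

fn main() -> Result<(), std::io::Error> {
    let dictionary = typos_cli::dict::BuiltIn::new(Default::default());
    let tokenizer = typos::tokens::Tokenizer::new();
    let checks = typos_cli::checks::TyposSettings::new().build_typos();
    checks.check_file(
        std::path::Path::new("sample.txt"),
        true, // explicit: check even if the content looks binary
        &tokenizer,
        &dictionary,
        &typos_cli::report::PrintBrief,
    )
}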
#[derive(Debug, Clone)]
pub struct Typos {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl FileChecker for Typos {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
tokenizer: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new()
.tokenizer(tokenizer)
.dictionary(dictionary)
.build();
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for typo in parser.parse_str(file_name) {
let msg = report::Typo {
context: Some(report::PathContext { path }.into()),
buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()),
byte_offset: typo.byte_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
}
if self.check_files {
let (buffer, content_type) = read_file(path, reporter)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let mut accum_line_num = AccumulateLineNum::new();
for typo in parser.parse_bytes(&buffer) {
let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
let msg = report::Typo {
context: Some(report::FileContext { path, line_num }.into()),
buffer: std::borrow::Cow::Borrowed(line),
byte_offset: line_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct FixTypos {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl FileChecker for FixTypos {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
tokenizer: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new()
.tokenizer(tokenizer)
.dictionary(dictionary)
.build();
if self.check_files {
let (buffer, content_type) = read_file(path, reporter)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let mut fixes = Vec::new();
let mut accum_line_num = AccumulateLineNum::new();
for typo in parser.parse_bytes(&buffer) {
if is_fixable(&typo) {
fixes.push(typo.into_owned());
} else {
let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
let msg = report::Typo {
context: Some(report::FileContext { path, line_num }.into()),
buffer: std::borrow::Cow::Borrowed(line),
byte_offset: line_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
if !fixes.is_empty() {
let buffer = fix_buffer(buffer, fixes.into_iter());
write_file(path, content_type, &buffer, reporter)?;
}
}
}
// Ensure the above write can happen before renaming the file.
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
let mut fixes = Vec::new();
for typo in parser.parse_str(file_name) {
if is_fixable(&typo) {
fixes.push(typo.into_owned());
} else {
let msg = report::Typo {
context: Some(report::PathContext { path }.into()),
buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()),
byte_offset: typo.byte_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
if !fixes.is_empty() {
let file_name = file_name.to_owned().into_bytes();
let new_name = fix_buffer(file_name, fixes.into_iter());
let new_name =
String::from_utf8(new_name).expect("corrections are valid utf-8");
let new_path = path.with_file_name(new_name);
std::fs::rename(path, new_path)?;
}
}
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct DiffTypos {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl FileChecker for DiffTypos {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
tokenizer: &tokens::Tokenizer,
dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
let parser = typos::ParserBuilder::new()
.tokenizer(tokenizer)
.dictionary(dictionary)
.build();
let mut content = Vec::new();
let mut new_content = Vec::new();
if self.check_files {
let (buffer, content_type) = read_file(path, reporter)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let mut fixes = Vec::new();
let mut accum_line_num = AccumulateLineNum::new();
for typo in parser.parse_bytes(&buffer) {
if is_fixable(&typo) {
fixes.push(typo.into_owned());
} else {
let line_num = accum_line_num.line_num(&buffer, typo.byte_offset);
let (line, line_offset) = extract_line(&buffer, typo.byte_offset);
let msg = report::Typo {
context: Some(report::FileContext { path, line_num }.into()),
buffer: std::borrow::Cow::Borrowed(line),
byte_offset: line_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
if !fixes.is_empty() {
new_content = fix_buffer(buffer.clone(), fixes.into_iter());
content = buffer
}
}
}
// Match FixTypos ordering for easy diffing.
let mut new_path = None;
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
let mut fixes = Vec::new();
for typo in parser.parse_str(file_name) {
if is_fixable(&typo) {
fixes.push(typo.into_owned());
} else {
let msg = report::Typo {
context: Some(report::PathContext { path }.into()),
buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()),
byte_offset: typo.byte_offset,
typo: typo.typo.as_ref(),
corrections: typo.corrections,
};
reporter.report(msg.into())?;
}
}
if !fixes.is_empty() {
let file_name = file_name.to_owned().into_bytes();
let new_name = fix_buffer(file_name, fixes.into_iter());
let new_name =
String::from_utf8(new_name).expect("corrections are valid utf-8");
new_path = Some(path.with_file_name(new_name));
}
}
}
if new_path.is_some() || !content.is_empty() {
let original_path = path.display().to_string();
let fixed_path = new_path.as_deref().unwrap_or(path).display().to_string();
let original_content: Vec<_> = content
.lines_with_terminator()
.map(|s| String::from_utf8_lossy(s).into_owned())
.collect();
let fixed_content: Vec<_> = new_content
.lines_with_terminator()
.map(|s| String::from_utf8_lossy(s).into_owned())
.collect();
let diff = difflib::unified_diff(
&original_content,
&fixed_content,
original_path.as_str(),
fixed_path.as_str(),
"original",
"fixed",
0,
);
for line in diff {
print!("{}", line);
}
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct Identifiers {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl FileChecker for Identifiers {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
tokenizer: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for word in tokenizer.parse_str(file_name) {
let msg = report::Parse {
context: Some(report::PathContext { path }.into()),
kind: report::ParseKind::Identifier,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
if self.check_files {
let (buffer, content_type) = read_file(path, reporter)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
for word in tokenizer.parse_bytes(&buffer) {
// HACK: Don't look up the line_num per entry to better match the performance
// of Typos for comparison purposes. We don't really get much out of it
// anyway.
let line_num = 0;
let msg = report::Parse {
context: Some(report::FileContext { path, line_num }.into()),
kind: report::ParseKind::Identifier,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct Words {
check_filenames: bool,
check_files: bool,
binary: bool,
}
impl FileChecker for Words {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
tokenizer: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
if self.check_filenames {
if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
for word in tokenizer.parse_str(file_name).flat_map(|i| i.split()) {
let msg = report::Parse {
context: Some(report::PathContext { path }.into()),
kind: report::ParseKind::Word,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
if self.check_files {
let (buffer, content_type) = read_file(path, reporter)?;
if !explicit && !self.binary && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
for word in tokenizer.parse_bytes(&buffer).flat_map(|i| i.split()) {
// HACK: Don't look up the line_num per entry to better match the performance
// of Typos for comparison purposes. We don't really get much out of it
// anyway.
let line_num = 0;
let msg = report::Parse {
context: Some(report::FileContext { path, line_num }.into()),
kind: report::ParseKind::Word,
data: word.token(),
};
reporter.report(msg.into())?;
}
}
}
Ok(())
}
}
#[derive(Debug, Clone)]
pub struct FoundFiles {
binary: bool,
}
impl FileChecker for FoundFiles {
fn check_file(
&self,
path: &std::path::Path,
explicit: bool,
_parser: &tokens::Tokenizer,
_dictionary: &dyn Dictionary,
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
// Check `self.binary` first so we can easily check performance of walking vs reading
if self.binary {
let msg = report::File::new(path);
reporter.report(msg.into())?;
} else {
let (_buffer, content_type) = read_file(path, reporter)?;
if !explicit && content_type.is_binary() {
let msg = report::BinaryFile { path };
reporter.report(msg.into())?;
} else {
let msg = report::File::new(path);
reporter.report(msg.into())?;
}
}
Ok(())
}
}
pub fn read_file(
path: &std::path::Path,
reporter: &dyn report::Report,
) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
let buffer = match std::fs::read(path) {
Ok(buffer) => buffer,
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
Vec::new()
}
};
let mut content_type = content_inspector::inspect(&buffer);
// HACK: We only support UTF-8 at the moment
if content_type != content_inspector::ContentType::UTF_8_BOM
&& content_type != content_inspector::ContentType::UTF_8
{
content_type = content_inspector::ContentType::BINARY;
}
Ok((buffer, content_type))
}
pub fn write_file(
path: &std::path::Path,
content_type: content_inspector::ContentType,
buffer: &[u8],
reporter: &dyn report::Report,
) -> Result<(), std::io::Error> {
assert!(
content_type == content_inspector::ContentType::UTF_8_BOM
|| content_type == content_inspector::ContentType::UTF_8
|| content_type == content_inspector::ContentType::BINARY
);
match std::fs::write(path, buffer) {
Ok(()) => (),
Err(err) => {
let msg = report::Error::new(err.to_string());
reporter.report(msg.into())?;
}
};
Ok(())
}
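// The assert documents the invariant established by read_file above: inspect
// results are normalized to UTF_8, UTF_8_BOM, or BINARY before a buffer ever
// reaches this function.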
struct AccumulateLineNum {
line_num: usize,
last_offset: usize,
}
impl AccumulateLineNum {
fn new() -> Self {
Self {
// 1-indexed
line_num: 1,
last_offset: 0,
}
}
fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize {
assert!(self.last_offset <= byte_offset);
let slice = &buffer[self.last_offset..byte_offset];
let newlines = slice.find_iter(b"\n").count();
let line_num = self.line_num + newlines;
self.line_num = line_num;
self.last_offset = byte_offset;
line_num
}
}
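// Illustrative walkthrough (not part of the diff): for buffer b"a\nb\nc" and
// typo offsets 0, 2, and 4, `line_num` returns 1, 2, and 3 in turn, scanning
// only the bytes between consecutive offsets rather than restarting from the top.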
fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) {
let line_start = buffer[0..byte_offset]
.rfind_byte(b'\n')
// Skip the newline
.map(|s| s + 1)
.unwrap_or(0);
let line = buffer[line_start..]
.lines()
.next()
.expect("should always be at least a line");
let line_offset = byte_offset - line_start;
(line, line_offset)
}
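// Illustrative example (not part of the diff): extract_line(b"one\ntwo\n", 5)
// returns (b"two", 1) -- the line containing the offset, plus the typo's
// offset within that line.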
fn extract_fix<'t>(typo: &'t typos::Typo<'t>) -> Option<&'t str> {
match &typo.corrections {
typos::Status::Corrections(c) if c.len() == 1 => Some(c[0].as_ref()),
_ => None,
}
}
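// A typo is only auto-fixable when the dictionary proposes exactly one
// correction; anything ambiguous is reported instead of rewritten.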
fn is_fixable(typo: &typos::Typo<'_>) -> bool {
extract_fix(typo).is_some()
}
fn fix_buffer(mut buffer: Vec<u8>, typos: impl Iterator<Item = typos::Typo<'static>>) -> Vec<u8> {
let mut offset = 0isize;
for typo in typos {
let fix = extract_fix(&typo).expect("Caller only provides fixable typos");
let start = ((typo.byte_offset as isize) + offset) as usize;
let end = start + typo.typo.len();
buffer.splice(start..end, fix.as_bytes().iter().copied());
offset += (fix.len() as isize) - (typo.typo.len() as isize);
}
buffer
}
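// Illustrative example (not part of the diff): for buffer b"ba cat" with fixes
// ("ba" -> "bad" at offset 0) and ("cat" -> "dog" at offset 3), the first fix
// grows the buffer by one byte, so the running `offset` of +1 shifts the second
// splice to byte 4, yielding b"bad dog". This assumes typos arrive in ascending
// byte order, as a single left-to-right parse produces them.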
pub fn walk_path(
walk: ignore::Walk,
-checks: &dyn typos::checks::Check,
-parser: &typos::tokens::Parser,
+checks: &dyn FileChecker,
+parser: &typos::tokens::Tokenizer,
dictionary: &dyn typos::Dictionary,
-reporter: &dyn typos::report::Report,
+reporter: &dyn report::Report,
) -> Result<(), ignore::Error> {
for entry in walk {
-check_entry(entry, checks, parser, dictionary, reporter)?;
+walk_entry(entry, checks, parser, dictionary, reporter)?;
}
Ok(())
}
-pub(crate) fn check_path_parallel(
+pub fn walk_path_parallel(
walk: ignore::WalkParallel,
-checks: &dyn typos::checks::Check,
-parser: &typos::tokens::Parser,
+checks: &dyn FileChecker,
+parser: &typos::tokens::Tokenizer,
dictionary: &dyn typos::Dictionary,
-reporter: &dyn typos::report::Report,
+reporter: &dyn report::Report,
) -> Result<(), ignore::Error> {
let error: std::sync::Mutex<Result<(), ignore::Error>> = std::sync::Mutex::new(Ok(()));
walk.run(|| {
Box::new(|entry: Result<ignore::DirEntry, ignore::Error>| {
-match check_entry(entry, checks, parser, dictionary, reporter) {
+match walk_entry(entry, checks, parser, dictionary, reporter) {
Ok(()) => ignore::WalkState::Continue,
Err(err) => {
*error.lock().unwrap() = Err(err);
@ -34,17 +644,16 @@ pub(crate) fn check_path_parallel(
error.into_inner().unwrap()
}
-fn check_entry(
+fn walk_entry(
entry: Result<ignore::DirEntry, ignore::Error>,
-checks: &dyn typos::checks::Check,
-parser: &typos::tokens::Parser,
+checks: &dyn FileChecker,
+parser: &typos::tokens::Tokenizer,
dictionary: &dyn typos::Dictionary,
-reporter: &dyn typos::report::Report,
+reporter: &dyn report::Report,
) -> Result<(), ignore::Error> {
let entry = entry?;
if entry.file_type().map(|t| t.is_file()).unwrap_or(true) {
let explicit = entry.depth() == 0;
-checks.check_filename(entry.path(), parser, dictionary, reporter)?;
checks.check_file(entry.path(), explicit, parser, dictionary, reporter)?;
}


@ -1,93 +0,0 @@
use std::collections::BTreeMap;
use std::sync;
use bstr::ByteSlice;
pub struct Diff<'r> {
reporter: &'r dyn typos::report::Report,
deferred: sync::Mutex<crate::replace::Deferred>,
}
impl<'r> Diff<'r> {
pub(crate) fn new(reporter: &'r dyn typos::report::Report) -> Self {
Self {
reporter,
deferred: sync::Mutex::new(crate::replace::Deferred::default()),
}
}
pub fn show(&self) -> Result<(), std::io::Error> {
let deferred = self.deferred.lock().unwrap();
for (path, corrections) in deferred.content.iter() {
let buffer = std::fs::read(path)?;
let mut original = Vec::new();
let mut corrected = Vec::new();
for (line_idx, line) in buffer.lines_with_terminator().enumerate() {
original.push(String::from_utf8_lossy(line).into_owned());
let line_num = line_idx + 1;
let line = if let Some(corrections) = corrections.get(&line_num) {
let line = line.to_vec();
crate::replace::correct(line, &corrections)
} else {
line.to_owned()
};
corrected.push(String::from_utf8_lossy(&line).into_owned())
}
let display_path = path.display().to_string();
let diff = difflib::unified_diff(
&original,
&corrected,
display_path.as_str(),
display_path.as_str(),
"original",
"corrected",
0,
);
for line in diff {
print!("{}", line);
}
}
Ok(())
}
}
impl<'r> typos::report::Report for Diff<'r> {
fn report(&self, msg: typos::report::Message<'_>) -> Result<(), std::io::Error> {
let typo = match &msg {
typos::report::Message::Typo(typo) => typo,
_ => return self.reporter.report(msg),
};
let corrections = match &typo.corrections {
typos::Status::Corrections(corrections) if corrections.len() == 1 => corrections,
_ => return self.reporter.report(msg),
};
match &typo.context {
Some(typos::report::Context::File(file)) => {
let path = file.path.to_owned();
let line_num = file.line_num;
let correction = crate::replace::Correction::new(
typo.byte_offset,
typo.typo,
corrections[0].as_ref(),
);
let mut deferred = self.deferred.lock().unwrap();
let content = deferred
.content
.entry(path)
.or_insert_with(BTreeMap::new)
.entry(line_num)
.or_insert_with(Vec::new);
content.push(correction);
Ok(())
}
_ => self.reporter.report(msg),
}
}
}


@ -1,2 +1,4 @@
+pub mod checks;
pub mod config;
pub mod dict;
+pub mod report;


@ -7,11 +7,10 @@ use std::io::Write;
use structopt::StructOpt;
mod args;
-mod checks;
-mod config;
-mod dict;
-mod diff;
-mod replace;
+use typos_cli::checks;
+use typos_cli::config;
+use typos_cli::dict;
+use typos_cli::report;
use proc_exit::WithCodeResultExt;
@ -61,7 +60,7 @@ fn run() -> proc_exit::ExitResult {
config.default.update(&args.overrides);
let config = config;
-let parser = typos::tokens::ParserBuilder::new()
+let parser = typos::tokens::TokenizerBuilder::new()
.ignore_hex(config.default.ignore_hex())
.leading_digits(config.default.identifier_leading_digits())
.leading_chars(config.default.identifier_leading_chars().to_owned())
@ -74,7 +73,7 @@ fn run() -> proc_exit::ExitResult {
dictionary.identifiers(config.default.extend_identifiers());
dictionary.words(config.default.extend_words());
-let mut settings = typos::checks::TyposSettings::new();
+let mut settings = checks::TyposSettings::new();
settings
.check_filenames(config.default.check_filename())
.check_files(config.default.check_file())
@ -98,18 +97,11 @@ fn run() -> proc_exit::ExitResult {
} else {
args.format.reporter()
};
-let status_reporter = typos::report::MessageStatus::new(output_reporter);
-let mut reporter: &dyn typos::report::Report = &status_reporter;
-let replace_reporter = replace::Replace::new(reporter);
-let diff_reporter = diff::Diff::new(reporter);
-if args.diff {
-    reporter = &diff_reporter;
-} else if args.write_changes {
-    reporter = &replace_reporter;
-}
+let status_reporter = report::MessageStatus::new(output_reporter);
+let reporter: &dyn report::Report = &status_reporter;
-let (files, identifier_parser, word_parser, checks);
-let selected_checks: &dyn typos::checks::Check = if args.files {
+let (files, identifier_parser, word_parser, checks, fixer, differ);
+let selected_checks: &dyn checks::FileChecker = if args.files {
files = settings.build_files();
&files
} else if args.identifiers {
@ -118,13 +110,19 @@ fn run() -> proc_exit::ExitResult {
} else if args.words {
word_parser = settings.build_word_parser();
&word_parser
+} else if args.write_changes {
+    fixer = settings.build_fix_typos();
+    &fixer
+} else if args.diff {
+    differ = settings.build_diff_typos();
+    &differ
} else {
checks = settings.build_typos();
&checks
};
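// The checker is picked once up front and handed over as a trait object, so
// walk_path and walk_path_parallel below stay identical regardless of which
// check mode is active.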
if single_threaded {
-checks::check_path(
+checks::walk_path(
walk.build(),
selected_checks,
&parser,
@ -132,7 +130,7 @@ fn run() -> proc_exit::ExitResult {
reporter,
)
} else {
-checks::check_path_parallel(
+checks::walk_path_parallel(
walk.build_parallel(),
selected_checks,
&parser,
@ -152,14 +150,6 @@ fn run() -> proc_exit::ExitResult {
if status_reporter.errors_found() {
errors_found = true;
}
-if args.diff {
-    diff_reporter.show().with_code(proc_exit::Code::FAILURE)?;
-} else if args.write_changes {
-    replace_reporter.write().with_code(proc_exit::Code::FAILURE)?;
-}
}
if errors_found {


@ -1,263 +0,0 @@
use std::collections::BTreeMap;
use std::io::Write;
use std::path;
use std::sync;
use bstr::ByteSlice;
pub struct Replace<'r> {
reporter: &'r dyn typos::report::Report,
deferred: sync::Mutex<Deferred>,
}
impl<'r> Replace<'r> {
pub(crate) fn new(reporter: &'r dyn typos::report::Report) -> Self {
Self {
reporter,
deferred: sync::Mutex::new(Deferred::default()),
}
}
pub fn write(&self) -> Result<(), std::io::Error> {
let deferred = self.deferred.lock().unwrap();
for (path, corrections) in deferred.content.iter() {
let buffer = std::fs::read(path)?;
let mut file = std::fs::File::create(path)?;
for (line_idx, line) in buffer.lines_with_terminator().enumerate() {
let line_num = line_idx + 1;
if let Some(corrections) = corrections.get(&line_num) {
let line = line.to_vec();
let line = correct(line, &corrections);
file.write_all(&line)?;
} else {
file.write_all(&line)?;
}
}
}
for (path, corrections) in deferred.paths.iter() {
let orig_name = path
.file_name()
.and_then(|s| s.to_str())
.expect("generating a correction requires the filename to be valid.")
.to_owned()
.into_bytes();
let new_name = correct(orig_name, &corrections);
let new_name = String::from_utf8(new_name).expect("corrections are valid utf-8");
let new_path = path.with_file_name(new_name);
std::fs::rename(path, new_path)?;
}
Ok(())
}
}
impl<'r> typos::report::Report for Replace<'r> {
fn report(&self, msg: typos::report::Message<'_>) -> Result<(), std::io::Error> {
let typo = match &msg {
typos::report::Message::Typo(typo) => typo,
_ => return self.reporter.report(msg),
};
let corrections = match &typo.corrections {
typos::Status::Corrections(corrections) if corrections.len() == 1 => corrections,
_ => return self.reporter.report(msg),
};
match &typo.context {
Some(typos::report::Context::File(file)) => {
let path = file.path.to_owned();
let line_num = file.line_num;
let correction =
Correction::new(typo.byte_offset, typo.typo, corrections[0].as_ref());
let mut deferred = self.deferred.lock().unwrap();
let content = deferred
.content
.entry(path)
.or_insert_with(BTreeMap::new)
.entry(line_num)
.or_insert_with(Vec::new);
content.push(correction);
Ok(())
}
Some(typos::report::Context::Path(path)) => {
let path = path.path.to_owned();
let correction =
Correction::new(typo.byte_offset, typo.typo, corrections[0].as_ref());
let mut deferred = self.deferred.lock().unwrap();
let content = deferred.paths.entry(path).or_insert_with(Vec::new);
content.push(correction);
Ok(())
}
_ => self.reporter.report(msg),
}
}
}
#[derive(Clone, Debug, Default)]
pub(crate) struct Deferred {
pub(crate) content: BTreeMap<path::PathBuf, BTreeMap<usize, Vec<Correction>>>,
pub(crate) paths: BTreeMap<path::PathBuf, Vec<Correction>>,
}
#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
pub(crate) struct Correction {
pub byte_offset: usize,
pub typo: Vec<u8>,
pub correction: Vec<u8>,
}
impl Correction {
pub(crate) fn new(byte_offset: usize, typo: &str, correction: &str) -> Self {
Self {
byte_offset,
typo: typo.as_bytes().to_vec(),
correction: correction.as_bytes().to_vec(),
}
}
}
pub(crate) fn correct(mut line: Vec<u8>, corrections: &[Correction]) -> Vec<u8> {
let mut corrections: Vec<_> = corrections.iter().collect();
corrections.sort_unstable();
corrections.reverse();
for correction in corrections {
let start = correction.byte_offset;
let end = start + correction.typo.len();
line.splice(start..end, correction.correction.iter().copied());
}
line
}
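// Corrections are applied back to front so that a replacement that grows or
// shrinks the line never invalidates the byte offsets of the corrections that
// come before it.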
#[cfg(test)]
mod test {
use super::*;
use assert_fs::prelude::*;
use typos::report::Report;
fn simple_correct(line: &str, corrections: Vec<(usize, &str, &str)>) -> String {
let line = line.as_bytes().to_vec();
let corrections: Vec<_> = corrections
.into_iter()
.map(|(byte_offset, typo, correction)| Correction {
byte_offset,
typo: typo.as_bytes().to_vec(),
correction: correction.as_bytes().to_vec(),
})
.collect();
let actual = correct(line, &corrections);
String::from_utf8(actual).unwrap()
}
#[test]
fn test_correct_single() {
let actual = simple_correct("foo foo foo", vec![(4, "foo", "bar")]);
assert_eq!(actual, "foo bar foo");
}
#[test]
fn test_correct_single_grow() {
let actual = simple_correct("foo foo foo", vec![(4, "foo", "happy")]);
assert_eq!(actual, "foo happy foo");
}
#[test]
fn test_correct_single_shrink() {
let actual = simple_correct("foo foo foo", vec![(4, "foo", "if")]);
assert_eq!(actual, "foo if foo");
}
#[test]
fn test_correct_start() {
let actual = simple_correct("foo foo foo", vec![(0, "foo", "bar")]);
assert_eq!(actual, "bar foo foo");
}
#[test]
fn test_correct_end() {
let actual = simple_correct("foo foo foo", vec![(8, "foo", "bar")]);
assert_eq!(actual, "foo foo bar");
}
#[test]
fn test_correct_end_grow() {
let actual = simple_correct("foo foo foo", vec![(8, "foo", "happy")]);
assert_eq!(actual, "foo foo happy");
}
#[test]
fn test_correct_multiple() {
let actual = simple_correct(
"foo foo foo",
vec![(4, "foo", "happy"), (8, "foo", "world")],
);
assert_eq!(actual, "foo happy world");
}
#[test]
fn test_replace_content() {
let temp = assert_fs::TempDir::new().unwrap();
let input_file = temp.child("foo.txt");
input_file.write_str("1 foo 2\n3 4 5").unwrap();
let primary = typos::report::PrintSilent;
let replace = Replace::new(&primary);
replace
.report(
typos::report::Typo::default()
.context(Some(
typos::report::FileContext::default()
.path(input_file.path())
.line_num(1)
.into(),
))
.buffer(std::borrow::Cow::Borrowed(b"1 foo 2\n3 4 5"))
.byte_offset(2)
.typo("foo")
.corrections(typos::Status::Corrections(vec![
std::borrow::Cow::Borrowed("bar"),
]))
.into(),
)
.unwrap();
replace.write().unwrap();
input_file.assert("1 bar 2\n3 4 5");
}
#[test]
fn test_replace_path() {
let temp = assert_fs::TempDir::new().unwrap();
let input_file = temp.child("foo.txt");
input_file.write_str("foo foo foo").unwrap();
let primary = typos::report::PrintSilent;
let replace = Replace::new(&primary);
replace
.report(
typos::report::Typo::default()
.context(Some(
typos::report::PathContext::default()
.path(input_file.path())
.into(),
))
.buffer(std::borrow::Cow::Borrowed(b"foo.txt"))
.byte_offset(0)
.typo("foo")
.corrections(typos::Status::Corrections(vec![
std::borrow::Cow::Borrowed("bar"),
]))
.into(),
)
.unwrap();
replace.write().unwrap();
input_file.assert(predicates::path::missing());
temp.child("bar.txt").assert("foo foo foo");
}
}


@ -72,7 +72,7 @@ pub struct Typo<'m> {
pub buffer: Cow<'m, [u8]>,
pub byte_offset: usize,
pub typo: &'m str,
-pub corrections: crate::Status<'m>,
+pub corrections: typos::Status<'m>,
}
impl<'m> Default for Typo<'m> {
@ -82,7 +82,7 @@ impl<'m> Default for Typo<'m> {
buffer: Cow::Borrowed(&[]),
byte_offset: 0,
typo: "",
-corrections: crate::Status::Invalid,
+corrections: typos::Status::Invalid,
}
}
}
@ -168,7 +168,7 @@ pub struct Parse<'m> {
#[serde(flatten)]
pub context: Option<Context<'m>>,
pub kind: ParseKind,
-pub data: Vec<&'m str>,
+pub data: &'m str,
}
impl<'m> Default for Parse<'m> {
@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> {
Self {
context: None,
kind: ParseKind::Identifier,
-data: vec![],
+data: "",
}
}
}
@ -234,10 +234,21 @@ impl<'r> MessageStatus<'r> {
impl<'r> Report for MessageStatus<'r> {
fn report(&self, msg: Message) -> Result<(), std::io::Error> {
-self.typos_found.compare_and_swap(false, msg.is_correction(), atomic::Ordering::Relaxed);
-self.errors_found.compare_and_swap(false, msg.is_error(), atomic::Ordering::Relaxed);
+let _ = self.typos_found.compare_exchange(false, msg.is_correction(), atomic::Ordering::Relaxed, atomic::Ordering::Relaxed);
+let _ = self.errors_found.compare_exchange(false, msg.is_error(), atomic::Ordering::Relaxed, atomic::Ordering::Relaxed);
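// The flags only ever go from `false` to `true`: a failed exchange means an
// earlier message already set the flag, so the result is intentionally ignored.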
self.reporter.report(msg)
}
}
@ -265,7 +276,7 @@ impl Report for PrintBrief {
writeln!(io::stdout(), "{}", msg.path.display())?;
}
Message::Parse(msg) => {
-writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+writeln!(io::stdout(), "{}", msg.data)?;
}
Message::Error(msg) => {
log::error!("{}: {}", context_display(&msg.context), msg.msg);
@ -289,7 +300,7 @@ impl Report for PrintLong {
writeln!(io::stdout(), "{}", msg.path.display())?;
}
Message::Parse(msg) => {
-writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+writeln!(io::stdout(), "{}", msg.data)?;
}
Message::Error(msg) => {
log::error!("{}: {}", context_display(&msg.context), msg.msg);
@ -308,8 +319,8 @@ fn print_brief_correction(msg: &Typo) -> Result<(), std::io::Error> {
)
.count();
match &msg.corrections {
-crate::Status::Valid => {}
-crate::Status::Invalid => {
+typos::Status::Valid => {}
+typos::Status::Invalid => {
writeln!(
io::stdout(),
"{}:{}: `{}` is disallowed",
@ -318,7 +329,7 @@ fn print_brief_correction(msg: &Typo) -> Result<(), std::io::Error> {
msg.typo,
)?;
}
-crate::Status::Corrections(corrections) => {
+typos::Status::Corrections(corrections) => {
writeln!(
io::stdout(),
"{}:{}: `{}` -> {}",
@ -345,11 +356,11 @@ fn print_long_correction(msg: &Typo) -> Result<(), std::io::Error> {
)
.count();
match &msg.corrections {
-crate::Status::Valid => {}
-crate::Status::Invalid => {
+typos::Status::Valid => {}
+typos::Status::Invalid => {
writeln!(handle, "error: `{}` is disallowed", msg.typo)?;
}
-crate::Status::Corrections(corrections) => {
+typos::Status::Corrections(corrections) => {
writeln!(
handle,
"error: `{}` should be {}",