feat: Support english dialects

The goal is to be as accepting and unobtrusive to new code bases as
possible.  To this end, we correct typos into the closest English
dialect.

If someone wants to opt in, they can have typos corrected to a specific
English dialect.

Fixes #52
Fixes #22
This commit is contained in:
Ed Page 2020-05-27 20:46:41 -05:00
parent f1cf48b6be
commit ab4a5bbdaf
22 changed files with 236208 additions and 37310 deletions

View file

@ -1,3 +1,3 @@
*_codegen.rs
*codegen.rs
assets/
typos/benches/corrections.rs
benches/corrections.rs

268
Cargo.lock generated
View file

@ -2,9 +2,9 @@
# It is not intended for manual editing.
[[package]]
name = "aho-corasick"
version = "0.7.10"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada"
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
dependencies = [
"memchr",
]
@ -20,18 +20,15 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.31"
version = "1.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
checksum = "6b602bfe940d21c130f3895acd65221e8a61270debe89d628b9cb4e3ccb8569b"
[[package]]
name = "arrayvec"
version = "0.4.12"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9"
dependencies = [
"nodrop",
]
checksum = "cff77d8686867eceff3105329d4698d96c2391c176d5d03adc90c7389162b5b8"
[[package]]
name = "assert_fs"
@ -90,15 +87,15 @@ checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
[[package]]
name = "cfg-if"
version = "0.1.9"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "clap"
version = "2.33.1"
version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"ansi_term",
"atty",
@ -203,10 +200,10 @@ checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [
"fnv",
"ident_case",
"proc-macro2 1.0.18",
"quote 1.0.6",
"proc-macro2 1.0.19",
"quote 1.0.7",
"strsim 0.9.3",
"syn 1.0.33",
"syn 1.0.38",
]
[[package]]
@ -216,8 +213,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core",
"quote 1.0.6",
"syn 1.0.33",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -240,9 +237,9 @@ version = "0.99.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "298998b1cf6b5b2c8a7b023dfd45821825ce3ba8a8af55c921a0e734e4653f76"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -252,9 +249,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6604612c19dd3bb353650b715b61f09bcb089dd17bdca1a9a42637079bf5e428"
dependencies = [
"darling",
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -270,10 +267,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "either"
version = "1.5.3"
name = "edit-distance"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
checksum = "bbbaaaf38131deb9ca518a274a45bfdb8771f139517b073b16c2d3d32ae5037b"
[[package]]
name = "either"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd56b59865bce947ac5958779cfa508f6c3b9497cc762b7e24a12d11ccde2c4f"
[[package]]
name = "enumflags2"
@ -290,9 +293,9 @@ version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -310,9 +313,9 @@ dependencies = [
[[package]]
name = "float-cmp"
version = "0.6.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da62c4f1b81918835a8c6a484a397775fff5953fe83529afd51b05f5c6a6617d"
checksum = "e1267f4ac4f343772758f7b1bdcbe767c218bbab93bb432acbf5162bbf85a6c4"
dependencies = [
"num-traits",
]
@ -368,9 +371,9 @@ dependencies = [
[[package]]
name = "hermit-abi"
version = "0.1.13"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
checksum = "3deed196b6e7f9e44a2ae8d94225d80302d81208b1bb673fd21fe634645c85a9"
dependencies = [
"libc",
]
@ -419,9 +422,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "0.4.5"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6"
[[package]]
name = "lazy_static"
@ -431,29 +434,28 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "lexical-core"
version = "0.6.7"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f86d66d380c9c5a685aaac7a11818bdfa1f733198dfd9ec09c70b762cd12ad6f"
checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616"
dependencies = [
"arrayvec",
"bitflags",
"cfg-if",
"rustc_version",
"ryu",
"static_assertions",
]
[[package]]
name = "libc"
version = "0.2.71"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9457b06509d27052635f90d6466700c65095fdf75409b3fbdd903e988b886f49"
checksum = "a2f02823cf78b754822df5f7f268fb59822e7296276d3e069d8e8cb26a14bd10"
[[package]]
name = "log"
version = "0.4.8"
version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b"
dependencies = [
"cfg-if",
]
@ -486,12 +488,6 @@ dependencies = [
"unicase",
]
[[package]]
name = "nodrop"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "nom"
version = "5.1.2"
@ -511,9 +507,9 @@ checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be"
[[package]]
name = "num-traits"
version = "0.2.11"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096"
checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611"
dependencies = [
"autocfg",
]
@ -559,15 +555,15 @@ dependencies = [
[[package]]
name = "ppv-lite86"
version = "0.2.8"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea"
checksum = "c36fa947111f5c62a733b652544dd0016a43ce89619538a8ef92724a6f501a20"
[[package]]
name = "predicates"
version = "1.0.4"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "347a1b6f0b21e636bc9872fb60b83b8e185f6f5516298b8238699f7f9a531030"
checksum = "96bfead12e90dccead362d62bb2c90a5f6fc4584963645bc7f71a735e0b0735a"
dependencies = [
"difference",
"float-cmp",
@ -594,27 +590,25 @@ dependencies = [
[[package]]
name = "proc-macro-error"
version = "1.0.2"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.2"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"syn-mid",
"proc-macro2 1.0.19",
"quote 1.0.7",
"version_check",
]
@ -629,11 +623,11 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.18"
version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "beae6331a816b1f65d04c45b078fd8e6c93e8071771f41b8163255bbd8d7c8fa"
checksum = "04f5f085b5d71e2188cb8271e5da0161ad52c3f227a661a3c135fdf28e258b12"
dependencies = [
"unicode-xid 0.2.0",
"unicode-xid 0.2.1",
]
[[package]]
@ -653,11 +647,11 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.6"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea"
checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37"
dependencies = [
"proc-macro2 1.0.18",
"proc-macro2 1.0.19",
]
[[package]]
@ -713,9 +707,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.1.56"
version = "0.1.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]]
name = "regex"
@ -746,9 +740,9 @@ checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
[[package]]
name = "remove_dir_all"
version = "0.5.2"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e"
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
dependencies = [
"winapi",
]
@ -794,29 +788,29 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "serde"
version = "1.0.114"
version = "1.0.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
checksum = "e54c9a88f2da7238af84b5101443f0c0d0a3bbdc455e34a5c9497b1903ed55d5"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.114"
version = "1.0.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0be94b04690fbaed37cddffc5c134bf537c8e3329d53e982fe04c374978f8e"
checksum = "609feed1d0a73cc36a0182a840a9b37b4a82f0b1150369f0536a9e3f2a31dc48"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
name = "serde_json"
version = "1.0.56"
version = "1.0.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3433e879a558dde8b5e8feb2a04899cf34fdde1fafb894687e52105fc1162ac3"
checksum = "164eacbdb13512ec2745fb09d51fd5b22b0d65ed294a1dcf7285a360c80a675c"
dependencies = [
"itoa",
"ryu",
@ -831,9 +825,9 @@ checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]]
name = "static_assertions"
version = "0.3.4"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
@ -849,9 +843,9 @@ checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
[[package]]
name = "structopt"
version = "0.3.15"
version = "0.3.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de2f5e239ee807089b62adce73e48c625e0ed80df02c7ab3f068f5db5281065c"
checksum = "de5472fb24d7e80ae84a7801b7978f95a19ec32cb1876faea59ab711eb901976"
dependencies = [
"clap",
"lazy_static",
@ -860,15 +854,15 @@ dependencies = [
[[package]]
name = "structopt-derive"
version = "0.4.8"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "510413f9de616762a4fbeab62509bf15c729603b72d7cd71280fbca431b1c118"
checksum = "1e0eb37335aeeebe51be42e2dc07f031163fbabfa6ac67d7ea68b5c2f68d5f99"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -884,24 +878,13 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.33"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d5d96e8cbb005d6959f119f773bfaebb5684296108fb32600c00cde305b2cd"
checksum = "e69abc24912995b3038597a7a593be5053eb0fb44f3cc5beec0deb421790c1f4"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"unicode-xid 0.2.0",
]
[[package]]
name = "syn-mid"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"unicode-xid 0.2.1",
]
[[package]]
@ -951,9 +934,9 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd80fc12f73063ac132ac92aceea36734f04a1d93c1240c6944e23a3b8841793"
dependencies = [
"proc-macro2 1.0.18",
"quote 1.0.6",
"syn 1.0.33",
"proc-macro2 1.0.19",
"quote 1.0.7",
"syn 1.0.38",
]
[[package]]
@ -1017,11 +1000,21 @@ dependencies = [
"toml",
"typos",
"typos-dict",
"typos-vars",
"unicase",
]
[[package]]
name = "typos-codegen"
name = "typos-dict"
version = "0.2.1"
dependencies = [
"log",
"phf",
"unicase",
]
[[package]]
name = "typos-dict-codegen"
version = "1.0.2"
dependencies = [
"codegenrs",
@ -1033,12 +1026,45 @@ dependencies = [
]
[[package]]
name = "typos-dict"
name = "typos-dict-verify"
version = "1.0.2"
dependencies = [
"codegenrs",
"csv",
"edit-distance",
"itertools",
"structopt",
"unicase",
"varcon",
]
[[package]]
name = "typos-vars"
version = "0.2.1"
dependencies = [
"log",
"phf",
"unicase",
"varcon-core",
]
[[package]]
name = "typos-vars-codegen"
version = "1.0.2"
dependencies = [
"clap",
"clap-verbosity-flag",
"codegenrs",
"env_logger",
"itertools",
"log",
"phf",
"phf_codegen",
"structopt",
"typos",
"unicase",
"varcon",
"varcon-core",
]
[[package]]
@ -1058,9 +1084,9 @@ checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
[[package]]
name = "unicode-width"
version = "0.1.7"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
[[package]]
name = "unicode-xid"
@ -1070,9 +1096,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
[[package]]
name = "unicode-xid"
version = "0.2.0"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "varcon"
@ -1150,9 +1176,9 @@ dependencies = [
[[package]]
name = "winapi"
version = "0.3.8"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",

View file

@ -1,7 +1,8 @@
[workspace]
members = [
"crates/typos",
"crates/typos-dict", "crates/typos-dict/codegen",
"crates/typos-dict", "crates/typos-dict/codegen", "crates/typos-dict/verify",
"crates/typos-vars", "crates/typos-vars/codegen",
"crates/codespell-dict", "crates/codespell-dict/codegen",
"crates/misspell-dict", "crates/misspell-dict/codegen",
"crates/wikipedia-dict", "crates/wikipedia-dict/codegen",
@ -32,6 +33,7 @@ codecov = { repository = "crate-ci/typos" }
[dependencies]
typos = { version = "^0.3", path = "crates/typos" }
typos-dict = { version = "^0.2", path = "crates/typos-dict" }
typos-vars = { version = "^0.2", path = "crates/typos-vars" }
phf = { version = "0.8", features = ["unicase"] }
unicase = "2.5"
anyhow = "1.0"

View file

@ -154,7 +154,13 @@ stages:
steps:
- template: install-rust.yml@templates
- script: |
cargo run --package typos-codegen -- --output crates/typos-dict/src/dict_codegen.rs --check
cargo run --package typos-dict-codegen -- --output crates/typos-dict/src/dict_codegen.rs --check
displayName: Verify typos-dict
- script: |
cargo run --package typos-vars-codegen -- --output crates/typos-vars/src/vars_codegen.rs --check
displayName: Verify typos-vars
- script: |
cargo run --package typos-dict-verify -- --input crates/typos-dict/assets/words.csv --output crates/typos-dict/assets/words.csv --check
displayName: Verify typos-dict
- script: |
cargo run --package codespell-codegen -- --output crates/codespell-dict/src/dict_codegen.rs --check

View file

@ -4,12 +4,12 @@ extern crate test;
#[bench]
fn load_corrections(b: &mut test::Bencher) {
b.iter(|| typos_cli::dict::BuiltIn::new());
b.iter(|| typos_cli::dict::BuiltIn::new(Default::default()));
}
#[bench]
fn correct_word_hit(b: &mut test::Bencher) {
let corrections = typos_cli::dict::BuiltIn::new();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let input = typos::tokens::Word::new("successs", 0).unwrap();
assert_eq!(
corrections.correct_word(input),
@ -20,7 +20,7 @@ fn correct_word_hit(b: &mut test::Bencher) {
#[bench]
fn correct_word_miss(b: &mut test::Bencher) {
let corrections = typos_cli::dict::BuiltIn::new();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let input = typos::tokens::Word::new("success", 0).unwrap();
assert!(corrections.correct_word(input).is_empty());
b.iter(|| corrections.correct_word(input));

View file

@ -184,7 +184,7 @@ fn bench_check_file(data: &str, b: &mut test::Bencher) {
let sample_path = temp.child("sample");
sample_path.write_str(data).unwrap();
let corrections = typos_cli::dict::BuiltIn::new();
let corrections = typos_cli::dict::BuiltIn::new(Default::default());
let parser = typos::tokens::Parser::new();
let checks = typos::checks::TyposSettings::new().build_checks();
b.iter(|| {

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
[package]
name = "typos-codegen"
name = "typos-dict-codegen"
version = "1.0.2"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,24 @@
[package]
name = "typos-dict-verify"
version = "1.0.2"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"
repository = "https://github.com/crate-ci/typos"
readme = "../../../README.md"
categories = ["text-processing"]
license = "MIT"
edition = "2018"
publish = false
[badges]
azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" }
[dependencies]
csv = "1.0"
unicase = "2.5"
codegenrs = "0.1"
structopt = "0.3"
varcon = { version = "0.2", path = "../../varcon" }
itertools = "0.9"
edit-distance = "2.1"

View file

@ -0,0 +1,99 @@
use std::collections::HashMap;
use std::collections::HashSet;
use structopt::StructOpt;
/// Rewrites the typo dictionary `dict` (header-less CSV rows of `typo,correction`) into `file`.
///
/// Rows whose typo is a word known to varcon are dropped. For the remaining rows, the
/// correction is swapped for a related varcon word when `find_best_match` finds one that is
/// closer to the typo than the current correction.
///
/// Panics if a record cannot be read or written.
fn generate<W: std::io::Write>(file: &mut W, dict: &[u8]) {
    let banned = disallowed_typos();
    let relatives = related_words();

    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(dict);
    let mut wtr = csv::Writer::from_writer(file);

    for row in reader.records() {
        let row = row.unwrap();
        let typo = &row[0];
        let correction = &row[1];

        // A "typo" that is a real word in some dialect must not be corrected.
        if !banned.contains(&unicase::UniCase::new(typo)) {
            let preferred = match relatives.get(correction) {
                Some(words) => find_best_match(typo, correction, words).unwrap_or(correction),
                None => correction,
            };
            wtr.write_record(&[typo, preferred]).unwrap();
        }
    }

    wtr.flush().unwrap();
}
/// Returns every word listed as a variant in any varcon entry, wrapped in `UniCase`.
///
/// `generate` skips dictionary rows whose typo is in this set.
fn disallowed_typos() -> HashSet<unicase::UniCase<&'static str>> {
    let mut known = HashSet::new();
    for cluster in varcon::VARCON.iter() {
        for entry in cluster.entries.iter() {
            for variant in entry.variants.iter() {
                known.insert(unicase::UniCase::new(variant.word));
            }
        }
    }
    known
}
/// Maps each varcon word to the other words that share an entry with it.
///
/// A variant is only considered when at least one of its types is not tagged
/// `Improper`. A word never appears in its own set.
fn related_words() -> HashMap<&'static str, HashSet<&'static str>> {
    let mut words: HashMap<&'static str, HashSet<&'static str>> = HashMap::new();

    for entry in varcon::VARCON.iter().flat_map(|c| c.entries.iter()) {
        let mut siblings: HashSet<&'static str> = HashSet::new();
        for v in entry.variants.iter() {
            if v.types.iter().any(|t| t.tag != Some(varcon::Tag::Improper)) {
                siblings.insert(v.word);
            }
        }

        // Link every word of the entry to all of the entry's other words.
        for &word in siblings.iter() {
            words
                .entry(word)
                .or_insert_with(HashSet::new)
                .extend(siblings.iter().filter(|other| **other != word));
        }
    }

    words
}
/// Picks the related word that is strictly closer (by edit distance) to `typo` than
/// `correction` is.
///
/// Ties on distance are broken by the words' ordering. Returns `None` when no related word
/// beats the current correction.
///
/// Panics if `related_words` contains `correction` itself.
fn find_best_match<'c>(
    typo: &'c str,
    correction: &'c str,
    related_words: &HashSet<&'static str>,
) -> Option<&'c str> {
    assert!(!related_words.contains(correction));

    let baseline = edit_distance::edit_distance(typo, correction);
    // The smallest `(distance, word)` pair is exactly the head of the sorted candidate list.
    let closest = related_words
        .iter()
        .map(|candidate| (edit_distance::edit_distance(typo, candidate), *candidate))
        .filter(|(distance, _)| *distance < baseline)
        .min();

    match closest {
        Some((_, word)) => Some(word),
        None => None,
    }
}
// Command-line arguments of the dictionary verifier.
//
// NOTE: plain `//` comments are used on purpose; structopt turns `///` doc comments into
// help text, which would change the CLI output.
#[derive(Debug, StructOpt)]
#[structopt(rename_all = "kebab-case")]
struct Options {
// Path of the dictionary CSV that `run` reads.
#[structopt(short("-i"), long, parse(from_os_str))]
input: std::path::PathBuf,
// Flattened codegenrs arguments; `run` hands the regenerated content to `write_str` on this.
#[structopt(flatten)]
codegen: codegenrs::CodeGenArgs,
}
/// Parses the command line, regenerates the dictionary from the input file and hands the
/// result to the codegen writer.
///
/// Returns the process exit code on success.
///
/// # Errors
///
/// Fails when the input file cannot be read, when the generated content is not valid UTF-8,
/// or when the codegen writer reports an error.
fn run() -> Result<i32, Box<dyn std::error::Error>> {
    let options = Options::from_args();

    // Report an unreadable input through the error channel, naming the offending path,
    // rather than panicking in the middle of a fallible function.
    let data = std::fs::read(&options.input)
        .map_err(|err| format!("failed to read {}: {}", options.input.display(), err))?;

    let mut content = vec![];
    generate(&mut content, &data);

    let content = String::from_utf8(content)?;
    options.codegen.write_str(&content)?;

    Ok(0)
}
/// Runs the verifier and exits with the code it returns; panics if `run` fails.
fn main() {
    std::process::exit(run().unwrap());
}

View file

@ -0,0 +1,21 @@
[package]
name = "typos-vars"
version = "0.2.1"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"
repository = "https://github.com/crate-ci/typos"
readme = "../../README.md"
categories = ["development-tools", "text-processing"]
keywords = ["development", "spelling"]
license = "MIT"
edition = "2018"
[badges]
azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" }
[dependencies]
phf = { version = "0.8", features = ["unicase"] }
unicase = "2.5"
log = "0.4"
varcon-core = { version = "1.0", path = "../varcon-core", features = ["flags"] }

View file

@ -0,0 +1,30 @@
[package]
name = "typos-vars-codegen"
version = "1.0.2"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"
repository = "https://github.com/crate-ci/typos"
readme = "../../../README.md"
categories = ["text-processing"]
license = "MIT"
edition = "2018"
publish = false
[badges]
azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" }
[dependencies]
phf = { version = "0.8", features = ["unicase"] }
phf_codegen = "0.8"
varcon = { version = "0.2", path = "../../varcon", features = ["flags"] }
varcon-core = { version = "1.0", path = "../../varcon-core", features = ["flags"] }
typos = { version = "^0.3", path = "../../typos" }
unicase = "2.5"
codegenrs = "0.1"
structopt = "0.3"
clap = "2"
log = "0.4"
env_logger = "0.7"
clap-verbosity-flag = "0.3"
itertools = "0.9"

View file

@ -0,0 +1,314 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::io::Write;
use structopt::StructOpt;
// Dialect categories that variant tables are generated for. The position of a category in
// this array is also its index into every generated `VariantsMap`.
static CATEGORIES: [varcon::Category; 4] = [
varcon::Category::American,
// For now, only want to support one form of British, so going with -ise as it seems more
// popular (which is why `BritishIze` is not listed).
varcon::Category::BritishIse,
varcon::Category::Canadian,
varcon::Category::Australian,
// `Other` is deliberately absent: it basically means "all", and `entry_set` expands it to
// every category listed here.
];
/// Writes the generated Rust source for the dialect-variant tables to `file`.
///
/// The output consists of, in order: a preamble, the `Variants` / `VariantsMap` type
/// aliases, `all_categories()`, `corrections()`, the `VARS_DICTIONARY` phf map, and one
/// static per entry that the dictionary references.
///
/// Panics if writing to `file` fails.
fn generate_variations<W: std::io::Write>(file: &mut W) {
    let entries = entries();

    // Preamble.
    writeln!(
        file,
        "// This file is code-genned by {}",
        env!("CARGO_PKG_NAME")
    )
    .unwrap();
    for line in &[
        "#![allow(clippy::unreadable_literal)]",
        "",
        "use unicase::UniCase;",
        "",
    ] {
        writeln!(file, "{}", line).unwrap();
    }

    // Type aliases; the map has one slot per supported category.
    writeln!(file, "pub type Variants = &'static [&'static str];").unwrap();
    writeln!(
        file,
        "pub type VariantsMap = [Variants; {}];",
        CATEGORIES.len()
    )
    .unwrap();
    writeln!(file).unwrap();

    // `all_categories()`: the union of every supported category.
    let category_union = itertools::join(
        CATEGORIES
            .iter()
            .map(|c| format!("crate::Category::{:?}", c)),
        " | ",
    );
    writeln!(file, "pub fn all_categories() -> crate::CategorySet {{").unwrap();
    writeln!(file, " {}", category_union).unwrap();
    writeln!(file, "}}").unwrap();
    writeln!(file).unwrap();

    // `corrections()`: selects a category's slot out of a `VariantsMap`.
    writeln!(
        file,
        "pub fn corrections(category: crate::Category, options: VariantsMap) -> &'static [&'static str] {{",
    )
    .unwrap();
    writeln!(file, " match category {{").unwrap();
    for (slot, category) in CATEGORIES.iter().enumerate() {
        writeln!(
            file,
            " crate::Category::{:?} => options[{}],",
            category, slot
        )
        .unwrap();
    }
    writeln!(
        file,
        " crate::Category::BritishIze | crate::Category::Other => unreachable!(\"{{:?}} is unused\", category),",
    )
    .unwrap();
    writeln!(file, " }}").unwrap();
    writeln!(file, "}}").unwrap();
    writeln!(file).unwrap();

    // The word -> entries dictionary.
    writeln!(
        file,
        "pub static VARS_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &'static [(u8, &VariantsMap)]> = "
    )
    .unwrap();
    let entry_sets = entry_sets(entries.iter());
    let mut referenced_symbols: HashSet<&str> = HashSet::new();
    let mut builder = phf_codegen::Map::new();
    for (word, data) in entry_sets.iter() {
        // Words that are valid in every category never need converting, so they are left out.
        if !is_always_valid(data) {
            referenced_symbols.extend(data.iter().map(|(s, _)| s));
            let link = generate_link(data);
            builder.entry(unicase::UniCase::new(word), &link);
        }
    }
    writeln!(file, "{}", builder.build()).unwrap();
    writeln!(file, ";").unwrap();

    // Only emit the entries that the dictionary actually points at.
    for (symbol, entry) in entries.iter() {
        if referenced_symbols.contains(symbol.as_str()) {
            generate_entry(file, symbol, entry);
        }
    }
}
/// Emits the `VariantsMap` static named `symbol` for `entry`.
///
/// The array holds one slice per element of `CATEGORIES`, in the same order, each listing
/// the sorted correct spellings for that category.
///
/// Panics if writing to `file` fails.
fn generate_entry(file: &mut impl std::io::Write, symbol: &str, entry: &varcon_core::Entry) {
    writeln!(file, "pub(crate) static {}: VariantsMap = [", symbol).unwrap();

    for category in CATEGORIES.iter() {
        // Sort so the generated output does not depend on hash-set iteration order.
        let mut spellings: Vec<&str> = collect_correct(entry, *category).into_iter().collect();
        spellings.sort_unstable();

        writeln!(file, " &[").unwrap();
        for spelling in spellings.iter() {
            writeln!(file, " {:?},", spelling).unwrap();
        }
        writeln!(file, " ],").unwrap();
    }

    writeln!(file, "];").unwrap();
    writeln!(file).unwrap();
}
fn generate_link(data: &[(&str, varcon::CategorySet)]) -> String {
let mut output = Vec::new();
write!(output, "&[").unwrap();
for (symbol, set) in data.iter() {
write!(output, "(0b{:05b}, &{}), ", set.bits(), symbol).unwrap();
}
write!(output, "]").unwrap();
String::from_utf8(output).unwrap()
}
/// Returns `true` when at least one of the word's entries marks it as valid in every
/// supported category.
fn is_always_valid(data: &[(&str, varcon::CategorySet)]) -> bool {
    let everything = valid_categories();
    data.iter().any(|(_symbol, set)| *set == everything)
}
/// Loads the varcon entries, keyed by their generated symbol name.
///
/// Entries containing a variant that `typos::tokens::Word::new` rejects are skipped. The
/// words of the remaining entries are ASCII-lowercased before the symbol is derived.
fn entries() -> BTreeMap<String, varcon_core::Entry> {
    let mut by_symbol = BTreeMap::new();

    for cluster in varcon::VARCON.iter() {
        for borrowed in cluster.entries.iter() {
            let tokenizable = borrowed
                .variants
                .iter()
                .all(|v| typos::tokens::Word::new(&v.word, 0).is_ok());
            if !tokenizable {
                continue;
            }

            let mut owned = borrowed.into_owned();
            for variant in owned.variants.iter_mut() {
                variant.word.make_ascii_lowercase();
            }
            by_symbol.insert(entry_symbol(&owned), owned);
        }
    }

    by_symbol
}
/// Builds the name of the static generated for `entry`: `ENTRY_<FIRST WORD>_<hash>`, where
/// the hash covers the whole entry.
///
/// Panics if the entry has no variants.
fn entry_symbol(entry: &varcon_core::Entry) -> String {
    use std::hash::{Hash, Hasher};

    let mut state = std::collections::hash_map::DefaultHasher::new();
    entry.hash(&mut state);
    let digest = state.finish();

    let leading_word = entry.variants[0].word.to_ascii_uppercase();
    format!("ENTRY_{}_{}", leading_word, digest)
}
/// Inverts the symbol -> entry mapping into word -> `[(symbol, categories)]`.
///
/// For every word, lists each entry (by symbol) it appears in together with the categories
/// that entry assigns to the word.
fn entry_sets<'e>(
    entries: impl Iterator<Item = (&'e String, &'e varcon_core::Entry)>,
) -> BTreeMap<&'e str, Vec<(&'e str, varcon::CategorySet)>> {
    let mut by_word = BTreeMap::new();
    for (symbol, entry) in entries {
        for (word, categories) in entry_set(entry) {
            by_word
                .entry(word)
                .or_insert_with(Vec::new)
                .push((symbol.as_str(), categories));
        }
    }
    by_word
}
/// Computes, for each word of `entry`, the set of supported categories it is listed under.
///
/// `Other` counts as every supported category and `BritishIze` is ignored. A word whose
/// types contribute nothing still gets an (empty) set.
fn entry_set(entry: &varcon_core::Entry) -> BTreeMap<&str, varcon::CategorySet> {
    let everything = valid_categories();
    let mut sets = BTreeMap::new();

    for variant in entry.variants.iter() {
        let set = sets
            .entry(variant.word.as_str())
            .or_insert_with(varcon::CategorySet::empty);

        for category in variant.types.iter().map(|t| t.category) {
            if category == varcon::Category::Other {
                *set |= everything;
            } else if category != varcon::Category::BritishIze {
                set.insert(category);
            }
        }
    }

    sets
}
/// Returns the set containing every category listed in `CATEGORIES`.
fn valid_categories() -> varcon::CategorySet {
    CATEGORIES
        .iter()
        .fold(varcon::CategorySet::empty(), |mut all, category| {
            all.insert(*category);
            all
        })
}
/// Collects the spellings of `entry` that are acceptable for `category`.
///
/// Only types belonging to `category` or to `Other` are considered, and variants rejected by
/// `ignore_variant` are skipped. If exactly one word carries the `Eq` tag (or no tag at
/// all), that word alone is returned; otherwise, because the choice is ambiguous, every
/// word not tagged `Improper` is returned.
fn collect_correct(entry: &varcon_core::Entry, category: varcon::Category) -> HashSet<&str> {
    let mut primary = HashSet::new();
    let mut backup = HashSet::new();

    for variant in entry.variants.iter() {
        if ignore_variant(variant) {
            continue;
        }
        let word = variant.word.as_str();

        for t in variant.types.iter() {
            if t.category != category && t.category != varcon::Category::Other {
                continue;
            }

            // An untagged type is treated like an exact (`Eq`) match.
            let tag = t.tag.unwrap_or(varcon::Tag::Eq);
            if tag == varcon::Tag::Improper {
                continue;
            }
            backup.insert(word);
            if tag == varcon::Tag::Eq {
                primary.insert(word);
            }
        }
    }

    match primary.len() {
        1 => primary,
        _ => backup,
    }
}
/// Special-cases one variant to leave out of the corrections: "anesthetisation" when its
/// only type is Australian and tagged `Variant` or `Seldom`.
fn ignore_variant(variant: &varcon_core::Variant) -> bool {
    if variant.word != "anesthetisation" || variant.types.len() != 1 {
        return false;
    }

    let only = &variant.types[0];
    only.category == varcon::Category::Australian
        && (only.tag == Some(varcon::Tag::Variant) || only.tag == Some(varcon::Tag::Seldom))
}
// dict needs
// all words, with bitflags, pointing to list of entry names
//
// varcon needs
// all entries by name
// Command-line arguments of the variants code generator.
//
// NOTE: plain `//` comments are used on purpose; structopt turns `///` doc comments into
// help text, which would change the CLI output.
#[derive(Debug, StructOpt)]
#[structopt(rename_all = "kebab-case")]
struct Options {
// Flattened codegenrs arguments; `run` hands the final content to `write_str` on this.
#[structopt(flatten)]
codegen: codegenrs::CodeGenArgs,
// Flattened rustfmt arguments; `run` calls `reformat` on this before writing.
// NOTE(review): the field name looks like a misspelling of "rustfmt"; it is referenced by
// `run`, so any rename has to touch both places.
#[structopt(flatten)]
rustmft: codegenrs::RustfmtArgs,
// Flattened verbosity flags; `run` derives the log level from this.
#[structopt(flatten)]
pub(crate) verbose: clap_verbosity_flag::Verbosity,
}
/// Installs an `env_logger` logger filtered at `level`; does nothing when `level` is `None`.
///
/// At trace level, timestamps with second precision are configured. At every other level,
/// records are printed as `[level] message` with the level name lowercased.
fn init_logging(level: Option<log::Level>) {
    let level = match level {
        Some(level) => level,
        None => return,
    };

    let mut builder = env_logger::Builder::new();
    builder.filter(None, level.to_level_filter());

    match level {
        log::Level::Trace => {
            builder.format_timestamp_secs();
        }
        _ => {
            builder.format(|f, record| {
                let label = record.level().to_string().to_lowercase();
                writeln!(f, "[{}] {}", label, record.args())
            });
        }
    }

    builder.init();
}
/// Parses the command line, sets up logging (defaulting to `Info`), generates the variant
/// tables, reformats them and hands the result to the codegen writer.
///
/// Returns the process exit code on success. Fails if the generated bytes are not valid
/// UTF-8 or if reformatting or writing reports an error.
fn run() -> Result<i32, Box<dyn std::error::Error>> {
    let mut options = Options::from_args();
    options.verbose.set_default(Some(log::Level::Info));
    init_logging(options.verbose.log_level());

    let mut raw = Vec::new();
    generate_variations(&mut raw);

    let generated = String::from_utf8(raw)?;
    let formatted = options.rustmft.reformat(&generated)?;
    options.codegen.write_str(&formatted)?;

    Ok(0)
}
/// Runs the generator and exits with the code it returns; panics if `run` fails.
fn main() {
    std::process::exit(run().unwrap());
}

View file

@ -0,0 +1,6 @@
mod vars_codegen;
pub use crate::vars_codegen::*;
pub use varcon_core::Category;
pub use varcon_core::CategorySet;

File diff suppressed because it is too large Load diff

View file

@ -14,15 +14,18 @@ Configuration is read from the following (in precedence order)
| Field | Argument | Format | Description |
|------------------------|-------------------|--------|-------------|
| files.binary | --binary | bool | |
| files.ignore-hidden | --hidden | bool | |
| files.ignore-files | --ignore | bool | |
| files.ignore-dot | --ignore-dot | bool | |
| files.ignore-vcs | --ignore-vcs | bool | |
| files.ignore-global | --ignore-global | bool | |
| files.ignore-parent | --ignore-parent | bool | |
| default.check-filename | \- | bool | |
| default.check-file | \- | bool | |
| default.ignore-hex | \- | bool | |
| default.identifier-include-digits | \- | bool | |
| default.identifier-include-chars | \- | string | |
| files.binary | --binary | bool | Check binary files as text. |
| files.ignore-hidden | --hidden | bool | Skip hidden files and directories. |
| files.ignore-files | --ignore | bool | Respect ignore files. |
| files.ignore-dot | --ignore-dot | bool | Respect .ignore files. |
| files.ignore-vcs | --ignore-vcs | bool | Respect ignore files in vcs directories. |
| files.ignore-global | --ignore-global | bool | Respect global ignore files. |
| files.ignore-parent | --ignore-parent | bool | Respect ignore files in parent directories. |
| default.check-filename | \- | bool | Verify spelling in file names. |
| default.check-file | \- | bool | Verify spelling in files. |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |

View file

@ -112,6 +112,12 @@ pub(crate) struct FileArgs {
no_hex: bool,
#[structopt(long, overrides_with("no-hex"), hidden(true))]
hex: bool,
#[structopt(
long,
possible_values(&config::Locale::variants()),
)]
pub(crate) locale: Option<config::Locale>,
}
impl config::FileSource for FileArgs {
@ -141,6 +147,10 @@ impl config::FileSource for FileArgs {
(_, _) => unreachable!("StructOpt should make this impossible"),
}
}
/// Returns the locale held in the `locale` argument field, or `None` when it is unset.
fn locale(&self) -> Option<config::Locale> {
self.locale
}
}
#[derive(Debug, StructOpt)]

View file

@ -82,6 +82,10 @@ pub trait FileSource {
/// Characters that identifiers may include, if this source specifies them.
///
/// The default implementation specifies nothing and returns `None`.
fn identifier_include_chars(&self) -> Option<&str> {
None
}
/// Locale requested by this source, if any.
///
/// The default implementation specifies nothing and returns `None`.
fn locale(&self) -> Option<Locale> {
None
}
}
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
@ -247,6 +251,7 @@ pub struct FileConfig {
pub identifier_leading_chars: Option<String>,
pub identifier_include_digits: Option<bool>,
pub identifier_include_chars: Option<String>,
pub locale: Option<Locale>,
}
impl FileConfig {
@ -272,6 +277,9 @@ impl FileConfig {
if let Some(source) = source.identifier_include_chars() {
self.identifier_include_chars = Some(source.to_owned());
}
if let Some(source) = source.locale() {
self.locale = Some(source);
}
}
pub fn check_filename(&self) -> bool {
@ -301,6 +309,10 @@ impl FileConfig {
/// Characters that identifiers may include, falling back to `"_'"` when none are configured.
pub fn identifier_include_chars(&self) -> &str {
    match &self.identifier_include_chars {
        Some(chars) => chars.as_str(),
        None => "_'",
    }
}
pub fn locale(&self) -> Locale {
self.locale.unwrap_or_default()
}
}
impl FileSource for FileConfig {
@ -331,6 +343,10 @@ impl FileSource for FileConfig {
/// Borrows the configured identifier characters, or `None` when the field is unset.
fn identifier_include_chars(&self) -> Option<&str> {
    self.identifier_include_chars.as_ref().map(String::as_str)
}
/// Returns the raw `locale` field, without applying any default.
fn locale(&self) -> Option<Locale> {
self.locale
}
}
fn find_project_file(dir: std::path::PathBuf, name: &str) -> Option<std::path::PathBuf> {
@ -346,3 +362,62 @@ fn find_project_file(dir: std::path::PathBuf, name: &str) -> Option<std::path::P
}
Some(file_path)
}
/// English locale a user can select.
///
/// Serialized in kebab-case; the `FromStr` and `Display` impls use the matching
/// names `en`, `en-us`, `en-gb`, `en-ca` and `en-au`.
#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Locale {
/// English with no specific dialect; `category()` returns `None`.
En,
/// Maps to `typos_vars::Category::American`.
EnUs,
/// Maps to `typos_vars::Category::BritishIse`.
EnGb,
/// Maps to `typos_vars::Category::Canadian`.
EnCa,
/// Maps to `typos_vars::Category::Australian`.
EnAu,
}
impl Locale {
    /// Dialect category this locale corresponds to.
    ///
    /// `Locale::En` has no category and yields `None`.
    pub fn category(self) -> Option<typos_vars::Category> {
        let category = match self {
            Locale::En => return None,
            Locale::EnUs => typos_vars::Category::American,
            Locale::EnGb => typos_vars::Category::BritishIse,
            Locale::EnCa => typos_vars::Category::Canadian,
            Locale::EnAu => typos_vars::Category::Australian,
        };
        Some(category)
    }

    /// Names of every locale, in the form accepted by `FromStr`.
    pub fn variants() -> [&'static str; 5] {
        const NAMES: [&str; 5] = ["en", "en-us", "en-gb", "en-ca", "en-au"];
        NAMES
    }
}
impl Default for Locale {
fn default() -> Self {
Locale::En
}
}
impl std::str::FromStr for Locale {
    type Err = String;

    /// Parses one of `en`, `en-us`, `en-gb`, `en-ca`, `en-au` (exact, case-sensitive match).
    ///
    /// Any other input yields an error message listing the valid values.
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        const KNOWN: [(&str, Locale); 5] = [
            ("en", Locale::En),
            ("en-us", Locale::EnUs),
            ("en-gb", Locale::EnGb),
            ("en-ca", Locale::EnCa),
            ("en-au", Locale::EnAu),
        ];
        KNOWN
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, locale)| locale)
            .ok_or_else(|| "valid values: en, en-us, en-gb, en-ca, en-au".to_owned())
    }
}
impl std::fmt::Display for Locale {
    /// Writes the locale's kebab-case name, the same text `FromStr` accepts.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let name = match self {
            Locale::En => "en",
            Locale::EnUs => "en-us",
            Locale::EnGb => "en-gb",
            Locale::EnCa => "en-ca",
            Locale::EnAu => "en-au",
        };
        f.write_str(name)
    }
}

View file

@ -5,11 +5,15 @@ use unicase::UniCase;
use typos::tokens::Case;
#[derive(Default)]
pub struct BuiltIn {}
/// Dictionary backed by the word and variant tables compiled into the binary.
pub struct BuiltIn {
// Dialect category to correct towards; `None` when the configured locale has no category.
locale: Option<typos_vars::Category>,
}
impl BuiltIn {
pub fn new() -> Self {
Self {}
/// Creates a dictionary that corrects towards the category of `locale`.
///
/// The locale is reduced to `locale.category()`, which is `None` for `Locale::En`.
pub fn new(locale: crate::config::Locale) -> Self {
Self {
locale: locale.category(),
}
}
pub fn correct_ident<'s, 'w>(
@ -19,12 +23,66 @@ impl BuiltIn {
Vec::new()
}
pub fn correct_word<'s, 'w>(&'s self, word: typos::tokens::Word<'w>) -> Vec<Cow<'s, str>> {
map_lookup(&typos_dict::WORD_DICTIONARY, word.token())
.map(|s| case_correct(s, word.case()))
/// Suggests corrections for a single word, cased like the original token.
///
/// Suggestions from the variants table win when it yields any; otherwise the plain dictionary
/// correction (if there is one) is the only suggestion. An empty list means no correction.
pub fn correct_word<'s, 'w>(
    &'s self,
    word_token: typos::tokens::Word<'w>,
) -> Vec<Cow<'s, str>> {
    let word = word_token.token();

    let dict_correction = self.correct_with_dict(word);
    let corrections: Vec<&'static str> = match self.correct_with_vars(word) {
        Some(from_vars) => from_vars,
        // No variant-based answer: fall back to the dictionary hit, or nothing at all.
        None => dict_correction.into_iter().collect(),
    };

    corrections
        .into_iter()
        .map(|s| case_correct(s, word_token.case()))
        .collect()
}
/// Looks `word` up in `typos_dict::WORD_DICTIONARY` and returns its entry, if any.
fn correct_with_dict(&self, word: &str) -> Option<&'static str> {
map_lookup(&typos_dict::WORD_DICTIONARY, word)
}
/// Looks `word` up in `typos_vars::VARS_DICTIONARY` and, when present, lets
/// `select_variant` decide which corrections (if any) apply.
fn correct_with_vars(&self, word: &str) -> Option<Vec<&'static str>> {
    map_lookup(&typos_vars::VARS_DICTIONARY, word)
        .and_then(|variants| self.select_variant(variants))
}
/// Decides which corrections apply for a variants-table entry.
///
/// Only the first element of `vars` is consulted. Returns `None` when the word needs no
/// variant-based correction, otherwise the list of suggested spellings.
fn select_variant(
    &self,
    vars: &'static [(u8, &'static typos_vars::VariantsMap)],
) -> Option<Vec<&'static str>> {
    let (category_bits, variants) = vars[0];
    let var_categories = unsafe {
        // SAFETY: Code-genned from a checked category-set, so known to be safe
        typos_vars::CategorySet::new(category_bits)
    };

    match self.locale {
        // Already valid for the current locale.
        Some(locale) if var_categories.contains(locale) => None,
        Some(locale) => Some(
            typos_vars::corrections(locale, *variants)
                .iter()
                .copied()
                .collect(),
        ),
        // All locales are valid, but the word is never valid.
        None if var_categories.is_empty() => {
            let mut unique: Vec<_> = variants.iter().flat_map(|v| v.iter()).copied().collect();
            unique.sort_unstable();
            unique.dedup();
            Some(unique)
        }
        // All locales are valid and the word belongs to at least one of them.
        None => None,
    }
}
}
impl typos::Dictionary for BuiltIn {
@ -37,10 +95,7 @@ impl typos::Dictionary for BuiltIn {
}
}
fn map_lookup(
map: &'static phf::Map<UniCase<&'static str>, &'static str>,
key: &str,
) -> Option<&'static str> {
fn map_lookup<V: Clone>(map: &'static phf::Map<UniCase<&'static str>, V>, key: &str) -> Option<V> {
// This transmute should be safe as `get` will not store the reference with
// the expanded lifetime. This is due to `Borrow` being overly strict and
// can't have an impl for `&'static str` to `Borrow<&'a str>`.

View file

@ -1 +1,2 @@
pub mod config;
pub mod dict;

View file

@ -56,7 +56,7 @@ fn run() -> Result<i32, anyhow::Error> {
.include_chars(config.default.identifier_include_chars().to_owned())
.build();
let dictionary = crate::dict::BuiltIn::new();
let dictionary = crate::dict::BuiltIn::new(config.default.locale());
let mut settings = typos::checks::TyposSettings::new();
settings