Merge pull request #235 from epage/parser

perf(parser): Overhaul how parsing is done
This commit is contained in:
Ed Page 2021-04-30 13:04:11 -05:00 committed by GitHub
commit e4f477799a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 520 additions and 327 deletions

221
Cargo.lock generated
View file

@ -46,9 +46,9 @@ dependencies = [
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.38" version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b"
[[package]] [[package]]
name = "arrayvec" name = "arrayvec"
@ -58,9 +58,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]] [[package]]
name = "assert_fs" name = "assert_fs"
version = "1.0.1" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3203d5bb9979ac7210f01a150578ebafef6f08b55e79f6db32673c0977b94340" checksum = "73c485ca248200dfb850a64468a926321865cae0c450eaa7cdbe9ccf4ec49028"
dependencies = [ dependencies = [
"doc-comment", "doc-comment",
"globwalk", "globwalk",
@ -89,11 +89,12 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]] [[package]]
name = "backtrace" name = "backtrace"
version = "0.3.56" version = "0.3.58"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc" checksum = "88fb5a785d6b44fd9d6700935608639af1b8356de1e55d5f7c2740f4faa15d82"
dependencies = [ dependencies = [
"addr2line", "addr2line",
"cc",
"cfg-if", "cfg-if",
"libc", "libc",
"miniz_oxide", "miniz_oxide",
@ -109,9 +110,9 @@ checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]] [[package]]
name = "bitvec" name = "bitvec"
version = "0.19.4" version = "0.19.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81" checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
dependencies = [ dependencies = [
"funty", "funty",
"radium", "radium",
@ -139,19 +140,25 @@ checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.4.2" version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]] [[package]]
name = "cast" name = "cast"
version = "0.2.3" version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0" checksum = "cc38c385bfd7e444464011bb24820f40dd1c76bcdfa1b78611cb7c2e5cafab75"
dependencies = [ dependencies = [
"rustc_version", "rustc_version",
] ]
[[package]]
name = "cc"
version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
@ -225,6 +232,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]] [[package]]
name = "criterion" name = "criterion"
version = "0.3.4" version = "0.3.4"
@ -263,9 +276,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-channel" name = "crossbeam-channel"
version = "0.5.0" version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
@ -284,9 +297,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-epoch" name = "crossbeam-epoch"
version = "0.9.3" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
@ -297,9 +310,9 @@ dependencies = [
[[package]] [[package]]
name = "crossbeam-utils" name = "crossbeam-utils"
version = "0.8.3" version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"cfg-if", "cfg-if",
@ -308,9 +321,9 @@ dependencies = [
[[package]] [[package]]
name = "csv" name = "csv"
version = "1.1.5" version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97" checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [ dependencies = [
"bstr", "bstr",
"csv-core", "csv-core",
@ -346,10 +359,10 @@ checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [ dependencies = [
"fnv", "fnv",
"ident_case", "ident_case",
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"strsim 0.9.3", "strsim 0.9.3",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -360,7 +373,7 @@ checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [ dependencies = [
"darling_core", "darling_core",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -379,13 +392,14 @@ dependencies = [
[[package]] [[package]]
name = "derive_more" name = "derive_more"
version = "0.99.11" version = "0.99.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c" checksum = "f82b1b72f1263f214c0f823371768776c4f5841b942c9883aa8e5ec584fd0ba6"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "convert_case",
"proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -395,9 +409,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b" checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b"
dependencies = [ dependencies = [
"darling", "darling",
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -509,9 +523,9 @@ version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce" checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -717,9 +731,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.48" version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc9f84f9b115ce7843d60706df1422a916680bfdfcbdb0447c5614ff9d7e4d78" checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
dependencies = [ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
@ -741,9 +755,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "lexical-core" name = "lexical-core"
version = "0.7.5" version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374" checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [ dependencies = [
"arrayvec", "arrayvec",
"bitflags", "bitflags",
@ -754,9 +768,9 @@ dependencies = [
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.86" version = "0.2.94"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c" checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]] [[package]]
name = "log" name = "log"
@ -781,9 +795,9 @@ checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
[[package]] [[package]]
name = "memoffset" name = "memoffset"
version = "0.6.1" version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d"
dependencies = [ dependencies = [
"autocfg", "autocfg",
] ]
@ -866,9 +880,9 @@ checksum = "a9a7ab5d64814df0fe4a4b5ead45ed6c5f181ee3ff04ba344313a6c80446c5d4"
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.7.0" version = "1.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10acf907b94fc1b1a152d08ef97e7759650268cf986bf127f387e602b02c7e5a" checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3"
[[package]] [[package]]
name = "oorandom" name = "oorandom"
@ -960,9 +974,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
[[package]] [[package]]
name = "predicates" name = "predicates"
version = "1.0.7" version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eeb433456c1a57cc93554dea3ce40b4c19c4057e41c55d4a0f3d84ea71c325aa" checksum = "f49cfaf7fdaa3bfacc6fa3e7054e65148878354a5cfddcf661df4c851f8021df"
dependencies = [ dependencies = [
"difference", "difference",
"float-cmp", "float-cmp",
@ -1000,9 +1014,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [ dependencies = [
"proc-macro-error-attr", "proc-macro-error-attr",
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
"version_check", "version_check",
] ]
@ -1012,7 +1026,7 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"version_check", "version_check",
] ]
@ -1028,11 +1042,11 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.24" version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
dependencies = [ dependencies = [
"unicode-xid 0.2.1", "unicode-xid 0.2.2",
] ]
[[package]] [[package]]
@ -1056,7 +1070,7 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
] ]
[[package]] [[package]]
@ -1183,23 +1197,22 @@ dependencies = [
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.2.5" version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
dependencies = [ dependencies = [
"bitflags", "bitflags",
] ]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.4.3" version = "1.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
"regex-syntax", "regex-syntax",
"thread_local",
] ]
[[package]] [[package]]
@ -1213,9 +1226,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-syntax" name = "regex-syntax"
version = "0.6.22" version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
[[package]] [[package]]
name = "remove_dir_all" name = "remove_dir_all"
@ -1279,9 +1292,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.123" version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
dependencies = [ dependencies = [
"serde_derive", "serde_derive",
] ]
@ -1298,13 +1311,13 @@ dependencies = [
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.123" version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -1319,10 +1332,16 @@ dependencies = [
] ]
[[package]] [[package]]
name = "siphasher" name = "simdutf8"
version = "0.3.3" version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1"
[[package]]
name = "siphasher"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27"
[[package]] [[package]]
name = "static_assertions" name = "static_assertions"
@ -1361,9 +1380,9 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90"
dependencies = [ dependencies = [
"heck", "heck",
"proc-macro-error", "proc-macro-error",
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -1379,13 +1398,13 @@ dependencies = [
[[package]] [[package]]
name = "syn" name = "syn"
version = "1.0.60" version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"unicode-xid 0.2.1", "unicode-xid 0.2.2",
] ]
[[package]] [[package]]
@ -1441,9 +1460,9 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
] ]
[[package]] [[package]]
@ -1457,9 +1476,9 @@ dependencies = [
[[package]] [[package]]
name = "tinytemplate" name = "tinytemplate"
version = "1.2.0" version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
@ -1493,11 +1512,13 @@ dependencies = [
"anyhow", "anyhow",
"itertools 0.10.0", "itertools 0.10.0",
"log", "log",
"nom",
"once_cell", "once_cell",
"regex",
"serde", "serde",
"simdutf8",
"thiserror", "thiserror",
"unicode-segmentation", "unicode-segmentation",
"unicode-xid 0.2.2",
] ]
[[package]] [[package]]
@ -1512,7 +1533,7 @@ dependencies = [
"clap-verbosity-flag", "clap-verbosity-flag",
"content_inspector", "content_inspector",
"criterion", "criterion",
"derive_more 0.99.11", "derive_more 0.99.13",
"derive_setters", "derive_setters",
"difflib", "difflib",
"encoding", "encoding",
@ -1631,9 +1652,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.1" version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]] [[package]]
name = "uuid" name = "uuid"
@ -1676,15 +1697,15 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]] [[package]]
name = "version_check" name = "version_check"
version = "0.9.2" version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
[[package]] [[package]]
name = "walkdir" name = "walkdir"
version = "2.3.1" version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
dependencies = [ dependencies = [
"same-file", "same-file",
"winapi", "winapi",
@ -1705,9 +1726,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]] [[package]]
name = "wasm-bindgen" name = "wasm-bindgen"
version = "0.2.71" version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ee1280240b7c461d6a0071313e08f34a60b0365f14260362e5a2b17d1d31aa7" checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"wasm-bindgen-macro", "wasm-bindgen-macro",
@ -1715,24 +1736,24 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-backend" name = "wasm-bindgen-backend"
version = "0.2.71" version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b7d8b6942b8bb3a9b0e73fc79b98095a27de6fa247615e59d096754a3bc2aa8" checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"lazy_static", "lazy_static",
"log", "log",
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
[[package]] [[package]]
name = "wasm-bindgen-macro" name = "wasm-bindgen-macro"
version = "0.2.71" version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ac38da8ef716661f0f36c0d8320b89028efe10c7c0afde65baffb496ce0d3b" checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
dependencies = [ dependencies = [
"quote 1.0.9", "quote 1.0.9",
"wasm-bindgen-macro-support", "wasm-bindgen-macro-support",
@ -1740,28 +1761,28 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro-support" name = "wasm-bindgen-macro-support"
version = "0.2.71" version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc053ec74d454df287b9374ee8abb36ffd5acb95ba87da3ba5b7d3fe20eb401e" checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
dependencies = [ dependencies = [
"proc-macro2 1.0.24", "proc-macro2 1.0.26",
"quote 1.0.9", "quote 1.0.9",
"syn 1.0.60", "syn 1.0.71",
"wasm-bindgen-backend", "wasm-bindgen-backend",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
[[package]] [[package]]
name = "wasm-bindgen-shared" name = "wasm-bindgen-shared"
version = "0.2.71" version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d6f8ec44822dd71f5f221a5847fb34acd9060535c1211b70a05844c0f6383b1" checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.48" version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec600b26223b2948cedfde2a0aa6756dcf1fef616f43d7b3097aaf53a6c4d92b" checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
dependencies = [ dependencies = [
"js-sys", "js-sys",
"wasm-bindgen", "wasm-bindgen",

View file

@ -1,39 +1,93 @@
mod data; mod data;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
fn bench_tokenize(c: &mut Criterion) { fn bench_parse_str(c: &mut Criterion) {
let mut group = c.benchmark_group("tokenize"); let mut group = c.benchmark_group("parse_str");
for (name, sample) in data::DATA { for (name, sample) in data::DATA {
let len = sample.len(); let len = sample.len();
group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| { group.throughput(Throughput::Bytes(len as u64));
let parser = typos::tokens::Tokenizer::new(); group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
b.iter(|| parser.parse_bytes(sample.as_bytes()).last()); let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
});
group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| {
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_str(sample).last()); b.iter(|| parser.parse_str(sample).last());
}); });
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| parser.parse_str(sample).last());
});
}
group.finish();
}
fn bench_parse_bytes(c: &mut Criterion) {
let mut group = c.benchmark_group("parse_bytes");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
});
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
});
}
group.finish();
}
fn bench_split(c: &mut Criterion) {
let mut group = c.benchmark_group("split");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| { group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
let symbol = typos::tokens::Identifier::new_unchecked(sample, 0); let symbol =
typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
b.iter(|| symbol.split().last()); b.iter(|| symbol.split().last());
}); });
group.bench_with_input( }
BenchmarkId::new("ident(bytes)+words", name), group.finish();
&len, }
|b, _| {
let parser = typos::tokens::Tokenizer::new(); fn bench_parse_split(c: &mut Criterion) {
let mut group = c.benchmark_group("parse_bytes+split");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
b.iter(|| { b.iter(|| {
parser parser
.parse_bytes(sample.as_bytes()) .parse_bytes(sample.as_bytes())
.flat_map(|i| i.split()) .flat_map(|i| i.split())
.last() .last()
}); });
}, });
); group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| {
parser
.parse_bytes(sample.as_bytes())
.flat_map(|i| i.split())
.last()
});
});
} }
group.finish(); group.finish();
} }
criterion_group!(benches, bench_tokenize); criterion_group!(
benches,
bench_parse_str,
bench_parse_bytes,
bench_split,
bench_parse_split
);
criterion_main!(benches); criterion_main!(benches);

View file

@ -17,9 +17,11 @@ codecov = { repository = "crate-ci/typos" }
[dependencies] [dependencies]
anyhow = "1.0" anyhow = "1.0"
thiserror = "1.0" thiserror = "1.0"
regex = "1.3" nom = "6.0"
unicode-xid = "0.2.2"
once_cell = "1.2.0" once_cell = "1.2.0"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
simdutf8 = "0.1.1"
itertools = "0.10" itertools = "0.10"
log = "0.4" log = "0.4"
unicode-segmentation = "1.7.1" unicode-segmentation = "1.7.1"

View file

@ -1,11 +1,9 @@
/// Define rules for tokenizaing a buffer. /// Define rules for tokenizaing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder { pub struct TokenizerBuilder {
unicode: bool,
ignore_hex: bool, ignore_hex: bool,
leading_digits: bool, leading_digits: bool,
leading_chars: String,
include_digits: bool,
include_chars: String,
} }
impl TokenizerBuilder { impl TokenizerBuilder {
@ -13,6 +11,12 @@ impl TokenizerBuilder {
Default::default() Default::default()
} }
/// Specify that unicode Identifiers are allowed.
pub fn unicode(&mut self, yes: bool) -> &mut Self {
self.unicode = yes;
self
}
/// Specify that hexadecimal numbers should be ignored. /// Specify that hexadecimal numbers should be ignored.
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self { pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
self.ignore_hex = yes; self.ignore_hex = yes;
@ -25,64 +29,26 @@ impl TokenizerBuilder {
self self
} }
/// Extend accepted leading characters for Identifiers.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
/// Specify that digits can be included in Identifiers.
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
}
/// Extend accepted characters for Identifiers.
pub fn include_chars(&mut self, chars: String) -> &mut Self {
self.include_chars = chars;
self
}
pub fn build(&self) -> Tokenizer { pub fn build(&self) -> Tokenizer {
let mut pattern = r#"\b("#.to_owned(); let TokenizerBuilder {
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars); unicode,
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars); leading_digits,
pattern.push_str(r#"*)\b"#); ignore_hex,
} = self.clone();
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
Tokenizer { Tokenizer {
words_str, unicode,
words_bytes, leading_digits,
// `leading_digits` let's us bypass the regexes since you can't have a decimal or ignore_hex,
// hexadecimal number without a leading digit.
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
} }
} }
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
pattern.push_str(r#"(\p{Alphabetic}"#);
if digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push(')');
}
} }
impl Default for TokenizerBuilder { impl Default for TokenizerBuilder {
fn default() -> Self { fn default() -> Self {
Self { Self {
ignore_hex: true, unicode: true,
leading_digits: false, leading_digits: false,
leading_chars: "_".to_owned(), ignore_hex: true,
include_digits: true,
include_chars: "_'".to_owned(),
} }
} }
} }
@ -90,9 +56,8 @@ impl Default for TokenizerBuilder {
/// Extract Identifiers from a buffer. /// Extract Identifiers from a buffer.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Tokenizer { pub struct Tokenizer {
words_str: regex::Regex, unicode: bool,
words_bytes: regex::bytes::Regex, leading_digits: bool,
ignore_numbers: bool,
ignore_hex: bool, ignore_hex: bool,
} }
@ -102,32 +67,46 @@ impl Tokenizer {
} }
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
self.words_str let iter = if self.unicode {
.find_iter(content) itertools::Either::Left(unicode_parser::iter_literals(content))
.filter(move |m| self.accept(m.as_str().as_bytes())) } else {
.map(|m| Identifier::new_unchecked(m.as_str(), m.start())) itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
} };
iter.filter_map(move |identifier| {
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> { let offset = offset(content.as_bytes(), identifier.as_bytes());
self.words_bytes self.transform(identifier, offset)
.find_iter(content)
.filter(move |m| self.accept(m.as_bytes()))
.filter_map(|m| {
let s = std::str::from_utf8(m.as_bytes()).ok();
s.map(|s| Identifier::new_unchecked(s, m.start()))
}) })
} }
fn accept(&self, contents: &[u8]) -> bool { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
if self.ignore_numbers && is_number(contents) { let iter = if self.unicode {
return false; let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
itertools::Either::Left(iter)
} else {
itertools::Either::Right(ascii_parser::iter_literals(content))
};
iter.filter_map(move |identifier| {
let offset = offset(content, identifier.as_bytes());
self.transform(identifier, offset)
})
} }
if self.ignore_hex && is_hex(contents) { fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
return false; debug_assert!(!identifier.is_empty());
if self.leading_digits {
if is_number(identifier.as_bytes()) {
return None;
} }
true if self.ignore_hex && is_hex(identifier.as_bytes()) {
return None;
}
} else if is_digit(identifier.as_bytes()[0]) {
return None;
}
let case = Case::None;
Some(Identifier::new_unchecked(identifier, case, offset))
} }
} }
@ -137,34 +116,176 @@ impl Default for Tokenizer {
} }
} }
// `_`: number literal separator in Rust and other languages fn offset(base: &[u8], needle: &[u8]) -> usize {
// `'`: number literal separator in C++ let base = base.as_ptr() as usize;
static DIGITS: once_cell::sync::Lazy<regex::bytes::Regex> = let needle = needle.as_ptr() as usize;
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^[0-9_']+$"#).unwrap()); debug_assert!(base <= needle);
needle - base
fn is_number(ident: &[u8]) -> bool {
DIGITS.is_match(ident)
} }
// `_`: number literal separator in Rust and other languages struct Utf8Chunks<'s> {
// `'`: number literal separator in C++ source: &'s [u8],
static HEX: once_cell::sync::Lazy<regex::bytes::Regex> = }
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap());
impl<'s> Utf8Chunks<'s> {
fn new(source: &'s [u8]) -> Self {
Self { source }
}
}
impl<'s> Iterator for Utf8Chunks<'s> {
type Item = &'s str;
fn next(&mut self) -> Option<&'s str> {
if self.source.is_empty() {
return None;
}
match simdutf8::compat::from_utf8(self.source) {
Ok(valid) => {
self.source = b"";
Some(valid)
}
Err(error) => {
let (valid, after_valid) = self.source.split_at(error.valid_up_to());
if let Some(invalid_sequence_length) = error.error_len() {
self.source = &after_valid[invalid_sequence_length..];
} else {
self.source = b"";
}
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
Some(valid)
}
}
}
}
/// Returns `true` when every byte of `ident` is an ASCII digit or a digit separator.
///
/// An empty slice also yields `true`, since there is no byte to reject.
fn is_number(ident: &[u8]) -> bool {
    ident
        .iter()
        .copied()
        .all(|byte| is_digit(byte) || is_digit_sep(byte))
}
fn is_hex(ident: &[u8]) -> bool { fn is_hex(ident: &[u8]) -> bool {
HEX.is_match(ident) if ident.len() < 3 {
false
} else {
ident[0] == b'0'
&& ident[1] == b'x'
&& ident[2..]
.iter()
.all(|b| is_hex_digit(*b) || is_digit_sep(*b))
}
}
/// Reports whether `chr` is an ASCII decimal digit (`0` through `9`).
#[inline]
fn is_digit(chr: u8) -> bool {
    matches!(chr, b'0'..=b'9')
}
/// Reports whether `chr` is a separator that may appear inside a number literal.
///
/// - `_`: number literal separator in Rust and other languages
/// - `'`: number literal separator in C++
#[inline]
fn is_digit_sep(chr: u8) -> bool {
    matches!(chr, b'_' | b'\'')
}
/// Reports whether `chr` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f` or `A`-`F`).
#[inline]
fn is_hex_digit(chr: u8) -> bool {
    matches!(chr, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
}
mod unicode_parser {
    use nom::bytes::complete::*;
    use nom::sequence::*;
    use nom::IResult;

    /// Lazily yields each identifier-like run found in `input`, in order of appearance.
    ///
    /// Iteration stops the first time `next_literal` reports an error.
    pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
        std::iter::from_fn(move || {
            let (rest, literal) = next_literal(input).ok()?;
            input = rest;
            debug_assert_ne!(literal, "");
            Some(literal)
        })
    }

    /// Skips any leading separator characters, then parses the identifier that follows.
    fn next_literal(input: &str) -> IResult<&str, &str> {
        preceded(literal_sep, identifier)(input)
    }

    /// Consumes characters up to the first one that can be part of an identifier.
    fn literal_sep(input: &str) -> IResult<&str, &str> {
        take_till(is_continue)(input)
    }

    /// Consumes one or more consecutive identifier characters.
    fn identifier(input: &str) -> IResult<&str, &str> {
        // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
        // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
        // or unexpected cases than strip off start characters to a word since we aren't doing a
        // proper word boundary parse
        take_while1(is_continue)(input)
    }

    /// Reports whether `c` has the Unicode `XID_Continue` property.
    fn is_continue(c: char) -> bool {
        unicode_xid::UnicodeXID::is_xid_continue(c)
    }
}
mod ascii_parser {
use nom::bytes::complete::*;
use nom::sequence::*;
use nom::IResult;
/// Lazily yields each identifier-like run of ASCII bytes found in `input`, as `&str`.
///
/// Iteration stops the first time `next_literal` reports an error.
pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
    std::iter::from_fn(move || {
        let (rest, literal) = next_literal(input).ok()?;
        input = rest;
        debug_assert_ne!(literal, b"");
        // SAFETY: `identifier` only accepts bytes for which `is_continue` holds, and those
        // are all ASCII (letters, digits and `_`), so `literal` is valid UTF-8.
        Some(unsafe { std::str::from_utf8_unchecked(literal) })
    })
}
/// Skips any leading separator bytes, then parses the identifier that follows them.
///
/// On success the result holds the unparsed remainder and the identifier bytes.
fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> {
preceded(literal_sep, identifier)(input)
}
/// Consumes bytes up to the first one accepted by `is_continue`, i.e. everything that
/// cannot be part of an identifier.
fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> {
take_till(is_continue)(input)
}
/// Consumes one or more consecutive bytes accepted by `is_continue`.
fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse
//
// Here the `is_continue` predicate is the ASCII-only stand-in for `XID_Continue`.
take_while1(is_continue)(input)
}
/// Reports whether `c` may appear in an ASCII identifier: a letter, a digit or `_`.
fn is_continue(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_'
}
} }
/// A term composed of Words. /// A term composed of Words.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Identifier<'t> { pub struct Identifier<'t> {
token: &'t str, token: &'t str,
case: Case,
offset: usize, offset: usize,
} }
impl<'t> Identifier<'t> { impl<'t> Identifier<'t> {
pub fn new_unchecked(token: &'t str, offset: usize) -> Self { pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
Self { token, offset } Self {
token,
case,
offset,
}
} }
pub fn token(&self) -> &'t str { pub fn token(&self) -> &'t str {
@ -172,7 +293,7 @@ impl<'t> Identifier<'t> {
} }
pub fn case(&self) -> Case { pub fn case(&self) -> Case {
Case::None self.case
} }
pub fn offset(&self) -> usize { pub fn offset(&self) -> usize {
@ -181,7 +302,12 @@ impl<'t> Identifier<'t> {
/// Split into individual Words. /// Split into individual Words.
pub fn split(&self) -> impl Iterator<Item = Word<'t>> { pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
SplitIdent::new(self.token, self.offset) match self.case {
Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)),
_ => itertools::Either::Right(
Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(),
),
}
} }
} }
@ -269,7 +395,7 @@ impl<'s> Iterator for SplitIdent<'s> {
while let Some((i, c)) = self.char_indices.next() { while let Some((i, c)) = self.char_indices.next() {
let cur_mode = WordMode::classify(c); let cur_mode = WordMode::classify(c);
if cur_mode == WordMode::Boundary { if cur_mode == WordMode::Boundary {
assert!(self.start_mode == WordMode::Boundary); debug_assert!(self.start_mode == WordMode::Boundary);
continue; continue;
} }
if self.start_mode == WordMode::Boundary { if self.start_mode == WordMode::Boundary {
@ -409,7 +535,7 @@ mod test {
let parser = Tokenizer::new(); let parser = Tokenizer::new();
let input = "word"; let input = "word";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)]; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", Case::None, 0)];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect(); let actual: Vec<_> = parser.parse_str(input).collect();
@ -422,8 +548,8 @@ mod test {
let input = "A B"; let input = "A B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", 2), Identifier::new_unchecked("B", Case::None, 2),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -437,8 +563,8 @@ mod test {
let input = "A.B"; let input = "A.B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", 2), Identifier::new_unchecked("B", Case::None, 2),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -452,8 +578,8 @@ mod test {
let input = "A::B"; let input = "A::B";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0), Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", 3), Identifier::new_unchecked("B", Case::None, 3),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -466,7 +592,7 @@ mod test {
let parser = Tokenizer::new(); let parser = Tokenizer::new();
let input = "A_B"; let input = "A_B";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)]; let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", Case::None, 0)];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect(); let actual: Vec<_> = parser.parse_str(input).collect();
@ -475,12 +601,15 @@ mod test {
#[test] #[test]
fn tokenize_ignore_hex_enabled() { fn tokenize_ignore_hex_enabled() {
let parser = TokenizerBuilder::new().ignore_hex(true).build(); let parser = TokenizerBuilder::new()
.ignore_hex(true)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", 0), Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", 17), Identifier::new_unchecked("World", Case::None, 17),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -497,9 +626,47 @@ mod test {
let input = "Hello 0xDEADBEEF World"; let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![ let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", 0), Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0xDEADBEEF", 6), Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
Identifier::new_unchecked("World", 17), Identifier::new_unchecked("World", Case::None, 17),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits_enabled() {
    // With leading digits allowed and hex-ignoring turned off, `0Hello` and `0xDEADBEEF`
    // are expected in the output while the purely numeric `124` is not.
    let input = "Hello 0Hello 124 0xDEADBEEF World";
    let expected: Vec<Identifier> = [
        ("Hello", 0),
        ("0Hello", 6),
        ("0xDEADBEEF", 17),
        ("World", 28),
    ]
    .iter()
    .map(|&(token, offset)| Identifier::new_unchecked(token, Case::None, offset))
    .collect();

    let parser = TokenizerBuilder::new()
        .ignore_hex(false)
        .leading_digits(true)
        .build();

    // Both entry points must agree on the result.
    let from_bytes: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
    assert_eq!(expected, from_bytes);
    let from_str: Vec<_> = parser.parse_str(input).collect();
    assert_eq!(expected, from_str);
}
#[test]
fn tokenize_leading_digits_disabled() {
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(false)
.build();
let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 28),
]; ];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual); assert_eq!(expected, actual);
@ -564,7 +731,7 @@ mod test {
), ),
]; ];
for (input, expected) in cases.iter() { for (input, expected) in cases.iter() {
let ident = Identifier::new_unchecked(input, 0); let ident = Identifier::new_unchecked(input, Case::None, 0);
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect(); let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
assert_eq!(&result, expected); assert_eq!(&result, expected);
} }

View file

@ -23,11 +23,9 @@ Configuration is read from the following (in precedence order)
| default.binary | --binary | bool | Check binary files as text | | default.binary | --binary | bool | Check binary files as text |
| default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-filename | \- | bool | Verifying spelling in file names. |
| default.check-file | \- | bool | Verifying spelling in files. | | default.check-file | \- | bool | Verifying spelling in files. |
| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. | | default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. | | default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |

View file

@ -123,6 +123,12 @@ pub(crate) struct FileArgs {
#[structopt(long, overrides_with("no-check-files"), hidden(true))] #[structopt(long, overrides_with("no-check-files"), hidden(true))]
check_files: bool, check_files: bool,
#[structopt(long, overrides_with("no-unicode"), hidden(true))]
unicode: bool,
#[structopt(long, overrides_with("unicode"))]
/// Only allow ASCII characters in identifiers
no_unicode: bool,
#[structopt( #[structopt(
long, long,
possible_values(&config::Locale::variants()), possible_values(&config::Locale::variants()),
@ -136,7 +142,10 @@ impl FileArgs {
binary: self.binary(), binary: self.binary(),
check_filename: self.check_filename(), check_filename: self.check_filename(),
check_file: self.check_file(), check_file: self.check_file(),
tokenizer: None, tokenizer: Some(config::TokenizerConfig {
unicode: self.unicode(),
..Default::default()
}),
dict: Some(config::DictConfig { dict: Some(config::DictConfig {
locale: self.locale, locale: self.locale,
..Default::default() ..Default::default()
@ -145,30 +154,19 @@ impl FileArgs {
} }
fn binary(&self) -> Option<bool> { fn binary(&self) -> Option<bool> {
match (self.binary, self.no_binary) { resolve_bool_arg(self.binary, self.no_binary)
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn check_filename(&self) -> Option<bool> { fn check_filename(&self) -> Option<bool> {
match (self.check_filenames, self.no_check_filenames) { resolve_bool_arg(self.check_filenames, self.no_check_filenames)
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
} }
fn unicode(&self) -> Option<bool> {
resolve_bool_arg(self.unicode, self.no_unicode)
} }
fn check_file(&self) -> Option<bool> { fn check_file(&self) -> Option<bool> {
match (self.check_files, self.no_check_files) { resolve_bool_arg(self.check_files, self.no_check_files)
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
} }
@ -244,56 +242,35 @@ impl WalkArgs {
} }
fn ignore_hidden(&self) -> Option<bool> { fn ignore_hidden(&self) -> Option<bool> {
match (self.hidden, self.no_hidden) { resolve_bool_arg(self.no_hidden, self.hidden)
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn ignore_files(&self) -> Option<bool> { fn ignore_files(&self) -> Option<bool> {
match (self.no_ignore, self.ignore) { resolve_bool_arg(self.ignore, self.no_ignore)
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn ignore_dot(&self) -> Option<bool> { fn ignore_dot(&self) -> Option<bool> {
match (self.no_ignore_dot, self.ignore_dot) { resolve_bool_arg(self.ignore_dot, self.no_ignore_dot)
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn ignore_vcs(&self) -> Option<bool> { fn ignore_vcs(&self) -> Option<bool> {
match (self.no_ignore_vcs, self.ignore_vcs) { resolve_bool_arg(self.ignore_vcs, self.no_ignore_vcs)
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn ignore_global(&self) -> Option<bool> { fn ignore_global(&self) -> Option<bool> {
match (self.no_ignore_global, self.ignore_global) { resolve_bool_arg(self.ignore_global, self.no_ignore_global)
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
} }
fn ignore_parent(&self) -> Option<bool> { fn ignore_parent(&self) -> Option<bool> {
match (self.no_ignore_parent, self.ignore_parent) { resolve_bool_arg(self.ignore_parent, self.no_ignore_parent)
(true, false) => Some(false), }
(false, true) => Some(true), }
fn resolve_bool_arg(yes: bool, no: bool) -> Option<bool> {
match (yes, no) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None, (false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"), (_, _) => unreachable!("StructOpt should make this impossible"),
} }
}
} }

View file

@ -230,50 +230,38 @@ impl EngineConfig {
#[serde(deny_unknown_fields, default)] #[serde(deny_unknown_fields, default)]
#[serde(rename_all = "kebab-case")] #[serde(rename_all = "kebab-case")]
pub struct TokenizerConfig { pub struct TokenizerConfig {
/// Allow unicode characters in identifiers (and not just ASCII)
pub unicode: Option<bool>,
/// Do not check identifiers that appear to be hexadecimal values. /// Do not check identifiers that appear to be hexadecimal values.
pub ignore_hex: Option<bool>, pub ignore_hex: Option<bool>,
/// Allow identifiers to start with digits, in addition to letters. /// Allow identifiers to start with digits, in addition to letters.
pub identifier_leading_digits: Option<bool>, pub identifier_leading_digits: Option<bool>,
/// Allow identifiers to start with one of these characters.
pub identifier_leading_chars: Option<kstring::KString>,
/// Allow identifiers to include digits, in addition to letters.
pub identifier_include_digits: Option<bool>,
/// Allow identifiers to include these characters.
pub identifier_include_chars: Option<kstring::KString>,
} }
impl TokenizerConfig { impl TokenizerConfig {
pub fn from_defaults() -> Self { pub fn from_defaults() -> Self {
let empty = Self::default(); let empty = Self::default();
Self { Self {
unicode: Some(empty.unicode()),
ignore_hex: Some(empty.ignore_hex()), ignore_hex: Some(empty.ignore_hex()),
identifier_leading_digits: Some(empty.identifier_leading_digits()), identifier_leading_digits: Some(empty.identifier_leading_digits()),
identifier_leading_chars: Some(kstring::KString::from_ref(
empty.identifier_leading_chars(),
)),
identifier_include_digits: Some(empty.identifier_include_digits()),
identifier_include_chars: Some(kstring::KString::from_ref(
empty.identifier_include_chars(),
)),
} }
} }
pub fn update(&mut self, source: &TokenizerConfig) { pub fn update(&mut self, source: &TokenizerConfig) {
if let Some(source) = source.unicode {
self.unicode = Some(source);
}
if let Some(source) = source.ignore_hex { if let Some(source) = source.ignore_hex {
self.ignore_hex = Some(source); self.ignore_hex = Some(source);
} }
if let Some(source) = source.identifier_leading_digits { if let Some(source) = source.identifier_leading_digits {
self.identifier_leading_digits = Some(source); self.identifier_leading_digits = Some(source);
} }
if let Some(source) = source.identifier_leading_chars.as_ref() {
self.identifier_leading_chars = Some(source.clone());
}
if let Some(source) = source.identifier_include_digits {
self.identifier_include_digits = Some(source);
}
if let Some(source) = source.identifier_include_chars.as_ref() {
self.identifier_include_chars = Some(source.clone());
} }
pub fn unicode(&self) -> bool {
self.unicode.unwrap_or(true)
} }
pub fn ignore_hex(&self) -> bool { pub fn ignore_hex(&self) -> bool {
@ -283,18 +271,6 @@ impl TokenizerConfig {
pub fn identifier_leading_digits(&self) -> bool { pub fn identifier_leading_digits(&self) -> bool {
self.identifier_leading_digits.unwrap_or(false) self.identifier_leading_digits.unwrap_or(false)
} }
pub fn identifier_leading_chars(&self) -> &str {
self.identifier_leading_chars.as_deref().unwrap_or("_")
}
pub fn identifier_include_digits(&self) -> bool {
self.identifier_include_digits.unwrap_or(true)
}
pub fn identifier_include_chars(&self) -> &str {
self.identifier_include_chars.as_deref().unwrap_or("_'")
}
} }
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]

View file

@ -217,11 +217,9 @@ impl<'s> ConfigEngine<'s> {
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
let tokenizer = typos::tokens::TokenizerBuilder::new() let tokenizer = typos::tokens::TokenizerBuilder::new()
.unicode(tokenizer_config.unicode())
.ignore_hex(tokenizer_config.ignore_hex()) .ignore_hex(tokenizer_config.ignore_hex())
.leading_digits(tokenizer_config.identifier_leading_digits()) .leading_digits(tokenizer_config.identifier_leading_digits())
.leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
.include_digits(tokenizer_config.identifier_include_digits())
.include_chars(tokenizer_config.identifier_include_chars().to_owned())
.build(); .build();
let dict = crate::dict::BuiltIn::new(dict_config.locale()); let dict = crate::dict::BuiltIn::new(dict_config.locale());