mirror of
https://github.com/crate-ci/typos.git
synced 2024-11-26 02:51:08 -05:00
Merge pull request #235 from epage/parser
perf(parser): Overhaul how parsing is done
This commit is contained in:
commit
e4f477799a
8 changed files with 520 additions and 327 deletions
221
Cargo.lock
generated
221
Cargo.lock
generated
|
@ -46,9 +46,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.38"
|
version = "1.0.40"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1"
|
checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arrayvec"
|
name = "arrayvec"
|
||||||
|
@ -58,9 +58,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "assert_fs"
|
name = "assert_fs"
|
||||||
version = "1.0.1"
|
version = "1.0.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3203d5bb9979ac7210f01a150578ebafef6f08b55e79f6db32673c0977b94340"
|
checksum = "73c485ca248200dfb850a64468a926321865cae0c450eaa7cdbe9ccf4ec49028"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"doc-comment",
|
"doc-comment",
|
||||||
"globwalk",
|
"globwalk",
|
||||||
|
@ -89,11 +89,12 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "backtrace"
|
name = "backtrace"
|
||||||
version = "0.3.56"
|
version = "0.3.58"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc"
|
checksum = "88fb5a785d6b44fd9d6700935608639af1b8356de1e55d5f7c2740f4faa15d82"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"addr2line",
|
"addr2line",
|
||||||
|
"cc",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"libc",
|
"libc",
|
||||||
"miniz_oxide",
|
"miniz_oxide",
|
||||||
|
@ -109,9 +110,9 @@ checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitvec"
|
name = "bitvec"
|
||||||
version = "0.19.4"
|
version = "0.19.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81"
|
checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"funty",
|
"funty",
|
||||||
"radium",
|
"radium",
|
||||||
|
@ -139,19 +140,25 @@ checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.4.2"
|
version = "1.4.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b"
|
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cast"
|
name = "cast"
|
||||||
version = "0.2.3"
|
version = "0.2.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
|
checksum = "cc38c385bfd7e444464011bb24820f40dd1c76bcdfa1b78611cb7c2e5cafab75"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"rustc_version",
|
"rustc_version",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cc"
|
||||||
|
version = "1.0.67"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
@ -225,6 +232,12 @@ dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "convert_case"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "criterion"
|
name = "criterion"
|
||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
|
@ -263,9 +276,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-channel"
|
name = "crossbeam-channel"
|
||||||
version = "0.5.0"
|
version = "0.5.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
|
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
|
@ -284,9 +297,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-epoch"
|
name = "crossbeam-epoch"
|
||||||
version = "0.9.3"
|
version = "0.9.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12"
|
checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
|
@ -297,9 +310,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-utils"
|
name = "crossbeam-utils"
|
||||||
version = "0.8.3"
|
version = "0.8.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49"
|
checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
|
@ -308,9 +321,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "csv"
|
name = "csv"
|
||||||
version = "1.1.5"
|
version = "1.1.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97"
|
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bstr",
|
"bstr",
|
||||||
"csv-core",
|
"csv-core",
|
||||||
|
@ -346,10 +359,10 @@ checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fnv",
|
"fnv",
|
||||||
"ident_case",
|
"ident_case",
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"strsim 0.9.3",
|
"strsim 0.9.3",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -360,7 +373,7 @@ checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"darling_core",
|
"darling_core",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -379,13 +392,14 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_more"
|
name = "derive_more"
|
||||||
version = "0.99.11"
|
version = "0.99.13"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c"
|
checksum = "f82b1b72f1263f214c0f823371768776c4f5841b942c9883aa8e5ec584fd0ba6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"convert_case",
|
||||||
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -395,9 +409,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b"
|
checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"darling",
|
"darling",
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -509,9 +523,9 @@ version = "0.6.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
|
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -717,9 +731,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "js-sys"
|
name = "js-sys"
|
||||||
version = "0.3.48"
|
version = "0.3.50"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dc9f84f9b115ce7843d60706df1422a916680bfdfcbdb0447c5614ff9d7e4d78"
|
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
@ -741,9 +755,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lexical-core"
|
name = "lexical-core"
|
||||||
version = "0.7.5"
|
version = "0.7.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374"
|
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrayvec",
|
"arrayvec",
|
||||||
"bitflags",
|
"bitflags",
|
||||||
|
@ -754,9 +768,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.86"
|
version = "0.2.94"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c"
|
checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "log"
|
name = "log"
|
||||||
|
@ -781,9 +795,9 @@ checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memoffset"
|
name = "memoffset"
|
||||||
version = "0.6.1"
|
version = "0.6.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
|
checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
]
|
]
|
||||||
|
@ -866,9 +880,9 @@ checksum = "a9a7ab5d64814df0fe4a4b5ead45ed6c5f181ee3ff04ba344313a6c80446c5d4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "once_cell"
|
name = "once_cell"
|
||||||
version = "1.7.0"
|
version = "1.7.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "10acf907b94fc1b1a152d08ef97e7759650268cf986bf127f387e602b02c7e5a"
|
checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "oorandom"
|
name = "oorandom"
|
||||||
|
@ -960,9 +974,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "predicates"
|
name = "predicates"
|
||||||
version = "1.0.7"
|
version = "1.0.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "eeb433456c1a57cc93554dea3ce40b4c19c4057e41c55d4a0f3d84ea71c325aa"
|
checksum = "f49cfaf7fdaa3bfacc6fa3e7054e65148878354a5cfddcf661df4c851f8021df"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"difference",
|
"difference",
|
||||||
"float-cmp",
|
"float-cmp",
|
||||||
|
@ -1000,9 +1014,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro-error-attr",
|
"proc-macro-error-attr",
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
"version_check",
|
"version_check",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1012,7 +1026,7 @@ version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"version_check",
|
"version_check",
|
||||||
]
|
]
|
||||||
|
@ -1028,11 +1042,11 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.24"
|
version = "1.0.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
|
checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"unicode-xid 0.2.1",
|
"unicode-xid 0.2.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1056,7 +1070,7 @@ version = "1.0.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
|
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1183,23 +1197,22 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redox_syscall"
|
name = "redox_syscall"
|
||||||
version = "0.2.5"
|
version = "0.2.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9"
|
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags",
|
"bitflags",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex"
|
name = "regex"
|
||||||
version = "1.4.3"
|
version = "1.4.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
|
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"memchr",
|
"memchr",
|
||||||
"regex-syntax",
|
"regex-syntax",
|
||||||
"thread_local",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1213,9 +1226,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-syntax"
|
name = "regex-syntax"
|
||||||
version = "0.6.22"
|
version = "0.6.23"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
|
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "remove_dir_all"
|
name = "remove_dir_all"
|
||||||
|
@ -1279,9 +1292,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.123"
|
version = "1.0.125"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae"
|
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"serde_derive",
|
"serde_derive",
|
||||||
]
|
]
|
||||||
|
@ -1298,13 +1311,13 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.123"
|
version = "1.0.125"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31"
|
checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1319,10 +1332,16 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "siphasher"
|
name = "simdutf8"
|
||||||
version = "0.3.3"
|
version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
|
checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "siphasher"
|
||||||
|
version = "0.3.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "static_assertions"
|
name = "static_assertions"
|
||||||
|
@ -1361,9 +1380,9 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"heck",
|
"heck",
|
||||||
"proc-macro-error",
|
"proc-macro-error",
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1379,13 +1398,13 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "1.0.60"
|
version = "1.0.71"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
|
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"unicode-xid 0.2.1",
|
"unicode-xid 0.2.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1441,9 +1460,9 @@ version = "1.0.24"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
|
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1457,9 +1476,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tinytemplate"
|
name = "tinytemplate"
|
||||||
version = "1.2.0"
|
version = "1.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74"
|
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
@ -1493,11 +1512,13 @@ dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"itertools 0.10.0",
|
"itertools 0.10.0",
|
||||||
"log",
|
"log",
|
||||||
|
"nom",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"regex",
|
|
||||||
"serde",
|
"serde",
|
||||||
|
"simdutf8",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
|
"unicode-xid 0.2.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1512,7 +1533,7 @@ dependencies = [
|
||||||
"clap-verbosity-flag",
|
"clap-verbosity-flag",
|
||||||
"content_inspector",
|
"content_inspector",
|
||||||
"criterion",
|
"criterion",
|
||||||
"derive_more 0.99.11",
|
"derive_more 0.99.13",
|
||||||
"derive_setters",
|
"derive_setters",
|
||||||
"difflib",
|
"difflib",
|
||||||
"encoding",
|
"encoding",
|
||||||
|
@ -1631,9 +1652,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-xid"
|
name = "unicode-xid"
|
||||||
version = "0.2.1"
|
version = "0.2.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
|
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "uuid"
|
name = "uuid"
|
||||||
|
@ -1676,15 +1697,15 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.9.2"
|
version = "0.9.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
|
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "walkdir"
|
name = "walkdir"
|
||||||
version = "2.3.1"
|
version = "2.3.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d"
|
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"same-file",
|
"same-file",
|
||||||
"winapi",
|
"winapi",
|
||||||
|
@ -1705,9 +1726,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasm-bindgen"
|
name = "wasm-bindgen"
|
||||||
version = "0.2.71"
|
version = "0.2.73"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7ee1280240b7c461d6a0071313e08f34a60b0365f14260362e5a2b17d1d31aa7"
|
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"wasm-bindgen-macro",
|
"wasm-bindgen-macro",
|
||||||
|
@ -1715,24 +1736,24 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasm-bindgen-backend"
|
name = "wasm-bindgen-backend"
|
||||||
version = "0.2.71"
|
version = "0.2.73"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5b7d8b6942b8bb3a9b0e73fc79b98095a27de6fa247615e59d096754a3bc2aa8"
|
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bumpalo",
|
"bumpalo",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"log",
|
"log",
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasm-bindgen-macro"
|
name = "wasm-bindgen-macro"
|
||||||
version = "0.2.71"
|
version = "0.2.73"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e5ac38da8ef716661f0f36c0d8320b89028efe10c7c0afde65baffb496ce0d3b"
|
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"wasm-bindgen-macro-support",
|
"wasm-bindgen-macro-support",
|
||||||
|
@ -1740,28 +1761,28 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasm-bindgen-macro-support"
|
name = "wasm-bindgen-macro-support"
|
||||||
version = "0.2.71"
|
version = "0.2.73"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cc053ec74d454df287b9374ee8abb36ffd5acb95ba87da3ba5b7d3fe20eb401e"
|
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.24",
|
"proc-macro2 1.0.26",
|
||||||
"quote 1.0.9",
|
"quote 1.0.9",
|
||||||
"syn 1.0.60",
|
"syn 1.0.71",
|
||||||
"wasm-bindgen-backend",
|
"wasm-bindgen-backend",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasm-bindgen-shared"
|
name = "wasm-bindgen-shared"
|
||||||
version = "0.2.71"
|
version = "0.2.73"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7d6f8ec44822dd71f5f221a5847fb34acd9060535c1211b70a05844c0f6383b1"
|
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "web-sys"
|
name = "web-sys"
|
||||||
version = "0.3.48"
|
version = "0.3.50"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ec600b26223b2948cedfde2a0aa6756dcf1fef616f43d7b3097aaf53a6c4d92b"
|
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"js-sys",
|
"js-sys",
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
|
|
|
@ -1,39 +1,93 @@
|
||||||
mod data;
|
mod data;
|
||||||
|
|
||||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
|
||||||
|
|
||||||
fn bench_tokenize(c: &mut Criterion) {
|
fn bench_parse_str(c: &mut Criterion) {
|
||||||
let mut group = c.benchmark_group("tokenize");
|
let mut group = c.benchmark_group("parse_str");
|
||||||
for (name, sample) in data::DATA {
|
for (name, sample) in data::DATA {
|
||||||
let len = sample.len();
|
let len = sample.len();
|
||||||
group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| {
|
group.throughput(Throughput::Bytes(len as u64));
|
||||||
let parser = typos::tokens::Tokenizer::new();
|
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
|
||||||
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
|
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
|
||||||
});
|
|
||||||
group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| {
|
|
||||||
let parser = typos::tokens::Tokenizer::new();
|
|
||||||
b.iter(|| parser.parse_str(sample).last());
|
b.iter(|| parser.parse_str(sample).last());
|
||||||
});
|
});
|
||||||
|
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
|
||||||
|
let parser = typos::tokens::TokenizerBuilder::new()
|
||||||
|
.unicode(false)
|
||||||
|
.build();
|
||||||
|
b.iter(|| parser.parse_str(sample).last());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bench_parse_bytes(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("parse_bytes");
|
||||||
|
for (name, sample) in data::DATA {
|
||||||
|
let len = sample.len();
|
||||||
|
group.throughput(Throughput::Bytes(len as u64));
|
||||||
|
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
|
||||||
|
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
|
||||||
|
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
|
||||||
|
});
|
||||||
|
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
|
||||||
|
let parser = typos::tokens::TokenizerBuilder::new()
|
||||||
|
.unicode(false)
|
||||||
|
.build();
|
||||||
|
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bench_split(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("split");
|
||||||
|
for (name, sample) in data::DATA {
|
||||||
|
let len = sample.len();
|
||||||
|
group.throughput(Throughput::Bytes(len as u64));
|
||||||
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
|
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
|
||||||
let symbol = typos::tokens::Identifier::new_unchecked(sample, 0);
|
let symbol =
|
||||||
|
typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
|
||||||
b.iter(|| symbol.split().last());
|
b.iter(|| symbol.split().last());
|
||||||
});
|
});
|
||||||
group.bench_with_input(
|
}
|
||||||
BenchmarkId::new("ident(bytes)+words", name),
|
group.finish();
|
||||||
&len,
|
}
|
||||||
|b, _| {
|
|
||||||
let parser = typos::tokens::Tokenizer::new();
|
fn bench_parse_split(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("parse_bytes+split");
|
||||||
|
for (name, sample) in data::DATA {
|
||||||
|
let len = sample.len();
|
||||||
|
group.throughput(Throughput::Bytes(len as u64));
|
||||||
|
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
|
||||||
|
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
parser
|
parser
|
||||||
.parse_bytes(sample.as_bytes())
|
.parse_bytes(sample.as_bytes())
|
||||||
.flat_map(|i| i.split())
|
.flat_map(|i| i.split())
|
||||||
.last()
|
.last()
|
||||||
});
|
});
|
||||||
},
|
});
|
||||||
);
|
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
|
||||||
|
let parser = typos::tokens::TokenizerBuilder::new()
|
||||||
|
.unicode(false)
|
||||||
|
.build();
|
||||||
|
b.iter(|| {
|
||||||
|
parser
|
||||||
|
.parse_bytes(sample.as_bytes())
|
||||||
|
.flat_map(|i| i.split())
|
||||||
|
.last()
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
group.finish();
|
group.finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(benches, bench_tokenize);
|
criterion_group!(
|
||||||
|
benches,
|
||||||
|
bench_parse_str,
|
||||||
|
bench_parse_bytes,
|
||||||
|
bench_split,
|
||||||
|
bench_parse_split
|
||||||
|
);
|
||||||
criterion_main!(benches);
|
criterion_main!(benches);
|
||||||
|
|
|
@ -17,9 +17,11 @@ codecov = { repository = "crate-ci/typos" }
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0"
|
anyhow = "1.0"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
regex = "1.3"
|
nom = "6.0"
|
||||||
|
unicode-xid = "0.2.2"
|
||||||
once_cell = "1.2.0"
|
once_cell = "1.2.0"
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
simdutf8 = "0.1.1"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
unicode-segmentation = "1.7.1"
|
unicode-segmentation = "1.7.1"
|
||||||
|
|
|
@ -1,11 +1,9 @@
|
||||||
/// Define rules for tokenizing a buffer.
|
/// Define rules for tokenizing a buffer.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct TokenizerBuilder {
|
pub struct TokenizerBuilder {
|
||||||
|
unicode: bool,
|
||||||
ignore_hex: bool,
|
ignore_hex: bool,
|
||||||
leading_digits: bool,
|
leading_digits: bool,
|
||||||
leading_chars: String,
|
|
||||||
include_digits: bool,
|
|
||||||
include_chars: String,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TokenizerBuilder {
|
impl TokenizerBuilder {
|
||||||
|
@ -13,6 +11,12 @@ impl TokenizerBuilder {
|
||||||
Default::default()
|
Default::default()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Specify that unicode Identifiers are allowed.
|
||||||
|
pub fn unicode(&mut self, yes: bool) -> &mut Self {
|
||||||
|
self.unicode = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
/// Specify that hexadecimal numbers should be ignored.
|
/// Specify that hexadecimal numbers should be ignored.
|
||||||
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
|
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
|
||||||
self.ignore_hex = yes;
|
self.ignore_hex = yes;
|
||||||
|
@ -25,64 +29,26 @@ impl TokenizerBuilder {
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extend accepted leading characters for Identifiers.
|
|
||||||
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
|
|
||||||
self.leading_chars = chars;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Specify that digits can be included in Identifiers.
|
|
||||||
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
|
|
||||||
self.include_digits = yes;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extend accepted characters for Identifiers.
|
|
||||||
pub fn include_chars(&mut self, chars: String) -> &mut Self {
|
|
||||||
self.include_chars = chars;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build(&self) -> Tokenizer {
|
pub fn build(&self) -> Tokenizer {
|
||||||
let mut pattern = r#"\b("#.to_owned();
|
let TokenizerBuilder {
|
||||||
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
|
unicode,
|
||||||
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
|
leading_digits,
|
||||||
pattern.push_str(r#"*)\b"#);
|
ignore_hex,
|
||||||
|
} = self.clone();
|
||||||
let words_str = regex::Regex::new(&pattern).unwrap();
|
|
||||||
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
|
|
||||||
|
|
||||||
Tokenizer {
|
Tokenizer {
|
||||||
words_str,
|
unicode,
|
||||||
words_bytes,
|
leading_digits,
|
||||||
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
|
ignore_hex,
|
||||||
// hexadecimal number without a leading digit.
|
|
||||||
ignore_numbers: self.leading_digits,
|
|
||||||
ignore_hex: self.ignore_hex && self.leading_digits,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
|
|
||||||
pattern.push_str(r#"(\p{Alphabetic}"#);
|
|
||||||
if digits {
|
|
||||||
pattern.push_str(r#"|\d"#);
|
|
||||||
}
|
|
||||||
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
|
|
||||||
let escaped = regex::escape(&grapheme);
|
|
||||||
pattern.push_str(&format!("|{}", escaped));
|
|
||||||
}
|
|
||||||
pattern.push(')');
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for TokenizerBuilder {
|
impl Default for TokenizerBuilder {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
ignore_hex: true,
|
unicode: true,
|
||||||
leading_digits: false,
|
leading_digits: false,
|
||||||
leading_chars: "_".to_owned(),
|
ignore_hex: true,
|
||||||
include_digits: true,
|
|
||||||
include_chars: "_'".to_owned(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -90,9 +56,8 @@ impl Default for TokenizerBuilder {
|
||||||
/// Extract Identifiers from a buffer.
|
/// Extract Identifiers from a buffer.
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Tokenizer {
|
pub struct Tokenizer {
|
||||||
words_str: regex::Regex,
|
unicode: bool,
|
||||||
words_bytes: regex::bytes::Regex,
|
leading_digits: bool,
|
||||||
ignore_numbers: bool,
|
|
||||||
ignore_hex: bool,
|
ignore_hex: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,32 +67,46 @@ impl Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
self.words_str
|
let iter = if self.unicode {
|
||||||
.find_iter(content)
|
itertools::Either::Left(unicode_parser::iter_literals(content))
|
||||||
.filter(move |m| self.accept(m.as_str().as_bytes()))
|
} else {
|
||||||
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
|
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
|
||||||
}
|
};
|
||||||
|
iter.filter_map(move |identifier| {
|
||||||
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
let offset = offset(content.as_bytes(), identifier.as_bytes());
|
||||||
self.words_bytes
|
self.transform(identifier, offset)
|
||||||
.find_iter(content)
|
|
||||||
.filter(move |m| self.accept(m.as_bytes()))
|
|
||||||
.filter_map(|m| {
|
|
||||||
let s = std::str::from_utf8(m.as_bytes()).ok();
|
|
||||||
s.map(|s| Identifier::new_unchecked(s, m.start()))
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn accept(&self, contents: &[u8]) -> bool {
|
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
|
||||||
if self.ignore_numbers && is_number(contents) {
|
let iter = if self.unicode {
|
||||||
return false;
|
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
|
||||||
|
itertools::Either::Left(iter)
|
||||||
|
} else {
|
||||||
|
itertools::Either::Right(ascii_parser::iter_literals(content))
|
||||||
|
};
|
||||||
|
iter.filter_map(move |identifier| {
|
||||||
|
let offset = offset(content, identifier.as_bytes());
|
||||||
|
self.transform(identifier, offset)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.ignore_hex && is_hex(contents) {
|
fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
|
||||||
return false;
|
debug_assert!(!identifier.is_empty());
|
||||||
|
if self.leading_digits {
|
||||||
|
if is_number(identifier.as_bytes()) {
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
true
|
if self.ignore_hex && is_hex(identifier.as_bytes()) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
} else if is_digit(identifier.as_bytes()[0]) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let case = Case::None;
|
||||||
|
Some(Identifier::new_unchecked(identifier, case, offset))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,34 +116,176 @@ impl Default for Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// `_`: number literal separator in Rust and other languages
|
fn offset(base: &[u8], needle: &[u8]) -> usize {
|
||||||
// `'`: number literal separator in C++
|
let base = base.as_ptr() as usize;
|
||||||
static DIGITS: once_cell::sync::Lazy<regex::bytes::Regex> =
|
let needle = needle.as_ptr() as usize;
|
||||||
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^[0-9_']+$"#).unwrap());
|
debug_assert!(base <= needle);
|
||||||
|
needle - base
|
||||||
fn is_number(ident: &[u8]) -> bool {
|
|
||||||
DIGITS.is_match(ident)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// `_`: number literal separator in Rust and other languages
|
struct Utf8Chunks<'s> {
|
||||||
// `'`: number literal separator in C++
|
source: &'s [u8],
|
||||||
static HEX: once_cell::sync::Lazy<regex::bytes::Regex> =
|
}
|
||||||
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap());
|
|
||||||
|
impl<'s> Utf8Chunks<'s> {
|
||||||
|
fn new(source: &'s [u8]) -> Self {
|
||||||
|
Self { source }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'s> Iterator for Utf8Chunks<'s> {
|
||||||
|
type Item = &'s str;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<&'s str> {
|
||||||
|
if self.source.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
match simdutf8::compat::from_utf8(self.source) {
|
||||||
|
Ok(valid) => {
|
||||||
|
self.source = b"";
|
||||||
|
Some(valid)
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
let (valid, after_valid) = self.source.split_at(error.valid_up_to());
|
||||||
|
|
||||||
|
if let Some(invalid_sequence_length) = error.error_len() {
|
||||||
|
self.source = &after_valid[invalid_sequence_length..];
|
||||||
|
} else {
|
||||||
|
self.source = b"";
|
||||||
|
}
|
||||||
|
|
||||||
|
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
|
||||||
|
Some(valid)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_number(ident: &[u8]) -> bool {
|
||||||
|
ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b))
|
||||||
|
}
|
||||||
|
|
||||||
fn is_hex(ident: &[u8]) -> bool {
|
fn is_hex(ident: &[u8]) -> bool {
|
||||||
HEX.is_match(ident)
|
if ident.len() < 3 {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
ident[0] == b'0'
|
||||||
|
&& ident[1] == b'x'
|
||||||
|
&& ident[2..]
|
||||||
|
.iter()
|
||||||
|
.all(|b| is_hex_digit(*b) || is_digit_sep(*b))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_digit(chr: u8) -> bool {
|
||||||
|
chr.is_ascii_digit()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_digit_sep(chr: u8) -> bool {
|
||||||
|
// `_`: number literal separator in Rust and other languages
|
||||||
|
// `'`: number literal separator in C++
|
||||||
|
chr == b'_' || chr == b'\''
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_hex_digit(chr: u8) -> bool {
|
||||||
|
chr.is_ascii_hexdigit()
|
||||||
|
}
|
||||||
|
|
||||||
|
mod unicode_parser {
|
||||||
|
use nom::bytes::complete::*;
|
||||||
|
use nom::sequence::*;
|
||||||
|
use nom::IResult;
|
||||||
|
|
||||||
|
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
|
||||||
|
std::iter::from_fn(move || match next_literal(input) {
|
||||||
|
Ok((i, o)) => {
|
||||||
|
input = i;
|
||||||
|
debug_assert_ne!(o, "");
|
||||||
|
Some(o)
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_literal(input: &str) -> IResult<&str, &str> {
|
||||||
|
preceded(literal_sep, identifier)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn literal_sep(input: &str) -> IResult<&str, &str> {
|
||||||
|
take_till(unicode_xid::UnicodeXID::is_xid_continue)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn identifier(input: &str) -> IResult<&str, &str> {
|
||||||
|
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
|
||||||
|
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and we would rather catch odd
|
||||||
|
// or unexpected cases than strip off start characters to a word since we aren't doing a
|
||||||
|
// proper word boundary parse
|
||||||
|
take_while1(unicode_xid::UnicodeXID::is_xid_continue)(input)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mod ascii_parser {
|
||||||
|
use nom::bytes::complete::*;
|
||||||
|
use nom::sequence::*;
|
||||||
|
use nom::IResult;
|
||||||
|
|
||||||
|
pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
|
||||||
|
std::iter::from_fn(move || match next_literal(input) {
|
||||||
|
Ok((i, o)) => {
|
||||||
|
input = i;
|
||||||
|
debug_assert_ne!(o, b"");
|
||||||
|
// This is safe because we've checked that the strings are a subset of ASCII
|
||||||
|
// characters.
|
||||||
|
let o = unsafe { std::str::from_utf8_unchecked(o) };
|
||||||
|
Some(o)
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||||
|
preceded(literal_sep, identifier)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||||
|
take_till(is_continue)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||||
|
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
|
||||||
|
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and we would rather catch odd
|
||||||
|
// or unexpected cases than strip off start characters to a word since we aren't doing a
|
||||||
|
// proper word boundary parse
|
||||||
|
take_while1(is_continue)(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_continue(c: u8) -> bool {
|
||||||
|
(b'a'..=b'z').contains(&c)
|
||||||
|
|| (b'A'..=b'Z').contains(&c)
|
||||||
|
|| (b'0'..=b'9').contains(&c)
|
||||||
|
|| c == b'_'
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A term composed of Words.
|
/// A term composed of Words.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub struct Identifier<'t> {
|
pub struct Identifier<'t> {
|
||||||
token: &'t str,
|
token: &'t str,
|
||||||
|
case: Case,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Identifier<'t> {
|
impl<'t> Identifier<'t> {
|
||||||
pub fn new_unchecked(token: &'t str, offset: usize) -> Self {
|
pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
|
||||||
Self { token, offset }
|
Self {
|
||||||
|
token,
|
||||||
|
case,
|
||||||
|
offset,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn token(&self) -> &'t str {
|
pub fn token(&self) -> &'t str {
|
||||||
|
@ -172,7 +293,7 @@ impl<'t> Identifier<'t> {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn case(&self) -> Case {
|
pub fn case(&self) -> Case {
|
||||||
Case::None
|
self.case
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn offset(&self) -> usize {
|
pub fn offset(&self) -> usize {
|
||||||
|
@ -181,7 +302,12 @@ impl<'t> Identifier<'t> {
|
||||||
|
|
||||||
/// Split into individual Words.
|
/// Split into individual Words.
|
||||||
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
|
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
|
||||||
SplitIdent::new(self.token, self.offset)
|
match self.case {
|
||||||
|
Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)),
|
||||||
|
_ => itertools::Either::Right(
|
||||||
|
Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(),
|
||||||
|
),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -269,7 +395,7 @@ impl<'s> Iterator for SplitIdent<'s> {
|
||||||
while let Some((i, c)) = self.char_indices.next() {
|
while let Some((i, c)) = self.char_indices.next() {
|
||||||
let cur_mode = WordMode::classify(c);
|
let cur_mode = WordMode::classify(c);
|
||||||
if cur_mode == WordMode::Boundary {
|
if cur_mode == WordMode::Boundary {
|
||||||
assert!(self.start_mode == WordMode::Boundary);
|
debug_assert!(self.start_mode == WordMode::Boundary);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if self.start_mode == WordMode::Boundary {
|
if self.start_mode == WordMode::Boundary {
|
||||||
|
@ -409,7 +535,7 @@ mod test {
|
||||||
let parser = Tokenizer::new();
|
let parser = Tokenizer::new();
|
||||||
|
|
||||||
let input = "word";
|
let input = "word";
|
||||||
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
|
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", Case::None, 0)];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
let actual: Vec<_> = parser.parse_str(input).collect();
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
@ -422,8 +548,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A B";
|
let input = "A B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 2),
|
Identifier::new_unchecked("B", Case::None, 2),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -437,8 +563,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A.B";
|
let input = "A.B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 2),
|
Identifier::new_unchecked("B", Case::None, 2),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -452,8 +578,8 @@ mod test {
|
||||||
|
|
||||||
let input = "A::B";
|
let input = "A::B";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("A", 0),
|
Identifier::new_unchecked("A", Case::None, 0),
|
||||||
Identifier::new_unchecked("B", 3),
|
Identifier::new_unchecked("B", Case::None, 3),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -466,7 +592,7 @@ mod test {
|
||||||
let parser = Tokenizer::new();
|
let parser = Tokenizer::new();
|
||||||
|
|
||||||
let input = "A_B";
|
let input = "A_B";
|
||||||
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
|
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", Case::None, 0)];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
let actual: Vec<_> = parser.parse_str(input).collect();
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
@ -475,12 +601,15 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tokenize_ignore_hex_enabled() {
|
fn tokenize_ignore_hex_enabled() {
|
||||||
let parser = TokenizerBuilder::new().ignore_hex(true).build();
|
let parser = TokenizerBuilder::new()
|
||||||
|
.ignore_hex(true)
|
||||||
|
.leading_digits(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
let input = "Hello 0xDEADBEEF World";
|
let input = "Hello 0xDEADBEEF World";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("Hello", 0),
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
Identifier::new_unchecked("World", 17),
|
Identifier::new_unchecked("World", Case::None, 17),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -497,9 +626,47 @@ mod test {
|
||||||
|
|
||||||
let input = "Hello 0xDEADBEEF World";
|
let input = "Hello 0xDEADBEEF World";
|
||||||
let expected: Vec<Identifier> = vec![
|
let expected: Vec<Identifier> = vec![
|
||||||
Identifier::new_unchecked("Hello", 0),
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
Identifier::new_unchecked("0xDEADBEEF", 6),
|
Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
|
||||||
Identifier::new_unchecked("World", 17),
|
Identifier::new_unchecked("World", Case::None, 17),
|
||||||
|
];
|
||||||
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_leading_digits_enabled() {
|
||||||
|
let parser = TokenizerBuilder::new()
|
||||||
|
.ignore_hex(false)
|
||||||
|
.leading_digits(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
let input = "Hello 0Hello 124 0xDEADBEEF World";
|
||||||
|
let expected: Vec<Identifier> = vec![
|
||||||
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
|
Identifier::new_unchecked("0Hello", Case::None, 6),
|
||||||
|
Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
|
||||||
|
Identifier::new_unchecked("World", Case::None, 28),
|
||||||
|
];
|
||||||
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
let actual: Vec<_> = parser.parse_str(input).collect();
|
||||||
|
assert_eq!(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tokenize_leading_digits_disabled() {
|
||||||
|
let parser = TokenizerBuilder::new()
|
||||||
|
.ignore_hex(false)
|
||||||
|
.leading_digits(false)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
let input = "Hello 0Hello 124 0xDEADBEEF World";
|
||||||
|
let expected: Vec<Identifier> = vec![
|
||||||
|
Identifier::new_unchecked("Hello", Case::None, 0),
|
||||||
|
Identifier::new_unchecked("World", Case::None, 28),
|
||||||
];
|
];
|
||||||
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
|
||||||
assert_eq!(expected, actual);
|
assert_eq!(expected, actual);
|
||||||
|
@ -564,7 +731,7 @@ mod test {
|
||||||
),
|
),
|
||||||
];
|
];
|
||||||
for (input, expected) in cases.iter() {
|
for (input, expected) in cases.iter() {
|
||||||
let ident = Identifier::new_unchecked(input, 0);
|
let ident = Identifier::new_unchecked(input, Case::None, 0);
|
||||||
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
|
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
|
||||||
assert_eq!(&result, expected);
|
assert_eq!(&result, expected);
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,11 +23,9 @@ Configuration is read from the following (in precedence order)
|
||||||
| default.binary | --binary | bool | Check binary files as text |
|
| default.binary | --binary | bool | Check binary files as text |
|
||||||
| default.check-filename | \- | bool | Verifying spelling in file names. |
|
| default.check-filename | \- | bool | Verifying spelling in file names. |
|
||||||
| default.check-file | \- | bool | Verifying spelling in files. |
|
| default.check-file | \- | bool | Verifying spelling in files. |
|
||||||
|
| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
|
||||||
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
|
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
|
||||||
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
|
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
|
||||||
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
|
|
||||||
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
|
|
||||||
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
|
|
||||||
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
|
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
|
||||||
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
|
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
|
||||||
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
|
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
|
||||||
|
|
81
src/args.rs
81
src/args.rs
|
@ -123,6 +123,12 @@ pub(crate) struct FileArgs {
|
||||||
#[structopt(long, overrides_with("no-check-files"), hidden(true))]
|
#[structopt(long, overrides_with("no-check-files"), hidden(true))]
|
||||||
check_files: bool,
|
check_files: bool,
|
||||||
|
|
||||||
|
#[structopt(long, overrides_with("no-unicode"), hidden(true))]
|
||||||
|
unicode: bool,
|
||||||
|
#[structopt(long, overrides_with("unicode"))]
|
||||||
|
/// Only allow ASCII characters in identifiers
|
||||||
|
no_unicode: bool,
|
||||||
|
|
||||||
#[structopt(
|
#[structopt(
|
||||||
long,
|
long,
|
||||||
possible_values(&config::Locale::variants()),
|
possible_values(&config::Locale::variants()),
|
||||||
|
@ -136,7 +142,10 @@ impl FileArgs {
|
||||||
binary: self.binary(),
|
binary: self.binary(),
|
||||||
check_filename: self.check_filename(),
|
check_filename: self.check_filename(),
|
||||||
check_file: self.check_file(),
|
check_file: self.check_file(),
|
||||||
tokenizer: None,
|
tokenizer: Some(config::TokenizerConfig {
|
||||||
|
unicode: self.unicode(),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
dict: Some(config::DictConfig {
|
dict: Some(config::DictConfig {
|
||||||
locale: self.locale,
|
locale: self.locale,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
|
@ -145,30 +154,19 @@ impl FileArgs {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn binary(&self) -> Option<bool> {
|
fn binary(&self) -> Option<bool> {
|
||||||
match (self.binary, self.no_binary) {
|
resolve_bool_arg(self.binary, self.no_binary)
|
||||||
(true, false) => Some(true),
|
|
||||||
(false, true) => Some(false),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_filename(&self) -> Option<bool> {
|
fn check_filename(&self) -> Option<bool> {
|
||||||
match (self.check_filenames, self.no_check_filenames) {
|
resolve_bool_arg(self.check_filenames, self.no_check_filenames)
|
||||||
(true, false) => Some(true),
|
|
||||||
(false, true) => Some(false),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn unicode(&self) -> Option<bool> {
|
||||||
|
resolve_bool_arg(self.unicode, self.no_unicode)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_file(&self) -> Option<bool> {
|
fn check_file(&self) -> Option<bool> {
|
||||||
match (self.check_files, self.no_check_files) {
|
resolve_bool_arg(self.check_files, self.no_check_files)
|
||||||
(true, false) => Some(true),
|
|
||||||
(false, true) => Some(false),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -244,56 +242,35 @@ impl WalkArgs {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_hidden(&self) -> Option<bool> {
|
fn ignore_hidden(&self) -> Option<bool> {
|
||||||
match (self.hidden, self.no_hidden) {
|
resolve_bool_arg(self.no_hidden, self.hidden)
|
||||||
(true, false) => Some(false),
|
|
||||||
(false, true) => Some(true),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_files(&self) -> Option<bool> {
|
fn ignore_files(&self) -> Option<bool> {
|
||||||
match (self.no_ignore, self.ignore) {
|
resolve_bool_arg(self.ignore, self.no_ignore)
|
||||||
(true, false) => Some(false),
|
|
||||||
(false, true) => Some(true),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_dot(&self) -> Option<bool> {
|
fn ignore_dot(&self) -> Option<bool> {
|
||||||
match (self.no_ignore_dot, self.ignore_dot) {
|
resolve_bool_arg(self.ignore_dot, self.no_ignore_dot)
|
||||||
(true, false) => Some(false),
|
|
||||||
(false, true) => Some(true),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_vcs(&self) -> Option<bool> {
|
fn ignore_vcs(&self) -> Option<bool> {
|
||||||
match (self.no_ignore_vcs, self.ignore_vcs) {
|
resolve_bool_arg(self.ignore_vcs, self.no_ignore_vcs)
|
||||||
(true, false) => Some(false),
|
|
||||||
(false, true) => Some(true),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_global(&self) -> Option<bool> {
|
fn ignore_global(&self) -> Option<bool> {
|
||||||
match (self.no_ignore_global, self.ignore_global) {
|
resolve_bool_arg(self.ignore_global, self.no_ignore_global)
|
||||||
(true, false) => Some(false),
|
|
||||||
(false, true) => Some(true),
|
|
||||||
(false, false) => None,
|
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ignore_parent(&self) -> Option<bool> {
|
fn ignore_parent(&self) -> Option<bool> {
|
||||||
match (self.no_ignore_parent, self.ignore_parent) {
|
resolve_bool_arg(self.ignore_parent, self.no_ignore_parent)
|
||||||
(true, false) => Some(false),
|
}
|
||||||
(false, true) => Some(true),
|
}
|
||||||
|
|
||||||
|
fn resolve_bool_arg(yes: bool, no: bool) -> Option<bool> {
|
||||||
|
match (yes, no) {
|
||||||
|
(true, false) => Some(true),
|
||||||
|
(false, true) => Some(false),
|
||||||
(false, false) => None,
|
(false, false) => None,
|
||||||
(_, _) => unreachable!("StructOpt should make this impossible"),
|
(_, _) => unreachable!("StructOpt should make this impossible"),
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -230,50 +230,38 @@ impl EngineConfig {
|
||||||
#[serde(deny_unknown_fields, default)]
|
#[serde(deny_unknown_fields, default)]
|
||||||
#[serde(rename_all = "kebab-case")]
|
#[serde(rename_all = "kebab-case")]
|
||||||
pub struct TokenizerConfig {
|
pub struct TokenizerConfig {
|
||||||
|
/// Allow unicode characters in identifiers (and not just ASCII)
|
||||||
|
pub unicode: Option<bool>,
|
||||||
/// Do not check identifiers that appear to be hexadecimal values.
|
/// Do not check identifiers that appear to be hexadecimal values.
|
||||||
pub ignore_hex: Option<bool>,
|
pub ignore_hex: Option<bool>,
|
||||||
/// Allow identifiers to start with digits, in addition to letters.
|
/// Allow identifiers to start with digits, in addition to letters.
|
||||||
pub identifier_leading_digits: Option<bool>,
|
pub identifier_leading_digits: Option<bool>,
|
||||||
/// Allow identifiers to start with one of these characters.
|
|
||||||
pub identifier_leading_chars: Option<kstring::KString>,
|
|
||||||
/// Allow identifiers to include digits, in addition to letters.
|
|
||||||
pub identifier_include_digits: Option<bool>,
|
|
||||||
/// Allow identifiers to include these characters.
|
|
||||||
pub identifier_include_chars: Option<kstring::KString>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TokenizerConfig {
|
impl TokenizerConfig {
|
||||||
pub fn from_defaults() -> Self {
|
pub fn from_defaults() -> Self {
|
||||||
let empty = Self::default();
|
let empty = Self::default();
|
||||||
Self {
|
Self {
|
||||||
|
unicode: Some(empty.unicode()),
|
||||||
ignore_hex: Some(empty.ignore_hex()),
|
ignore_hex: Some(empty.ignore_hex()),
|
||||||
identifier_leading_digits: Some(empty.identifier_leading_digits()),
|
identifier_leading_digits: Some(empty.identifier_leading_digits()),
|
||||||
identifier_leading_chars: Some(kstring::KString::from_ref(
|
|
||||||
empty.identifier_leading_chars(),
|
|
||||||
)),
|
|
||||||
identifier_include_digits: Some(empty.identifier_include_digits()),
|
|
||||||
identifier_include_chars: Some(kstring::KString::from_ref(
|
|
||||||
empty.identifier_include_chars(),
|
|
||||||
)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update(&mut self, source: &TokenizerConfig) {
|
pub fn update(&mut self, source: &TokenizerConfig) {
|
||||||
|
if let Some(source) = source.unicode {
|
||||||
|
self.unicode = Some(source);
|
||||||
|
}
|
||||||
if let Some(source) = source.ignore_hex {
|
if let Some(source) = source.ignore_hex {
|
||||||
self.ignore_hex = Some(source);
|
self.ignore_hex = Some(source);
|
||||||
}
|
}
|
||||||
if let Some(source) = source.identifier_leading_digits {
|
if let Some(source) = source.identifier_leading_digits {
|
||||||
self.identifier_leading_digits = Some(source);
|
self.identifier_leading_digits = Some(source);
|
||||||
}
|
}
|
||||||
if let Some(source) = source.identifier_leading_chars.as_ref() {
|
|
||||||
self.identifier_leading_chars = Some(source.clone());
|
|
||||||
}
|
|
||||||
if let Some(source) = source.identifier_include_digits {
|
|
||||||
self.identifier_include_digits = Some(source);
|
|
||||||
}
|
|
||||||
if let Some(source) = source.identifier_include_chars.as_ref() {
|
|
||||||
self.identifier_include_chars = Some(source.clone());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn unicode(&self) -> bool {
|
||||||
|
self.unicode.unwrap_or(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn ignore_hex(&self) -> bool {
|
pub fn ignore_hex(&self) -> bool {
|
||||||
|
@ -283,18 +271,6 @@ impl TokenizerConfig {
|
||||||
pub fn identifier_leading_digits(&self) -> bool {
|
pub fn identifier_leading_digits(&self) -> bool {
|
||||||
self.identifier_leading_digits.unwrap_or(false)
|
self.identifier_leading_digits.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn identifier_leading_chars(&self) -> &str {
|
|
||||||
self.identifier_leading_chars.as_deref().unwrap_or("_")
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn identifier_include_digits(&self) -> bool {
|
|
||||||
self.identifier_include_digits.unwrap_or(true)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn identifier_include_chars(&self) -> &str {
|
|
||||||
self.identifier_include_chars.as_deref().unwrap_or("_'")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
|
|
|
@ -217,11 +217,9 @@ impl<'s> ConfigEngine<'s> {
|
||||||
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
|
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
|
||||||
|
|
||||||
let tokenizer = typos::tokens::TokenizerBuilder::new()
|
let tokenizer = typos::tokens::TokenizerBuilder::new()
|
||||||
|
.unicode(tokenizer_config.unicode())
|
||||||
.ignore_hex(tokenizer_config.ignore_hex())
|
.ignore_hex(tokenizer_config.ignore_hex())
|
||||||
.leading_digits(tokenizer_config.identifier_leading_digits())
|
.leading_digits(tokenizer_config.identifier_leading_digits())
|
||||||
.leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
|
|
||||||
.include_digits(tokenizer_config.identifier_include_digits())
|
|
||||||
.include_chars(tokenizer_config.identifier_include_chars().to_owned())
|
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
let dict = crate::dict::BuiltIn::new(dict_config.locale());
|
let dict = crate::dict::BuiltIn::new(dict_config.locale());
|
||||||
|
|
Loading…
Reference in a new issue