Merge pull request #235 from epage/parser

perf(parser): Overhaul how parsing is done
This commit is contained in:
Ed Page 2021-04-30 13:04:11 -05:00 committed by GitHub
commit e4f477799a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 520 additions and 327 deletions

221
Cargo.lock generated
View file

@ -46,9 +46,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.38"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1"
checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b"
[[package]]
name = "arrayvec"
@ -58,9 +58,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "assert_fs"
version = "1.0.1"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3203d5bb9979ac7210f01a150578ebafef6f08b55e79f6db32673c0977b94340"
checksum = "73c485ca248200dfb850a64468a926321865cae0c450eaa7cdbe9ccf4ec49028"
dependencies = [
"doc-comment",
"globwalk",
@ -89,11 +89,12 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "backtrace"
version = "0.3.56"
version = "0.3.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc"
checksum = "88fb5a785d6b44fd9d6700935608639af1b8356de1e55d5f7c2740f4faa15d82"
dependencies = [
"addr2line",
"cc",
"cfg-if",
"libc",
"miniz_oxide",
@ -109,9 +110,9 @@ checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "bitvec"
version = "0.19.4"
version = "0.19.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81"
checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
dependencies = [
"funty",
"radium",
@ -139,19 +140,25 @@ checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe"
[[package]]
name = "byteorder"
version = "1.4.2"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cast"
version = "0.2.3"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
checksum = "cc38c385bfd7e444464011bb24820f40dd1c76bcdfa1b78611cb7c2e5cafab75"
dependencies = [
"rustc_version",
]
[[package]]
name = "cc"
version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -225,6 +232,12 @@ dependencies = [
"memchr",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "criterion"
version = "0.3.4"
@ -263,9 +276,9 @@ dependencies = [
[[package]]
name = "crossbeam-channel"
version = "0.5.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
dependencies = [
"cfg-if",
"crossbeam-utils",
@ -284,9 +297,9 @@ dependencies = [
[[package]]
name = "crossbeam-epoch"
version = "0.9.3"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12"
checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94"
dependencies = [
"cfg-if",
"crossbeam-utils",
@ -297,9 +310,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.3"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49"
checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278"
dependencies = [
"autocfg",
"cfg-if",
@ -308,9 +321,9 @@ dependencies = [
[[package]]
name = "csv"
version = "1.1.5"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
@ -346,10 +359,10 @@ checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [
"fnv",
"ident_case",
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"strsim 0.9.3",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -360,7 +373,7 @@ checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -379,13 +392,14 @@ dependencies = [
[[package]]
name = "derive_more"
version = "0.99.11"
version = "0.99.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c"
checksum = "f82b1b72f1263f214c0f823371768776c4f5841b942c9883aa8e5ec584fd0ba6"
dependencies = [
"proc-macro2 1.0.24",
"convert_case",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -395,9 +409,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b"
dependencies = [
"darling",
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -509,9 +523,9 @@ version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -717,9 +731,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "js-sys"
version = "0.3.48"
version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc9f84f9b115ce7843d60706df1422a916680bfdfcbdb0447c5614ff9d7e4d78"
checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c"
dependencies = [
"wasm-bindgen",
]
@ -741,9 +755,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "lexical-core"
version = "0.7.5"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374"
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [
"arrayvec",
"bitflags",
@ -754,9 +768,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.86"
version = "0.2.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c"
checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]]
name = "log"
@ -781,9 +795,9 @@ checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
[[package]]
name = "memoffset"
version = "0.6.1"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87"
checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d"
dependencies = [
"autocfg",
]
@ -866,9 +880,9 @@ checksum = "a9a7ab5d64814df0fe4a4b5ead45ed6c5f181ee3ff04ba344313a6c80446c5d4"
[[package]]
name = "once_cell"
version = "1.7.0"
version = "1.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10acf907b94fc1b1a152d08ef97e7759650268cf986bf127f387e602b02c7e5a"
checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3"
[[package]]
name = "oorandom"
@ -960,9 +974,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
[[package]]
name = "predicates"
version = "1.0.7"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eeb433456c1a57cc93554dea3ce40b4c19c4057e41c55d4a0f3d84ea71c325aa"
checksum = "f49cfaf7fdaa3bfacc6fa3e7054e65148878354a5cfddcf661df4c851f8021df"
dependencies = [
"difference",
"float-cmp",
@ -1000,9 +1014,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
"version_check",
]
@ -1012,7 +1026,7 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"version_check",
]
@ -1028,11 +1042,11 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.24"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71"
checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec"
dependencies = [
"unicode-xid 0.2.1",
"unicode-xid 0.2.2",
]
[[package]]
@ -1056,7 +1070,7 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
]
[[package]]
@ -1183,23 +1197,22 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.5"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9"
checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.4.3"
version = "1.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
@ -1213,9 +1226,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.6.22"
version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548"
[[package]]
name = "remove_dir_all"
@ -1279,9 +1292,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "serde"
version = "1.0.123"
version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae"
checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
dependencies = [
"serde_derive",
]
@ -1298,13 +1311,13 @@ dependencies = [
[[package]]
name = "serde_derive"
version = "1.0.123"
version = "1.0.125"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31"
checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -1319,10 +1332,16 @@ dependencies = [
]
[[package]]
name = "siphasher"
version = "0.3.3"
name = "simdutf8"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1"
[[package]]
name = "siphasher"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27"
[[package]]
name = "static_assertions"
@ -1361,9 +1380,9 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -1379,13 +1398,13 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.60"
version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081"
checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"unicode-xid 0.2.1",
"unicode-xid 0.2.2",
]
[[package]]
@ -1441,9 +1460,9 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
]
[[package]]
@ -1457,9 +1476,9 @@ dependencies = [
[[package]]
name = "tinytemplate"
version = "1.2.0"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
@ -1493,11 +1512,13 @@ dependencies = [
"anyhow",
"itertools 0.10.0",
"log",
"nom",
"once_cell",
"regex",
"serde",
"simdutf8",
"thiserror",
"unicode-segmentation",
"unicode-xid 0.2.2",
]
[[package]]
@ -1512,7 +1533,7 @@ dependencies = [
"clap-verbosity-flag",
"content_inspector",
"criterion",
"derive_more 0.99.11",
"derive_more 0.99.13",
"derive_setters",
"difflib",
"encoding",
@ -1631,9 +1652,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
[[package]]
name = "unicode-xid"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "uuid"
@ -1676,15 +1697,15 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.2"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
[[package]]
name = "walkdir"
version = "2.3.1"
version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d"
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
dependencies = [
"same-file",
"winapi",
@ -1705,9 +1726,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasm-bindgen"
version = "0.2.71"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ee1280240b7c461d6a0071313e08f34a60b0365f14260362e5a2b17d1d31aa7"
checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
@ -1715,24 +1736,24 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.71"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b7d8b6942b8bb3a9b0e73fc79b98095a27de6fa247615e59d096754a3bc2aa8"
checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae"
dependencies = [
"bumpalo",
"lazy_static",
"log",
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.71"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ac38da8ef716661f0f36c0d8320b89028efe10c7c0afde65baffb496ce0d3b"
checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f"
dependencies = [
"quote 1.0.9",
"wasm-bindgen-macro-support",
@ -1740,28 +1761,28 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.71"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc053ec74d454df287b9374ee8abb36ffd5acb95ba87da3ba5b7d3fe20eb401e"
checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c"
dependencies = [
"proc-macro2 1.0.24",
"proc-macro2 1.0.26",
"quote 1.0.9",
"syn 1.0.60",
"syn 1.0.71",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.71"
version = "0.2.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d6f8ec44822dd71f5f221a5847fb34acd9060535c1211b70a05844c0f6383b1"
checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489"
[[package]]
name = "web-sys"
version = "0.3.48"
version = "0.3.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec600b26223b2948cedfde2a0aa6756dcf1fef616f43d7b3097aaf53a6c4d92b"
checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be"
dependencies = [
"js-sys",
"wasm-bindgen",

View file

@ -1,39 +1,93 @@
mod data;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
fn bench_tokenize(c: &mut Criterion) {
let mut group = c.benchmark_group("tokenize");
fn bench_parse_str(c: &mut Criterion) {
let mut group = c.benchmark_group("parse_str");
for (name, sample) in data::DATA {
let len = sample.len();
group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| {
let parser = typos::tokens::Tokenizer::new();
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
});
group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| {
let parser = typos::tokens::Tokenizer::new();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
b.iter(|| parser.parse_str(sample).last());
});
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
let symbol = typos::tokens::Identifier::new_unchecked(sample, 0);
b.iter(|| symbol.split().last());
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| parser.parse_str(sample).last());
});
group.bench_with_input(
BenchmarkId::new("ident(bytes)+words", name),
&len,
|b, _| {
let parser = typos::tokens::Tokenizer::new();
b.iter(|| {
parser
.parse_bytes(sample.as_bytes())
.flat_map(|i| i.split())
.last()
});
},
);
}
group.finish();
}
criterion_group!(benches, bench_tokenize);
fn bench_parse_bytes(c: &mut Criterion) {
let mut group = c.benchmark_group("parse_bytes");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
});
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| parser.parse_bytes(sample.as_bytes()).last());
});
}
group.finish();
}
fn bench_split(c: &mut Criterion) {
let mut group = c.benchmark_group("split");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| {
let symbol =
typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0);
b.iter(|| symbol.split().last());
});
}
group.finish();
}
fn bench_parse_split(c: &mut Criterion) {
let mut group = c.benchmark_group("parse_bytes+split");
for (name, sample) in data::DATA {
let len = sample.len();
group.throughput(Throughput::Bytes(len as u64));
group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build();
b.iter(|| {
parser
.parse_bytes(sample.as_bytes())
.flat_map(|i| i.split())
.last()
});
});
group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| {
let parser = typos::tokens::TokenizerBuilder::new()
.unicode(false)
.build();
b.iter(|| {
parser
.parse_bytes(sample.as_bytes())
.flat_map(|i| i.split())
.last()
});
});
}
group.finish();
}
criterion_group!(
benches,
bench_parse_str,
bench_parse_bytes,
bench_split,
bench_parse_split
);
criterion_main!(benches);

View file

@ -17,9 +17,11 @@ codecov = { repository = "crate-ci/typos" }
[dependencies]
anyhow = "1.0"
thiserror = "1.0"
regex = "1.3"
nom = "6.0"
unicode-xid = "0.2.2"
once_cell = "1.2.0"
serde = { version = "1.0", features = ["derive"] }
simdutf8 = "0.1.1"
itertools = "0.10"
log = "0.4"
unicode-segmentation = "1.7.1"

View file

@ -1,11 +1,9 @@
/// Define rules for tokenizing a buffer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerBuilder {
unicode: bool,
ignore_hex: bool,
leading_digits: bool,
leading_chars: String,
include_digits: bool,
include_chars: String,
}
impl TokenizerBuilder {
@ -13,6 +11,12 @@ impl TokenizerBuilder {
Default::default()
}
/// Specify that unicode Identifiers are allowed.
pub fn unicode(&mut self, yes: bool) -> &mut Self {
self.unicode = yes;
self
}
/// Specify that hexadecimal numbers should be ignored.
pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
self.ignore_hex = yes;
@ -25,64 +29,26 @@ impl TokenizerBuilder {
self
}
/// Extend accepted leading characters for Identifiers.
pub fn leading_chars(&mut self, chars: String) -> &mut Self {
self.leading_chars = chars;
self
}
/// Specify that digits can be included in Identifiers.
pub fn include_digits(&mut self, yes: bool) -> &mut Self {
self.include_digits = yes;
self
}
/// Extend accepted characters for Identifiers.
pub fn include_chars(&mut self, chars: String) -> &mut Self {
self.include_chars = chars;
self
}
pub fn build(&self) -> Tokenizer {
let mut pattern = r#"\b("#.to_owned();
Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
pattern.push_str(r#"*)\b"#);
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
let TokenizerBuilder {
unicode,
leading_digits,
ignore_hex,
} = self.clone();
Tokenizer {
words_str,
words_bytes,
// `leading_digits` lets us bypass the regexes since you can't have a decimal or
// hexadecimal number without a leading digit.
ignore_numbers: self.leading_digits,
ignore_hex: self.ignore_hex && self.leading_digits,
unicode,
leading_digits,
ignore_hex,
}
}
fn push_pattern(pattern: &mut String, digits: bool, chars: &str) {
pattern.push_str(r#"(\p{Alphabetic}"#);
if digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) {
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push(')');
}
}
impl Default for TokenizerBuilder {
fn default() -> Self {
Self {
ignore_hex: true,
unicode: true,
leading_digits: false,
leading_chars: "_".to_owned(),
include_digits: true,
include_chars: "_'".to_owned(),
ignore_hex: true,
}
}
}
@ -90,9 +56,8 @@ impl Default for TokenizerBuilder {
/// Extract Identifiers from a buffer.
#[derive(Debug, Clone)]
pub struct Tokenizer {
words_str: regex::Regex,
words_bytes: regex::bytes::Regex,
ignore_numbers: bool,
unicode: bool,
leading_digits: bool,
ignore_hex: bool,
}
@ -102,32 +67,46 @@ impl Tokenizer {
}
pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
self.words_str
.find_iter(content)
.filter(move |m| self.accept(m.as_str().as_bytes()))
.map(|m| Identifier::new_unchecked(m.as_str(), m.start()))
let iter = if self.unicode {
itertools::Either::Left(unicode_parser::iter_literals(content))
} else {
itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes()))
};
iter.filter_map(move |identifier| {
let offset = offset(content.as_bytes(), identifier.as_bytes());
self.transform(identifier, offset)
})
}
pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator<Item = Identifier<'c>> {
self.words_bytes
.find_iter(content)
.filter(move |m| self.accept(m.as_bytes()))
.filter_map(|m| {
let s = std::str::from_utf8(m.as_bytes()).ok();
s.map(|s| Identifier::new_unchecked(s, m.start()))
})
let iter = if self.unicode {
let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c));
itertools::Either::Left(iter)
} else {
itertools::Either::Right(ascii_parser::iter_literals(content))
};
iter.filter_map(move |identifier| {
let offset = offset(content, identifier.as_bytes());
self.transform(identifier, offset)
})
}
fn accept(&self, contents: &[u8]) -> bool {
if self.ignore_numbers && is_number(contents) {
return false;
fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option<Identifier<'i>> {
debug_assert!(!identifier.is_empty());
if self.leading_digits {
if is_number(identifier.as_bytes()) {
return None;
}
if self.ignore_hex && is_hex(identifier.as_bytes()) {
return None;
}
} else if is_digit(identifier.as_bytes()[0]) {
return None;
}
if self.ignore_hex && is_hex(contents) {
return false;
}
true
let case = Case::None;
Some(Identifier::new_unchecked(identifier, case, offset))
}
}
@ -137,34 +116,176 @@ impl Default for Tokenizer {
}
}
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
static DIGITS: once_cell::sync::Lazy<regex::bytes::Regex> =
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^[0-9_']+$"#).unwrap());
fn is_number(ident: &[u8]) -> bool {
DIGITS.is_match(ident)
fn offset(base: &[u8], needle: &[u8]) -> usize {
let base = base.as_ptr() as usize;
let needle = needle.as_ptr() as usize;
debug_assert!(base <= needle);
needle - base
}
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
static HEX: once_cell::sync::Lazy<regex::bytes::Regex> =
once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap());
struct Utf8Chunks<'s> {
source: &'s [u8],
}
impl<'s> Utf8Chunks<'s> {
fn new(source: &'s [u8]) -> Self {
Self { source }
}
}
impl<'s> Iterator for Utf8Chunks<'s> {
type Item = &'s str;
fn next(&mut self) -> Option<&'s str> {
if self.source.is_empty() {
return None;
}
match simdutf8::compat::from_utf8(self.source) {
Ok(valid) => {
self.source = b"";
Some(valid)
}
Err(error) => {
let (valid, after_valid) = self.source.split_at(error.valid_up_to());
if let Some(invalid_sequence_length) = error.error_len() {
self.source = &after_valid[invalid_sequence_length..];
} else {
self.source = b"";
}
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
Some(valid)
}
}
}
}
fn is_number(ident: &[u8]) -> bool {
ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b))
}
fn is_hex(ident: &[u8]) -> bool {
HEX.is_match(ident)
if ident.len() < 3 {
false
} else {
ident[0] == b'0'
&& ident[1] == b'x'
&& ident[2..]
.iter()
.all(|b| is_hex_digit(*b) || is_digit_sep(*b))
}
}
#[inline]
fn is_digit(chr: u8) -> bool {
chr.is_ascii_digit()
}
#[inline]
fn is_digit_sep(chr: u8) -> bool {
// `_`: number literal separator in Rust and other languages
// `'`: number literal separator in C++
chr == b'_' || chr == b'\''
}
#[inline]
fn is_hex_digit(chr: u8) -> bool {
chr.is_ascii_hexdigit()
}
mod unicode_parser {
use nom::bytes::complete::*;
use nom::sequence::*;
use nom::IResult;
pub(crate) fn iter_literals(mut input: &str) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) {
Ok((i, o)) => {
input = i;
debug_assert_ne!(o, "");
Some(o)
}
_ => None,
})
}
fn next_literal(input: &str) -> IResult<&str, &str> {
preceded(literal_sep, identifier)(input)
}
fn literal_sep(input: &str) -> IResult<&str, &str> {
take_till(unicode_xid::UnicodeXID::is_xid_continue)(input)
}
fn identifier(input: &str) -> IResult<&str, &str> {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse
take_while1(unicode_xid::UnicodeXID::is_xid_continue)(input)
}
}
mod ascii_parser {
use nom::bytes::complete::*;
use nom::sequence::*;
use nom::IResult;
pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator<Item = &str> {
std::iter::from_fn(move || match next_literal(input) {
Ok((i, o)) => {
input = i;
debug_assert_ne!(o, b"");
// This is safe because we've checked that the strings are a subset of ASCII
// characters.
let o = unsafe { std::str::from_utf8_unchecked(o) };
Some(o)
}
_ => None,
})
}
fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> {
preceded(literal_sep, identifier)(input)
}
fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> {
take_till(is_continue)(input)
}
fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> {
// Generally a language would be `{XID_Start}{XID_Continue}*` but going with only
// `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd
// or unexpected cases than strip off start characters to a word since we aren't doing a
// proper word boundary parse
take_while1(is_continue)(input)
}
fn is_continue(c: u8) -> bool {
(b'a'..=b'z').contains(&c)
|| (b'A'..=b'Z').contains(&c)
|| (b'0'..=b'9').contains(&c)
|| c == b'_'
}
}
/// A term composed of Words.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Identifier<'t> {
token: &'t str,
case: Case,
offset: usize,
}
impl<'t> Identifier<'t> {
pub fn new_unchecked(token: &'t str, offset: usize) -> Self {
Self { token, offset }
pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self {
Self {
token,
case,
offset,
}
}
pub fn token(&self) -> &'t str {
@ -172,7 +293,7 @@ impl<'t> Identifier<'t> {
}
pub fn case(&self) -> Case {
Case::None
self.case
}
pub fn offset(&self) -> usize {
@ -181,7 +302,12 @@ impl<'t> Identifier<'t> {
/// Split into individual Words.
pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
SplitIdent::new(self.token, self.offset)
match self.case {
Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)),
_ => itertools::Either::Right(
Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(),
),
}
}
}
@ -269,7 +395,7 @@ impl<'s> Iterator for SplitIdent<'s> {
while let Some((i, c)) = self.char_indices.next() {
let cur_mode = WordMode::classify(c);
if cur_mode == WordMode::Boundary {
assert!(self.start_mode == WordMode::Boundary);
debug_assert!(self.start_mode == WordMode::Boundary);
continue;
}
if self.start_mode == WordMode::Boundary {
@ -409,7 +535,7 @@ mod test {
let parser = Tokenizer::new();
let input = "word";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", Case::None, 0)];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
@ -422,8 +548,8 @@ mod test {
let input = "A B";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 2),
Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", Case::None, 2),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
@ -437,8 +563,8 @@ mod test {
let input = "A.B";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 2),
Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", Case::None, 2),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
@ -452,8 +578,8 @@ mod test {
let input = "A::B";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("A", 0),
Identifier::new_unchecked("B", 3),
Identifier::new_unchecked("A", Case::None, 0),
Identifier::new_unchecked("B", Case::None, 3),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
@ -466,7 +592,7 @@ mod test {
let parser = Tokenizer::new();
let input = "A_B";
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", Case::None, 0)];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
@ -475,12 +601,15 @@ mod test {
#[test]
fn tokenize_ignore_hex_enabled() {
let parser = TokenizerBuilder::new().ignore_hex(true).build();
let parser = TokenizerBuilder::new()
.ignore_hex(true)
.leading_digits(true)
.build();
let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", 0),
Identifier::new_unchecked("World", 17),
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 17),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
@ -497,9 +626,47 @@ mod test {
let input = "Hello 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", 0),
Identifier::new_unchecked("0xDEADBEEF", 6),
Identifier::new_unchecked("World", 17),
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("0xDEADBEEF", Case::None, 6),
Identifier::new_unchecked("World", Case::None, 17),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
let actual: Vec<_> = parser.parse_str(input).collect();
assert_eq!(expected, actual);
}
#[test]
fn tokenize_leading_digits_enabled() {
    // With leading digits allowed (and hex detection off), an identifier may
    // begin with a digit ("0Hello", "0xDEADBEEF"); a bare number ("124") is
    // still not emitted as an identifier.
    let tokenizer = TokenizerBuilder::new()
        .ignore_hex(false)
        .leading_digits(true)
        .build();
    let input = "Hello 0Hello 124 0xDEADBEEF World";
    let expected: Vec<Identifier> = vec![
        Identifier::new_unchecked("Hello", Case::None, 0),
        Identifier::new_unchecked("0Hello", Case::None, 6),
        Identifier::new_unchecked("0xDEADBEEF", Case::None, 17),
        Identifier::new_unchecked("World", Case::None, 28),
    ];

    // Both entry points must agree on the same token stream.
    let from_bytes: Vec<_> = tokenizer.parse_bytes(input.as_bytes()).collect();
    assert_eq!(expected, from_bytes);
    let from_str: Vec<_> = tokenizer.parse_str(input).collect();
    assert_eq!(expected, from_str);
}
#[test]
fn tokenize_leading_digits_disabled() {
let parser = TokenizerBuilder::new()
.ignore_hex(false)
.leading_digits(false)
.build();
let input = "Hello 0Hello 124 0xDEADBEEF World";
let expected: Vec<Identifier> = vec![
Identifier::new_unchecked("Hello", Case::None, 0),
Identifier::new_unchecked("World", Case::None, 28),
];
let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect();
assert_eq!(expected, actual);
@ -564,7 +731,7 @@ mod test {
),
];
for (input, expected) in cases.iter() {
let ident = Identifier::new_unchecked(input, 0);
let ident = Identifier::new_unchecked(input, Case::None, 0);
let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect();
assert_eq!(&result, expected);
}

View file

@ -23,11 +23,9 @@ Configuration is read from the following (in precedence order)
| default.binary | --binary | bool | Check binary files as text |
| default.check-filename | \- | bool | Verifying spelling in file names. |
| default.check-file | \- | bool | Verifying spelling in files. |
| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) |
| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. |
| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. |
| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. |
| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. |
| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. |
| default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. |
| default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |
| default.extend-words | \- | table of strings | Corrections for words. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. |

View file

@ -123,6 +123,12 @@ pub(crate) struct FileArgs {
#[structopt(long, overrides_with("no-check-files"), hidden(true))]
check_files: bool,
#[structopt(long, overrides_with("no-unicode"), hidden(true))]
unicode: bool,
#[structopt(long, overrides_with("unicode"))]
/// Only allow ASCII characters in identifiers
no_unicode: bool,
#[structopt(
long,
possible_values(&config::Locale::variants()),
@ -136,7 +142,10 @@ impl FileArgs {
binary: self.binary(),
check_filename: self.check_filename(),
check_file: self.check_file(),
tokenizer: None,
tokenizer: Some(config::TokenizerConfig {
unicode: self.unicode(),
..Default::default()
}),
dict: Some(config::DictConfig {
locale: self.locale,
..Default::default()
@ -145,30 +154,19 @@ impl FileArgs {
}
fn binary(&self) -> Option<bool> {
match (self.binary, self.no_binary) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.binary, self.no_binary)
}
fn check_filename(&self) -> Option<bool> {
match (self.check_filenames, self.no_check_filenames) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.check_filenames, self.no_check_filenames)
}
fn unicode(&self) -> Option<bool> {
resolve_bool_arg(self.unicode, self.no_unicode)
}
fn check_file(&self) -> Option<bool> {
match (self.check_files, self.no_check_files) {
(true, false) => Some(true),
(false, true) => Some(false),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.check_files, self.no_check_files)
}
}
@ -244,56 +242,35 @@ impl WalkArgs {
}
fn ignore_hidden(&self) -> Option<bool> {
match (self.hidden, self.no_hidden) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.no_hidden, self.hidden)
}
fn ignore_files(&self) -> Option<bool> {
match (self.no_ignore, self.ignore) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.ignore, self.no_ignore)
}
fn ignore_dot(&self) -> Option<bool> {
match (self.no_ignore_dot, self.ignore_dot) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.ignore_dot, self.no_ignore_dot)
}
fn ignore_vcs(&self) -> Option<bool> {
match (self.no_ignore_vcs, self.ignore_vcs) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.ignore_vcs, self.no_ignore_vcs)
}
fn ignore_global(&self) -> Option<bool> {
match (self.no_ignore_global, self.ignore_global) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.ignore_global, self.no_ignore_global)
}
fn ignore_parent(&self) -> Option<bool> {
match (self.no_ignore_parent, self.ignore_parent) {
(true, false) => Some(false),
(false, true) => Some(true),
(false, false) => None,
(_, _) => unreachable!("StructOpt should make this impossible"),
}
resolve_bool_arg(self.ignore_parent, self.no_ignore_parent)
}
}
/// Collapses a structopt `--flag` / `--no-flag` pair into a tri-state value.
///
/// Returns `Some(true)` when only the positive flag was given, `Some(false)`
/// when only the negative flag was given, and `None` when neither was (caller
/// falls back to config/defaults). Both flags at once is impossible because
/// the structopt definitions use `overrides_with`, so that arm panics.
fn resolve_bool_arg(yes: bool, no: bool) -> Option<bool> {
    match (yes, no) {
        (true, false) => Some(true),
        (false, true) => Some(false),
        (false, false) => None,
        // Spelled out (instead of `(_, _)`) so the match is visibly exhaustive.
        (true, true) => unreachable!("StructOpt should make this impossible"),
    }
}

View file

@ -230,50 +230,38 @@ impl EngineConfig {
#[serde(deny_unknown_fields, default)]
#[serde(rename_all = "kebab-case")]
pub struct TokenizerConfig {
/// Allow unicode characters in identifiers (and not just ASCII)
pub unicode: Option<bool>,
/// Do not check identifiers that appear to be hexadecimal values.
pub ignore_hex: Option<bool>,
/// Allow identifiers to start with digits, in addition to letters.
pub identifier_leading_digits: Option<bool>,
/// Allow identifiers to start with one of these characters.
pub identifier_leading_chars: Option<kstring::KString>,
/// Allow identifiers to include digits, in addition to letters.
pub identifier_include_digits: Option<bool>,
/// Allow identifiers to include these characters.
pub identifier_include_chars: Option<kstring::KString>,
}
impl TokenizerConfig {
pub fn from_defaults() -> Self {
let empty = Self::default();
Self {
unicode: Some(empty.unicode()),
ignore_hex: Some(empty.ignore_hex()),
identifier_leading_digits: Some(empty.identifier_leading_digits()),
identifier_leading_chars: Some(kstring::KString::from_ref(
empty.identifier_leading_chars(),
)),
identifier_include_digits: Some(empty.identifier_include_digits()),
identifier_include_chars: Some(kstring::KString::from_ref(
empty.identifier_include_chars(),
)),
}
}
pub fn update(&mut self, source: &TokenizerConfig) {
if let Some(source) = source.unicode {
self.unicode = Some(source);
}
if let Some(source) = source.ignore_hex {
self.ignore_hex = Some(source);
}
if let Some(source) = source.identifier_leading_digits {
self.identifier_leading_digits = Some(source);
}
if let Some(source) = source.identifier_leading_chars.as_ref() {
self.identifier_leading_chars = Some(source.clone());
}
if let Some(source) = source.identifier_include_digits {
self.identifier_include_digits = Some(source);
}
if let Some(source) = source.identifier_include_chars.as_ref() {
self.identifier_include_chars = Some(source.clone());
}
}
pub fn unicode(&self) -> bool {
self.unicode.unwrap_or(true)
}
pub fn ignore_hex(&self) -> bool {
@ -283,18 +271,6 @@ impl TokenizerConfig {
pub fn identifier_leading_digits(&self) -> bool {
self.identifier_leading_digits.unwrap_or(false)
}
pub fn identifier_leading_chars(&self) -> &str {
self.identifier_leading_chars.as_deref().unwrap_or("_")
}
pub fn identifier_include_digits(&self) -> bool {
self.identifier_include_digits.unwrap_or(true)
}
pub fn identifier_include_chars(&self) -> &str {
self.identifier_include_chars.as_deref().unwrap_or("_'")
}
}
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]

View file

@ -217,11 +217,9 @@ impl<'s> ConfigEngine<'s> {
let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults);
let tokenizer = typos::tokens::TokenizerBuilder::new()
.unicode(tokenizer_config.unicode())
.ignore_hex(tokenizer_config.ignore_hex())
.leading_digits(tokenizer_config.identifier_leading_digits())
.leading_chars(tokenizer_config.identifier_leading_chars().to_owned())
.include_digits(tokenizer_config.identifier_include_digits())
.include_chars(tokenizer_config.identifier_include_chars().to_owned())
.build();
let dict = crate::dict::BuiltIn::new(dict_config.locale());