diff --git a/Cargo.lock b/Cargo.lock index 8856495..d13e961 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -271,7 +271,7 @@ name = "heck" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -811,6 +811,7 @@ dependencies = [ "serde_json 1.0.36 (registry+https://github.com/rust-lang/crates.io-index)", "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", "unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -828,7 +829,7 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -1013,7 +1014,7 @@ dependencies = [ "checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33" -"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" +"checksum unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1967f4cdfc355b37fd76d2a954fb2ed3871034eb4f26d60537d88795cfc332a9" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" diff --git a/Cargo.toml b/Cargo.toml index daef3c6..921ec74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ unicase = "1.1" bstr = "0.2" log = "0.4" env_logger = "0.6" +unicode-segmentation = "1.3.0" [dev-dependencies] assert_fs = "0.10" diff --git a/src/tokens.rs b/src/tokens.rs index d2fec14..421c59c 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -10,6 +10,7 @@ pub enum Case { pub struct ParserBuilder { ignore_hex: bool, include_digits: bool, + include_chars: String, } impl ParserBuilder { @@ -27,11 +28,22 @@ impl ParserBuilder { self } + pub fn include_chars(&mut self, chars: String) -> &mut Self { + self.include_chars = chars; + self + } + pub fn build(&self) -> Parser { - let mut pattern = r#"\b(\p{Alphabetic}|_|'"#.to_owned(); + let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned(); if self.include_digits { pattern.push_str(r#"|\d"#); } + for grapheme in + unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true) + { + let escaped = regex::escape(&grapheme); + pattern.push_str(&format!("|{}", escaped)); + } pattern.push_str(r#")+\b"#); let words_str = regex::Regex::new(&pattern).unwrap(); let words_bytes = regex::bytes::Regex::new(&pattern).unwrap(); @@ -48,6 +60,7 @@ impl Default for ParserBuilder { Self { ignore_hex: true, include_digits: true, + include_chars: "_'".to_owned(), } } }