feat(parse): Make identifier symbols configurable

This commit is contained in:
Ed Page 2019-08-07 07:36:27 -05:00
parent e093135ac1
commit 3419a8df85
3 changed files with 19 additions and 4 deletions

7
Cargo.lock generated
View file

@ -271,7 +271,7 @@ name = "heck"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -811,6 +811,7 @@ dependencies = [
"serde_json 1.0.36 (registry+https://github.com/rust-lang/crates.io-index)",
"structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -828,7 +829,7 @@ dependencies = [
[[package]]
name = "unicode-segmentation"
version = "1.2.1"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -1013,7 +1014,7 @@ dependencies = [
"checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41"
"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86"
"checksum unicase 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33"
"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1"
"checksum unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1967f4cdfc355b37fd76d2a954fb2ed3871034eb4f26d60537d88795cfc332a9"
"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"

View file

@ -36,6 +36,7 @@ unicase = "1.1"
bstr = "0.2"
log = "0.4"
env_logger = "0.6"
unicode-segmentation = "1.3.0"
[dev-dependencies]
assert_fs = "0.10"

View file

@ -10,6 +10,7 @@ pub enum Case {
/// Collects the options that `build` uses to construct a `Parser`.
pub struct ParserBuilder {
// NOTE(review): this flag is not consumed by the pattern-building code visible
// in `build`; confirm where it takes effect.
ignore_hex: bool,
/// When set, `build` adds `\d` as an alternative in the word pattern.
include_digits: bool,
/// Extra characters allowed in the word pattern; `build` regex-escapes each
/// grapheme cluster of this string and adds it as an alternative.
include_chars: String,
}
impl ParserBuilder {
@ -27,11 +28,22 @@ impl ParserBuilder {
self
}
/// Sets the extra characters that may appear in a matched word, replacing
/// whatever set was configured before.
///
/// `build` splits `chars` into grapheme clusters, regex-escapes each one and
/// adds it as an alternative in the word pattern. The `Default`
/// implementation starts with `"_'"`.
///
/// Returns `&mut Self` so that builder calls can be chained.
pub fn include_chars(&mut self, chars: String) -> &mut Self {
self.include_chars = chars;
self
}
pub fn build(&self) -> Parser {
let mut pattern = r#"\b(\p{Alphabetic}|_|'"#.to_owned();
let mut pattern = r#"\b(\p{Alphabetic}"#.to_owned();
if self.include_digits {
pattern.push_str(r#"|\d"#);
}
for grapheme in
unicode_segmentation::UnicodeSegmentation::graphemes(self.include_chars.as_str(), true)
{
let escaped = regex::escape(&grapheme);
pattern.push_str(&format!("|{}", escaped));
}
pattern.push_str(r#")+\b"#);
let words_str = regex::Regex::new(&pattern).unwrap();
let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
@ -48,6 +60,7 @@ impl Default for ParserBuilder {
Self {
ignore_hex: true,
include_digits: true,
include_chars: "_'".to_owned(),
}
}
}