feat: Expose wikipedia's dict to Rust

This commit is contained in:
Ed Page 2019-10-28 13:39:59 -06:00
parent 3daafd1ea7
commit 0f06e602cb
10 changed files with 9666 additions and 0 deletions

21
Cargo.lock generated
View file

@ -899,6 +899,27 @@ name = "wasi"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "wikipedia-codegen"
version = "0.1.1"
dependencies = [
"codegenrs 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"phf 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"phf_codegen 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"structopt 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 2.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "wikipedia-dict"
version = "0.1.1"
dependencies = [
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"phf 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 2.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi"
version = "0.3.8"

View file

@ -4,6 +4,7 @@ members = [
"dict/typos", "dict/typos/codegen",
"dict/codespell", "dict/codespell/codegen",
"dict/misspell", "dict/misspell/codegen",
"dict/wikipedia", "dict/wikipedia/codegen",
]
[package]

View file

@ -33,6 +33,9 @@ stages:
- script: |
cargo run --package misspell-codegen -- --output dict/misspell/src/dict_codegen.rs --check
displayName: Verify misspell-dict
- script: |
cargo run --package wikipedia-codegen -- --output dict/wikipedia/src/dict_codegen.rs --check
displayName: Verify wikipedia-dict
- stage: committed
displayName: Lint History
dependsOn: []

20
dict/wikipedia/Cargo.toml Normal file
View file

@ -0,0 +1,20 @@
[package]
name = "wikipedia-dict"
version = "0.1.1"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"
repository = "https://github.com/crate-ci/typos"
readme = "README.md"
categories = ["development-tools", "text-processing"]
keywords = ["development", "spelling"]
license = "CC-BY-SA-3.0"
edition = "2018"
[badges]
azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" }
[dependencies]
phf = { version = "0.8", features = ["unicase"] }
unicase = "2.5"
log = "0.4"

5
dict/wikipedia/README.md Normal file
View file

@ -0,0 +1,5 @@
Origin: [Wikipedia:Lists of common misspellings/For machines](https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines)
# License
Text is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,23 @@
[package]
name = "wikipedia-codegen"
version = "0.1.1"
authors = ["Ed Page <eopage@gmail.com>"]
description = "Source Code Spelling Correction"
repository = "https://github.com/crate-ci/typos"
readme = "../README.md"
categories = ["text-processing"]
license = "MIT"
edition = "2018"
publish = false
[badges]
azure-devops = { project = "crate-ci", pipeline = "typos" }
codecov = { repository = "crate-ci/typos" }
[dependencies]
phf = { version = "0.8", features = ["unicase"] }
phf_codegen = "0.8"
unicase = "2.5"
itertools = "0.8"
codegenrs = "0.1"
structopt = "0.3"

View file

@ -0,0 +1,80 @@
use structopt::StructOpt;
/// Raw dictionary text, embedded into the binary at compile time from
/// `../../assets/dictionary.txt` (resolved relative to this source file).
///
/// `parse_dict` consumes it one line at a time, treating each line as
/// `typo->correction[, correction...]`.
pub const DICT: &str = include_str!("../../assets/dictionary.txt");
/// Parses the raw dictionary text into `(typo, corrections)` pairs.
///
/// Each non-blank line must have the form `typo->fix1, fix2, ...`.
/// Surrounding whitespace is trimmed from the typo and from every
/// correction, and empty corrections (for example the one produced by a
/// trailing comma) are dropped. Blank or whitespace-only lines are skipped
/// so that a stray empty line in the asset does not abort code generation.
///
/// The returned iterator is lazy: lines are parsed as it is advanced.
///
/// # Panics
///
/// Panics while iterating if a non-blank line contains no `->` separator;
/// the message includes the offending line.
fn parse_dict(raw: &str) -> impl Iterator<Item = (&str, Vec<&str>)> {
    raw.lines().filter_map(|line| {
        let line = line.trim();
        if line.is_empty() {
            // Nothing to parse; tolerate blank lines instead of panicking.
            return None;
        }

        let mut parts = line.splitn(2, "->");
        let typo = parts
            .next()
            .expect("`splitn` always yields at least one item")
            .trim();
        let corrections: Vec<&str> = parts
            .next()
            .unwrap_or_else(|| panic!("dictionary line is missing `->`: {:?}", line))
            .split(',')
            .map(str::trim)
            .filter(|c| !c.is_empty())
            .collect();

        Some((typo, corrections))
    })
}
/// Writes the generated Rust source for the dictionary to `file`.
///
/// The output is, in order: a provenance comment naming this package, a
/// blank line, a `use unicase::UniCase;` import, and a `WORD_DICTIONARY`
/// static whose `phf::Map` initializer is rendered by `phf_codegen` from the
/// entries of `DICT`. Each value is emitted as a slice literal of quoted
/// corrections.
///
/// # Panics
///
/// Panics if a write to `file` fails, or if `parse_dict` panics on a
/// dictionary line.
fn generate<W: std::io::Write>(file: &mut W) {
    // Assemble the perfect-hash map up front; it does not touch `file`.
    let mut map = phf_codegen::Map::new();
    for (typo, corrections) in parse_dict(DICT) {
        // `{:?}` yields a quoted, escaped Rust string literal per correction.
        let quoted = corrections.iter().map(|c| format!("{:?}", c));
        let slice_literal = format!("&[{}]", itertools::join(quoted, ", "));
        map.entry(unicase::UniCase::new(typo), &slice_literal);
    }

    // File header.
    writeln!(
        file,
        "// This file is code-genned by {}",
        env!("CARGO_PKG_NAME")
    )
    .unwrap();
    writeln!(file).unwrap();
    writeln!(file, "use unicase::UniCase;").unwrap();

    // Static declaration, the rendered map, and the closing `;`.
    writeln!(
        file,
        "pub static WORD_DICTIONARY: phf::Map<unicase::UniCase<&'static str>, &[&'static str]> = "
    )
    .unwrap();
    writeln!(file, "{}", map.build()).unwrap();
    writeln!(file, ";").unwrap();
    writeln!(file).unwrap();
}
// Command-line options for this code generator, parsed via `structopt`.
//
// NOTE: plain `//` comments are used on purpose; `structopt` derives help
// text from `///` doc comments, so adding those would change the CLI output.
#[derive(Debug, StructOpt)]
#[structopt(rename_all = "kebab-case")]
struct Options {
// Arguments provided by `codegenrs`; `run` calls `write_str` on this value
// to emit the generated source.
#[structopt(flatten)]
codegen: codegenrs::CodeGenArgs,
// Arguments provided by `codegenrs`; `run` calls `reformat` on this value
// before the source is written.
// NOTE(review): the field name looks like a misspelling of `rustfmt`;
// renaming it requires updating `run` in the same change.
#[structopt(flatten)]
rustmft: codegenrs::RustfmtArgs,
}
/// Generates the dictionary source, reformats it, and hands it to the
/// `codegenrs` output arguments.
///
/// # Errors
///
/// Returns an error if the generated bytes are not valid UTF-8, if
/// reformatting fails, or if `write_str` fails.
///
/// # Returns
///
/// The process exit code; always `0` on success.
fn run() -> Result<i32, Box<dyn std::error::Error>> {
    let options = Options::from_args();

    // Render into memory first so the text can be reformatted as a whole.
    let mut buffer = Vec::new();
    generate(&mut buffer);

    let raw = String::from_utf8(buffer)?;
    let formatted = options.rustmft.reformat(&raw)?;
    options.codegen.write_str(&formatted)?;

    Ok(0)
}
/// Entry point: runs the generator and exits with the status code it returns.
///
/// # Panics
///
/// Panics if `run` returns an error.
fn main() {
    std::process::exit(run().unwrap());
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,3 @@
// Private module holding the dictionary tables.
// NOTE(review): presumably the `dict_codegen.rs` file written by the
// `wikipedia-codegen` binary — confirm before editing it by hand.
mod dict_codegen;
// Re-export every public item of `dict_codegen` at the crate root.
pub use crate::dict_codegen::*;