feat(dict): varcon dict

2024-12-22 15:42:23 -05:00 · 2020-04-07 19:50:06 -05:00 · 2020-04-07 19:50:06 -05:00 · 7f983992bd
commit 7f983992bd
parent 814ff82aff
15 changed files with 484863 additions and 3 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -24,6 +24,15 @@ version = "1.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9a60d744a80c30fcb657dfe2c1b22bcb3e814c1a1e3674f32bf5820b570fbff"

+[[package]]
+name = "arrayvec"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9"
+dependencies = [
+ "nodrop",
+]
+
 [[package]]
 name = "assert_fs"
 version = "1.0.0"
@ -81,9 +90,9 @@ checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"

 [[package]]
 name = "cfg-if"
-version = "0.1.10"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+checksum = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"

 [[package]]
 name = "clap"
@ -276,6 +285,26 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"

+[[package]]
+name = "enumflags2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83c8d82922337cd23a15f88b70d8e4ef5f11da38dd7cdb55e84dd5de99695da0"
+dependencies = [
+ "enumflags2_derive",
+]
+
+[[package]]
+name = "enumflags2_derive"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce"
+dependencies = [
+ "proc-macro2 1.0.12",
+ "quote 1.0.4",
+ "syn 1.0.19",
+]
+
 [[package]]
 name = "env_logger"
 version = "0.7.1"
@ -411,6 +440,20 @@ version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"

+[[package]]
+name = "lexical-core"
+version = "0.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f86d66d380c9c5a685aaac7a11818bdfa1f733198dfd9ec09c70b762cd12ad6f"
+dependencies = [
+ "arrayvec",
+ "bitflags",
+ "cfg-if",
+ "rustc_version",
+ "ryu",
+ "static_assertions",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.69"
@ -460,6 +503,23 @@ dependencies = [
 "unicase",
 ]

+[[package]]
+name = "nodrop"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
+
+[[package]]
+name = "nom"
+version = "5.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6"
+dependencies = [
+ "lexical-core",
+ "memchr",
+ "version_check",
+]
+
 [[package]]
 name = "normalize-line-endings"
 version = "0.3.0"
@ -786,6 +846,12 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"

+[[package]]
+name = "static_assertions"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
+
 [[package]]
 name = "strsim"
 version = "0.8.0"
@ -1025,6 +1091,29 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"

+[[package]]
+name = "varcon-codegen"
+version = "1.0.2"
+dependencies = [
+ "codegenrs",
+ "structopt",
+ "varcon-parser",
+]
+
+[[package]]
+name = "varcon-dict"
+version = "0.2.1"
+dependencies = [
+ "enumflags2",
+]
+
+[[package]]
+name = "varcon-parser"
+version = "1.0.0"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "vec_map"
 version = "0.8.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -5,6 +5,7 @@ members = [
    "crates/codespell-dict", "crates/codespell-dict/codegen",
    "crates/misspell-dict", "crates/misspell-dict/codegen",
    "crates/wikipedia-dict", "crates/wikipedia-dict/codegen",
+    "crates/varcon", "crates/varcon/codegen", "crates/varcon-parser",
 ]

 [package]
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -39,6 +39,9 @@ stages:
    - script: |
        cargo run --package wikipedia-codegen -- --output crates/wikipedia-dict/src/dict_codegen.rs --check
      displayName: Verify wikipedia-dict
+    - script: |
+        cargo run --package varcon-codegen -- --output crates/varcon/src/codegen.rs --check
+      displayName: Verify varcon-dict
 - stage: committed
  displayName: Lint History
  dependsOn: []
--- a/crates/typos-dict/codegen/src/main.rs
+++ b/crates/typos-dict/codegen/src/main.rs
@ -1,6 +1,6 @@
 use structopt::StructOpt;

-pub const DICT: &[u8] = include_bytes!("../../assets/words.csv");
+const DICT: &[u8] = include_bytes!("../../assets/words.csv");

 fn generate<W: std::io::Write>(file: &mut W) {
    writeln!(
--- a/crates/varcon-parser/Cargo.toml
+++ b/crates/varcon-parser/Cargo.toml
@ -0,0 +1,17 @@
+[package]
+name = "varcon-parser"
+version = "1.0.0"
+authors = ["Ed Page <eopage@gmail.com>"]
+description = "Parse varcon.txt file"
+repository = "https://github.com/crate-ci/typos"
+# Relative to this manifest (crates/varcon-parser): the workspace README is two levels up.
+readme = "../../README.md"
+categories = ["text-processing"]
+license = "MIT"
+edition = "2018"
+
+[badges]
+azure-devops = { project = "crate-ci", pipeline = "typos" }
+codecov = { repository = "crate-ci/typos" }
+
+[dependencies]
+nom = "5.1.1"
--- a/crates/varcon-parser/src/lib.rs
+++ b/crates/varcon-parser/src/lib.rs
@ -0,0 +1,107 @@
+mod parser;
+
+pub use parser::ClusterIter;
+
+/// A group of related entries from `varcon.txt`.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Cluster {
+    /// Text of the leading `# ...` line, when the cluster has one.
+    pub header: Option<String>,
+    /// The entry lines of the cluster, in file order.
+    pub entries: Vec<Entry>,
+    /// Text of the trailing `## ...` lines.
+    pub notes: Vec<String>,
+}
+
+impl Cluster {
+    /// Runs `Entry::infer` on every entry of the cluster, in order.
+    pub fn infer(&mut self) {
+        self.entries.iter_mut().for_each(Entry::infer);
+    }
+}
+
+/// One line of a cluster: the spellings of a single word plus optional annotations.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Entry {
+    /// The `/`-separated spellings, each with its own tags.
+    pub variants: Vec<Variant>,
+    /// Part-of-speech marker (`<N>`, `<V>`, `<Adj>`, `<Adv>`) found after `|`.
+    pub pos: Option<Pos>,
+    /// Whether the `(-)` marker was present after `|`.
+    pub archaic: bool,
+    /// Whether the `--` marker was present after `|`.
+    pub note: bool,
+    /// Free text following `|` and any markers.
+    pub description: Option<String>,
+    /// Text of a trailing `# ...` comment.
+    pub comment: Option<String>,
+}
+
+impl Entry {
+    /// Fills in categories that the line leaves implicit.
+    ///
+    /// The rules run in this order, and each one only fires when no variant
+    /// already carries the target category:
+    /// 1. `BritishIse` types are copied as `BritishIze`.
+    /// 2. `BritishIze` types (including ones added by rule 1) are copied as `Canadian`.
+    /// 3. `BritishIse` types are copied as `Australian`.
+    pub fn infer(&mut self) {
+        const RULES: [(Category, Category); 3] = [
+            (Category::BritishIse, Category::BritishIze),
+            (Category::BritishIze, Category::Canadian),
+            (Category::BritishIse, Category::Australian),
+        ];
+        for &(required, missing) in RULES.iter() {
+            imply(&mut self.variants, required, missing);
+        }
+    }
+}
+
+/// Adds a `missing`-category copy of every `required`-category type.
+///
+/// Does nothing when any variant already has a type whose category is
+/// `missing`. Otherwise, for each variant, every type whose category is
+/// `required` is cloned (keeping its `tag` and `num`), re-labelled as `missing`,
+/// and appended to that variant's `types`.
+///
+/// Takes a slice because only the elements are mutated, never the list itself.
+fn imply(variants: &mut [Variant], required: Category, missing: Category) {
+    let missing_exists = variants
+        .iter()
+        .flat_map(|v| v.types.iter())
+        .any(|t| t.category == missing);
+    if missing_exists {
+        return;
+    }
+
+    for variant in variants.iter_mut() {
+        // Collect first: `types` cannot be extended while it is being iterated.
+        let implied: Vec<_> = variant
+            .types
+            .iter()
+            .filter(|t| t.category == required)
+            .map(|t| Type {
+                category: missing,
+                ..t.clone()
+            })
+            .collect();
+        variant.types.extend(implied);
+    }
+}
+
+/// One spelling of a word together with the tags that say where it is used.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Variant {
+    /// Tags listed before the `:`.
+    pub types: Vec<Type>,
+    /// The spelling itself; underscores in the source are replaced by spaces.
+    pub word: String,
+}
+
+/// A single tag on a variant: a spelling category with optional qualifiers.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Type {
+    /// Which spelling category the tag names.
+    pub category: Category,
+    /// Variant indicator following the category character, if any.
+    pub tag: Option<Tag>,
+    /// Digits following the tag, if any (e.g. the `1` in `Av1`).
+    // NOTE(review): the meaning of this number is not visible here; confirm against the VarCon README.
+    pub num: Option<usize>,
+}
+
+/// Spelling category named by the first character of a tag.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub enum Category {
+    /// `A`: American.
+    American,
+    /// `B`: British "ise" spelling.
+    BritishIse,
+    /// `Z`: British "ize" spelling or OED preferred spelling.
+    BritishIze,
+    /// `C`: Canadian.
+    Canadian,
+    /// `D`: Australian.
+    Australian,
+    /// `_`: other variant information, not tied to one of the categories above.
+    Other,
+}
+
+/// Variant indicator that may follow the category character of a tag.
+///
+/// The derived ordering follows declaration order, so `Eq` compares lowest and
+/// `Improper` highest.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+pub enum Tag {
+    /// `.`: equal.
+    Eq,
+    /// `v`: variant.
+    Variant,
+    /// `V`: seldom used variant.
+    Seldom,
+    /// `-`: possible variant.
+    Possible,
+    /// `x`: improper variant.
+    Improper,
+}
+
+/// Part-of-speech marker that can appear in an entry's description.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub enum Pos {
+    /// `<N>`
+    Noun,
+    /// `<V>`
+    Verb,
+    /// `<Adj>`
+    Adjective,
+    /// `<Adv>`
+    Adverb,
+}
--- a/crates/varcon-parser/src/parser.rs
+++ b/crates/varcon-parser/src/parser.rs
@ -0,0 +1,568 @@
+use nom::IResult;
+use nom::InputTakeAtPosition;
+
+use crate::*;
+
+/// Iterator over the clusters contained in `varcon.txt`-formatted text.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct ClusterIter<'i> {
+    /// The part of the text that has not been parsed yet.
+    input: &'i str,
+}
+
+impl<'i> ClusterIter<'i> {
+    /// Creates an iterator that starts at the beginning of `input`.
+    pub fn new(input: &'i str) -> Self {
+        ClusterIter { input }
+    }
+}
+
+impl<'i> Iterator for ClusterIter<'i> {
+    type Item = Cluster;
+
+    /// Skips leading whitespace and parses the next cluster.
+    ///
+    /// Yields `None` as soon as `Cluster::parse` does not succeed, whether the
+    /// text is exhausted or malformed; the failure itself is discarded.
+    fn next(&mut self) -> Option<Cluster> {
+        match Cluster::parse(self.input.trim_start()) {
+            Ok((rest, cluster)) => {
+                self.input = rest;
+                Some(cluster)
+            }
+            Err(_) => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod test_cluster_iter {
+    use super::*;
+
+    #[test]
+    fn test_single() {
+        let iter = ClusterIter::new(
+            "# acknowledgment <verified> (level 35)
+A Cv: acknowledgment / Av B C: acknowledgement
+A Cv: acknowledgments / Av B C: acknowledgements
+A Cv: acknowledgment's / Av B C: acknowledgement's
+
+",
+        );
+        let all: Vec<_> = iter.collect();
+        assert_eq!(all.len(), 1);
+    }
+
+    #[test]
+    fn test_multiple() {
+        let iter = ClusterIter::new(
+            "# acknowledgment <verified> (level 35)
+A Cv: acknowledgment / Av B C: acknowledgement
+A Cv: acknowledgments / Av B C: acknowledgements
+A Cv: acknowledgment's / Av B C: acknowledgement's
+
+# acknowledgment <verified> (level 35)
+A Cv: acknowledgment / Av B C: acknowledgement
+A Cv: acknowledgments / Av B C: acknowledgements
+A Cv: acknowledgment's / Av B C: acknowledgement's
+
+",
+        );
+        let all: Vec<_> = iter.collect();
+        assert_eq!(all.len(), 2);
+    }
+}
+
+impl Cluster {
+    /// Parses one cluster from the start of `input`.
+    ///
+    /// A cluster is an optional `# ...` header line, one or more entry lines, and
+    /// zero or more `## ...` note lines. On success the unparsed remainder of
+    /// `input` is returned alongside the cluster.
+    pub fn parse(input: &str) -> IResult<&str, Self> {
+        // Header line: `#`, optional spaces, the rest of the line, the line ending.
+        let header = nom::sequence::tuple((
+            nom::bytes::streaming::tag("#"),
+            nom::character::streaming::space0,
+            nom::character::streaming::not_line_ending,
+            nom::character::streaming::line_ending,
+        ));
+        // Note line: `##` and optional spaces are dropped, the rest of the line is kept.
+        let note = nom::sequence::preceded(
+            nom::sequence::pair(
+                nom::bytes::streaming::tag("##"),
+                nom::character::streaming::space0,
+            ),
+            nom::sequence::terminated(
+                nom::character::streaming::not_line_ending,
+                nom::character::streaming::line_ending,
+            ),
+        );
+        let cluster = nom::sequence::tuple((
+            nom::combinator::opt(header),
+            // Each entry must be followed by a line ending; at least one entry is required.
+            nom::multi::many1(nom::sequence::terminated(
+                Entry::parse,
+                nom::character::streaming::line_ending,
+            )),
+            nom::multi::many0(note),
+        ));
+        let (input, (header, entries, notes)) = (cluster)(input)?;
+
+        // Element 2 of the header tuple is the text after `#` and the spaces.
+        let header = header.map(|s| s.2.to_owned());
+        let notes = notes.into_iter().map(|s| s.to_owned()).collect();
+        let c = Self {
+            header,
+            entries,
+            notes,
+        };
+        Ok((input, c))
+    }
+}
+
+#[cfg(test)]
+mod test_cluster {
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let (input, actual) = Cluster::parse(
+            "# acknowledgment <verified> (level 35)
+A Cv: acknowledgment / Av B C: acknowledgement
+A Cv: acknowledgments / Av B C: acknowledgements
+A Cv: acknowledgment's / Av B C: acknowledgement's
+
+",
+        )
+        .unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(
+            actual.header,
+            Some("acknowledgment <verified> (level 35)".to_owned())
+        );
+        assert_eq!(actual.entries.len(), 3);
+        assert_eq!(actual.notes.len(), 0);
+    }
+
+    #[test]
+    fn test_notes() {
+        let (input, actual) = Cluster::parse(
+            "# coloration <verified> (level 50)
+A B C: coloration / B. Cv: colouration
+A B C: colorations / B. Cv: colourations
+A B C: coloration's / B. Cv: colouration's
+## OED has coloration as the preferred spelling and discolouration as a
+## variant for British Engl or some reason
+
+",
+        )
+        .unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(
+            actual.header,
+            Some("coloration <verified> (level 50)".to_owned())
+        );
+        assert_eq!(actual.entries.len(), 3);
+        assert_eq!(actual.notes.len(), 2);
+    }
+}
+
+impl Entry {
+    /// Parses one entry line, leaving its line ending unconsumed.
+    ///
+    /// The line consists of one or more variants separated by `/`, optionally
+    /// followed by `|` and a description part, optionally followed by `#`, at
+    /// least one space, and a comment that runs to the end of the line.
+    pub fn parse(input: &str) -> IResult<&str, Self> {
+        let var_sep = nom::sequence::tuple((
+            nom::character::streaming::space0,
+            nom::bytes::streaming::tag("/"),
+            nom::character::streaming::space0,
+        ));
+        let (input, variants) =
+            nom::multi::separated_nonempty_list(var_sep, Variant::parse)(input)?;
+
+        let desc_sep = nom::sequence::tuple((
+            nom::character::streaming::space0,
+            nom::bytes::streaming::tag("|"),
+        ));
+        let (input, description) =
+            nom::combinator::opt(nom::sequence::tuple((desc_sep, Self::parse_description)))(input)?;
+
+        let comment_sep = nom::sequence::tuple((
+            nom::character::streaming::space0,
+            nom::bytes::streaming::tag("#"),
+        ));
+        let (input, comment) = nom::combinator::opt(nom::sequence::tuple((
+            comment_sep,
+            nom::character::streaming::space1,
+            nom::character::streaming::not_line_ending,
+        )))(input)?;
+
+        // Start from the entry built by `parse_description` when there was a `|`
+        // part, otherwise from an entry with every annotation unset.
+        let mut e = match description {
+            Some((_, description)) => description,
+            None => Self {
+                variants: Vec::new(),
+                pos: None,
+                archaic: false,
+                note: false,
+                description: None,
+                comment: None,
+            },
+        };
+        e.variants = variants;
+        // Element 2 of the comment tuple is the text after `#` and the spaces.
+        e.comment = comment.map(|c| c.2.to_owned());
+        Ok((input, e))
+    }
+
+    /// Parses what follows the `|` of an entry line.
+    ///
+    /// Each of the following is optional and must be preceded by at least one
+    /// space, in this order: a part-of-speech marker, `(-)`, `--`, and free text
+    /// that stops before a `\n`, `\r`, or `#`.
+    ///
+    /// The returned entry has empty `variants` and no `comment`; the caller
+    /// fills those in.
+    fn parse_description(input: &str) -> IResult<&str, Self> {
+        let (input, (pos, archaic, note, description)) = nom::sequence::tuple((
+            nom::combinator::opt(nom::sequence::tuple((
+                nom::character::streaming::space1,
+                Pos::parse,
+            ))),
+            nom::combinator::opt(nom::sequence::tuple((
+                nom::character::streaming::space1,
+                nom::bytes::streaming::tag("(-)"),
+            ))),
+            nom::combinator::opt(nom::sequence::tuple((
+                nom::character::streaming::space1,
+                nom::bytes::streaming::tag("--"),
+            ))),
+            nom::combinator::opt(nom::sequence::tuple((
+                nom::character::streaming::space1,
+                nom::bytes::streaming::take_till(|c| c == '\n' || c == '\r' || c == '#'),
+            ))),
+        ))(input)?;
+
+        let variants = Vec::new();
+        // Drop the leading-space element of each tuple; only presence matters
+        // for the two markers.
+        let pos = pos.map(|(_, p)| p);
+        let archaic = archaic.is_some();
+        let note = note.is_some();
+        let description = description.map(|(_, d)| d.to_owned());
+        let e = Self {
+            variants,
+            pos,
+            archaic,
+            note,
+            description,
+            comment: None,
+        };
+        Ok((input, e))
+    }
+}
+
+#[cfg(test)]
+mod test_entry {
+    use super::*;
+
+    #[test]
+    fn test_variant_only() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) =
+            Entry::parse("A Cv: acknowledgment's / Av B C: acknowledgement's\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 2);
+        assert_eq!(actual.pos, None);
+        assert_eq!(actual.archaic, false);
+        assert_eq!(actual.note, false);
+        assert_eq!(actual.description, None);
+    }
+
+    #[test]
+    fn test_description() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Entry::parse("A C: prize / B: prise | otherwise\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 2);
+        assert_eq!(actual.pos, None);
+        assert_eq!(actual.archaic, false);
+        assert_eq!(actual.note, false);
+        assert_eq!(actual.description, Some("otherwise".to_owned()));
+    }
+
+    #[test]
+    fn test_pos() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Entry::parse("A B C: practice / AV Cv: practise | <N>\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 2);
+        assert_eq!(actual.pos, Some(Pos::Noun));
+        assert_eq!(actual.archaic, false);
+        assert_eq!(actual.note, false);
+        assert_eq!(actual.description, None);
+    }
+
+    #[test]
+    fn test_archaic() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Entry::parse("A: bark / Av B: barque | (-) ship\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 2);
+        assert_eq!(actual.pos, None);
+        assert_eq!(actual.archaic, true);
+        assert_eq!(actual.note, false);
+        assert_eq!(actual.description, Some("ship".to_owned()));
+    }
+
+    #[test]
+    fn test_note() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Entry::parse("_: cabbies | -- plural\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 1);
+        assert_eq!(actual.pos, None);
+        assert_eq!(actual.archaic, false);
+        assert_eq!(actual.note, true);
+        assert_eq!(actual.description, Some("plural".to_owned()));
+    }
+
+    #[test]
+    fn test_trailing_comment() {
+        let (input, actual) = Entry::parse(
+            "A B: accursed / AV B-: accurst # ODE: archaic, M-W: 'or' but can find little evidence of use\n",
+        )
+        .unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(actual.variants.len(), 2);
+        assert_eq!(actual.pos, None);
+        assert_eq!(actual.archaic, false);
+        assert_eq!(actual.note, false);
+        assert_eq!(actual.description, None);
+        assert_eq!(
+            actual.comment,
+            Some("ODE: archaic, M-W: 'or' but can find little evidence of use".to_owned())
+        );
+    }
+}
+
+impl Variant {
+    /// Parses one variant: space-separated tags, a `:`, optional spaces, and the word.
+    pub fn parse(input: &str) -> IResult<&str, Self> {
+        // One or more tags, separated by at least one space.
+        let types =
+            nom::multi::separated_nonempty_list(nom::character::streaming::space1, Type::parse);
+        let sep = nom::sequence::tuple((
+            nom::bytes::streaming::tag(":"),
+            nom::character::streaming::space0,
+        ));
+        let (input, (types, word)) = nom::sequence::separated_pair(types, sep, word)(input)?;
+        let v = Self { types, word };
+        Ok((input, v))
+    }
+}
+
+/// Parses the spelling part of a variant.
+///
+/// Consumes characters up to, but not including, the first ASCII whitespace;
+/// at least one character is required. Underscores in the source are turned
+/// into spaces in the returned word.
+fn word(input: &str) -> IResult<&str, String> {
+    input
+        .split_at_position1(
+            |item| item.is_ascii_whitespace(),
+            nom::error::ErrorKind::Alpha,
+        )
+        // `str::replace` already returns an owned `String`; no extra copy needed.
+        .map(|(i, s)| (i, s.replace('_', " ")))
+}
+
+#[cfg(test)]
+mod test_variant {
+    use super::*;
+
+    #[test]
+    fn test_valid() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Variant::parse("A Cv: acknowledgment ").unwrap();
+        assert_eq!(input, " ");
+        assert_eq!(
+            actual.types,
+            vec![
+                Type {
+                    category: Category::American,
+                    tag: None,
+                    num: None,
+                },
+                Type {
+                    category: Category::Canadian,
+                    tag: Some(Tag::Variant),
+                    num: None,
+                }
+            ]
+        );
+        assert_eq!(actual.word, "acknowledgment");
+    }
+
+    #[test]
+    fn test_extra() {
+        let (input, actual) =
+            Variant::parse("A Cv: acknowledgment's / Av B C: acknowledgement's").unwrap();
+        assert_eq!(input, " / Av B C: acknowledgement's");
+        assert_eq!(
+            actual.types,
+            vec![
+                Type {
+                    category: Category::American,
+                    tag: None,
+                    num: None,
+                },
+                Type {
+                    category: Category::Canadian,
+                    tag: Some(Tag::Variant),
+                    num: None,
+                }
+            ]
+        );
+        assert_eq!(actual.word, "acknowledgment's");
+    }
+
+    #[test]
+    fn test_underscore() {
+        let (input, actual) = Variant::parse("_: air_gun\n").unwrap();
+        assert_eq!(input, "\n");
+        assert_eq!(
+            actual.types,
+            vec![Type {
+                category: Category::Other,
+                tag: None,
+                num: None,
+            },]
+        );
+        assert_eq!(actual.word, "air gun");
+    }
+}
+
+impl Type {
+    /// Parses one tag: a category character, an optional variant indicator, and
+    /// an optional run of digits.
+    ///
+    /// A digit run that does not fit in `usize` is left unconsumed and `num`
+    /// stays `None`, rather than panicking on malformed input.
+    pub fn parse(input: &str) -> IResult<&str, Type> {
+        let (input, category) = Category::parse(input)?;
+        let (input, tag) = nom::combinator::opt(Tag::parse)(input)?;
+        // `map_res` turns a failed integer conversion into a recoverable parse
+        // error, which `opt` then reports as "no number".
+        let (input, num) = nom::combinator::opt(nom::combinator::map_res(
+            nom::character::streaming::digit1,
+            |digits: &str| digits.parse::<usize>(),
+        ))(input)?;
+        let t = Type { category, tag, num };
+        Ok((input, t))
+    }
+}
+
+#[cfg(test)]
+mod test_type {
+    use super::*;
+
+    #[test]
+    fn test_valid() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Type::parse("A ").unwrap();
+        assert_eq!(input, " ");
+        assert_eq!(actual.category, Category::American);
+        assert_eq!(actual.tag, None);
+        assert_eq!(actual.num, None);
+
+        let (input, actual) = Type::parse("Bv ").unwrap();
+        assert_eq!(input, " ");
+        assert_eq!(actual.category, Category::BritishIse);
+        assert_eq!(actual.tag, Some(Tag::Variant));
+        assert_eq!(actual.num, None);
+    }
+
+    #[test]
+    fn test_extra() {
+        let (input, actual) = Type::parse("Z foobar").unwrap();
+        assert_eq!(input, " foobar");
+        assert_eq!(actual.category, Category::BritishIze);
+        assert_eq!(actual.tag, None);
+        assert_eq!(actual.num, None);
+
+        let (input, actual) = Type::parse("C- foobar").unwrap();
+        assert_eq!(input, " foobar");
+        assert_eq!(actual.category, Category::Canadian);
+        assert_eq!(actual.tag, Some(Tag::Possible));
+        assert_eq!(actual.num, None);
+    }
+
+    #[test]
+    fn test_num() {
+        let (input, actual) = Type::parse("Av1 ").unwrap();
+        assert_eq!(input, " ");
+        assert_eq!(actual.category, Category::American);
+        assert_eq!(actual.tag, Some(Tag::Variant));
+        assert_eq!(actual.num, Some(1));
+    }
+}
+
+impl Category {
+    /// Parses the single character that names a spelling category.
+    pub fn parse(input: &str) -> IResult<&str, Category> {
+        let (rest, symbol) = nom::character::streaming::one_of("ABZCD_")(input)?;
+        let category = match symbol {
+            'A' => Category::American,
+            'B' => Category::BritishIse,
+            'Z' => Category::BritishIze,
+            'C' => Category::Canadian,
+            'D' => Category::Australian,
+            '_' => Category::Other,
+            // `one_of` only yields characters from the list above.
+            _ => unreachable!("parser won't select this option"),
+        };
+        Ok((rest, category))
+    }
+}
+
+#[cfg(test)]
+mod test_category {
+    use super::*;
+
+    #[test]
+    fn test_valid() {
+        let (input, actual) = Category::parse("A").unwrap();
+        assert_eq!(input, "");
+        assert_eq!(actual, Category::American);
+    }
+
+    #[test]
+    fn test_extra() {
+        let (input, actual) = Category::parse("_ foobar").unwrap();
+        assert_eq!(input, " foobar");
+        assert_eq!(actual, Category::Other);
+    }
+}
+
+impl Tag {
+    /// Parses the single character that marks how a variant is used.
+    pub fn parse(input: &str) -> IResult<&str, Tag> {
+        let (rest, symbol) = nom::character::streaming::one_of(".vV-x")(input)?;
+        let tag = match symbol {
+            '.' => Tag::Eq,
+            'v' => Tag::Variant,
+            'V' => Tag::Seldom,
+            '-' => Tag::Possible,
+            'x' => Tag::Improper,
+            // `one_of` only yields characters from the list above.
+            _ => unreachable!("parser won't select this option"),
+        };
+        Ok((rest, tag))
+    }
+}
+
+#[cfg(test)]
+mod test_tag {
+    use super::*;
+
+    #[test]
+    fn test_valid() {
+        let (input, actual) = Tag::parse(".").unwrap();
+        assert_eq!(input, "");
+        assert_eq!(actual, Tag::Eq);
+    }
+
+    #[test]
+    fn test_extra() {
+        let (input, actual) = Tag::parse("x foobar").unwrap();
+        assert_eq!(input, " foobar");
+        assert_eq!(actual, Tag::Improper);
+    }
+}
+
+impl Pos {
+    /// Parses a part-of-speech marker: `<N>`, `<V>`, `<Adj>`, or `<Adv>`.
+    pub fn parse(input: &str) -> IResult<&str, Pos> {
+        use nom::bytes::streaming::tag;
+        let noun = tag("<N>");
+        let verb = tag("<V>");
+        let adjective = tag("<Adj>");
+        let adverb = tag("<Adv>");
+        // Alternatives are tried in the order listed.
+        nom::alt!(input,
+            noun => {|_| Pos::Noun } |
+            verb => {|_| Pos::Verb } |
+            adjective => {|_| Pos::Adjective } |
+            adverb => {|_| Pos::Adverb }
+        )
+    }
+}
+
+#[cfg(test)]
+mod test_pos {
+    use super::*;
+
+    #[test]
+    fn test_valid() {
+        let (input, actual) = Pos::parse("<N>").unwrap();
+        assert_eq!(input, "");
+        assert_eq!(actual, Pos::Noun);
+    }
+
+    #[test]
+    fn test_extra() {
+        let (input, actual) = Pos::parse("<Adj> foobar").unwrap();
+        assert_eq!(input, " foobar");
+        assert_eq!(actual, Pos::Adjective);
+    }
+}
--- a/crates/varcon/Cargo.toml
+++ b/crates/varcon/Cargo.toml
@ -0,0 +1,23 @@
+[package]
+name = "varcon-dict"
+version = "0.2.1"
+authors = ["Ed Page <eopage@gmail.com>"]
+description = "Source Code Spelling Correction"
+repository = "https://github.com/crate-ci/typos"
+readme = "../../README.md"
+categories = ["development-tools", "text-processing"]
+keywords = ["development", "spelling"]
+license = "MIT"
+edition = "2018"
+
+[badges]
+azure-devops = { project = "crate-ci", pipeline = "typos" }
+codecov = { repository = "crate-ci/typos" }
+
+[features]
+default = ["all"]
+all = ["flags"]
+flags = ["enumflags2"]
+
+[dependencies]
+enumflags2 = { version = "0.6", optional = true }
--- a/crates/varcon/assets/.gitattributes
+++ b/crates/varcon/assets/.gitattributes
@ -0,0 +1 @@
+* linguist-vendored
--- a/crates/varcon/assets/README
+++ b/crates/varcon/assets/README
@ -0,0 +1,481 @@
+Variant Conversion Info (VarCon)
+
+Version 2019.10.06
+
+Copyright 2000-2016 by Kevin Atkinson (kevina@gnu.org) and Benjamin
+Titze (btitze@protonmail.ch).
+
+This package contains information to convert between American,
+British, Canadian, and Australian spellings and vocabulary as well as
+other variant information.
+
+The latest version can be found at http://wordlist.aspell.net/.
+
+The main data file is varcon.txt.  It contains information on the
+preferred American, British, and Canadian spelling of a word as well
+as other variant information.
+
+Each line contains a mapping between the various spellings of a word.
+Words are tagged to indicate where the spelling is used, and each
+word/tag pair is separated with a " / ".  For example in the line:
+  A Cv: acknowledgment / Av B C: acknowledgement
+"acknowledgment" and "acknowledgement" are two spellings of the same
+word and "A", "Cv", "B", etc are the tags.  Tags are separated by
+spaces and the group of tags is separated from the word with a ": ".
+Here, "acknowledgment" is the preferred American spelling (as
+indicated by the "A") of the word, and "acknowledgement" is the
+preferred Canadian and British spelling ("B" and "C").  However the
+American spelling is sometimes used in Canada (as indicated by "Cv",
+where the lowercase "v" indicates a variant form) and the British
+spelling is sometimes used in America (as indicated by the "Av").
+
+More generally each tag consists of a spelling category (for example
+"A") possibly followed by a variant indicator.  The spelling
+categories are as follows:
+  A: American
+  B: British "ise" spelling
+  Z: British "ize" spelling or OED preferred Spelling
+  C: Canadian
+  D: Australian
+  _: Other (Variant info based on American dictionaries, never used
+            with any of the above).
+and the variants tags are as follows:
+  .: equal
+  v: variant
+  V: seldom used variant
+  -: possible variant, should generally not be used
+  x: improper variant (should not use)
+
+The "." or equal variant tags are reserved for special cases when
+there is little agreement between dictionaries or when I think the
+dictionary is wrong.  The "v" indicator is used for most words marked
+as variants in the dictionary.  However, some variants will be demoted
+to a "V".  For example, if the variant is marked as "also" by
+Merriam-Webster, or also if only some dictionaries acknowledge the
+existence of the variant.  "-" is used when the variant is generally not
+listed in the dictionary but I could find some evidence of its use, or
+when it is marked as an archaic spelling for the word.  The "x"
+is used when the spelling is almost generally considered a
+misspelling, and is only included for completeness.
+
+For Australian English "v" was used for variants that are widely used,
+but not preferred, and "V" for all "-or" (vs. "-our") variants and 
+variants considered "chiefly US".
+
+If there are no tags with the 'Z' spelling category on the line then
+'B' implies 'Z'.  Similarly if there are no 'C' tags then 'Z' implies
+'C'.  If there are no 'D' tags then 'B' implies 'D'.
+
+For ease of reading and maintaining the data file, each line is
+grouped in a cluster of closely related words.  Each cluster is
+uniquely identified by a headword, which is generally the American
+spelling of the word on the first line of the cluster.  Each cluster is
+started with a '#' and is followed by the headword with some
+additional information after it.  For example the cluster for
+acknowledgment is:
+  # acknowledgment <verified> (level 35)
+  A Cv: acknowledgment / Av B C: acknowledgement
+  A Cv: acknowledgments / Av B C: acknowledgements
+  A Cv: acknowledgment's / Av B C: acknowledgement's
+The "<verified>" tag will be explained later, and "(level 35)"
+indicates what level in SCOWL (see http://wordlist.sourceforge.net)
+the headword is found in.  The levels generally mean the following:
+  <= 35: Very common word
+  <= 70: Can be found in the dictionary
+     80: Likely a valid word, can likely be found in an
+         unabridged dictionary
+   > 80: May not even be a legal word
+
+Sometimes the spelling of a word depends on the usage.  If so the word
+is listed more than once within a cluster, with any usage information
+being indicated after a " | ".  For example here is part of the cluster
+for prize:
+  A B: prize | reward
+  A B: prizes | reward
+  A C: prize / B: prise | otherwise
+  A C: prizes / B: prises | otherwise
+which indicates that the preferred spelling of prize is always with a
+"z" when meaning a reward, but otherwise is spelled with an "s" in
+British English.  In the example above a brief definition of the word
+is given, but often no such attempt is made, and the definition simply
+consists of a number, for example:
+  A B: sake | :1
+  A C: sake / Av B Cv: saki | :2
+
+Sometimes part-of-speech (POS) info is given to help distinguish which
+form is used.  For example:
+  A B C: practice / AV Cv: practise | <N>
+  A Cv: practice / AV B C: practise | <V>
+POS info is always given in the form "<POS>" and if a definition
+is also given the POS info is always first.  The POS tags used are as
+follows:
+  <N>: Noun
+  <V>: Verb
+  <Adj>: Adjective
+  <Adv>: Adverb
+
+A "(-)" before the definition indicates a rarely used or archaic form
+of a word, for example:
+  A B: bark | :1
+  A: bark / Av B: barque | (-) ship
+
+A "--" indicates a note rather than definition.  This is generally
+used to indicate that the spelling of the plural form does not depend on
+the spelling of the root word, for example:
+  _: cabby / _.: cabbie
+  _: cabbies | -- plural
+
+Misc. notes on a particular form of a word are given after a "#" on
+the same line.  Misc. notes for the cluster are given at the end of
+the cluster and are prefixed with "##", for example:
+  # coloration <verified> (level 50)
+  A B C: coloration / B. Cv: colouration
+  A B C: colorations / B. Cv: colourations
+  A B C: coloration's / B. Cv: colouration's
+  ## OED has coloration as the preferred spelling and discolouration as a
+  ## variant for British Engl or some reason
+In the notes ODE (not to be confused with OED) stands for Oxford
+Dictionary of English, "Ox" is used for any Oxford dictionary, and
+"M-W" for Merriam-Webster.
+
+Earlier versions of varcon contained numerous errors.  With version
+5.0 massive effort has been made to correct many of these errors.
+Clusters that have undergone some form of verification (and likely
+correction) are marked with "<verified>".  As of version 5.0, most
+clusters with headwords in common usage (SCOWL level 35 and
+below) should now be checked, as well as many others.  No effort was
+made to check clusters with headwords in SCOWL level 80 and above;
+many of those entries are unlikely to be in the dictionary anyway.
+
+The file variant-also.tab contains additional mappings between various
+spellings of a word which are not yet in varcon.txt.  No attempt is
+made to distinguish the primary form of a word.  The file
+variant-infl.tab is like variant-also.tab except that it is created
+automatically from the AGID inflection database.  The file
+variant-wroot.tab is like variant-infl.tab except that it also
+includes the root form of the word.
+
+The file voc.tab is similar to varcon.txt but converts between
+vocabulary instead of spelling.  Unlike varcon.txt it is a simple tab
+separated file with the fields corresponding to the American, British,
+and Canadian words.  If more than one word is often used to describe
+the same thing the words are separated with commas.  The last column
+contains additional notes on when the word is used.  Unlike varcon.txt
+it is generally not suitable for automatic conversion.
+
+The "make-variant" Perl script will combine varcon.txt,
+variant-also.tab, and variant-infl.tab into one huge mapping and will
+output the result to "variant.tab".  If the "no-infl" option is given
+then variant-infl.tab will not be included.
+
+The "split" script will split out the information in varcon.txt into
+several word lists named as follows:
+  <spelling>[-v<variant level>][-uncommon].lst
+where <spelling> is one of: american, british, british_z, canadian,
+common, or other.  "common" is used for words which appear in
+varcon.txt, yet are used in all versions of English, such as "prize",
+and "other" is used for the "_" spelling category.  The mapping from
+the variant indicators in varcon.txt to the numeric variant level is
+as follows:
+  v => 0
+  V => 1
+  - => 2
+"-uncommon" is used for forms marked with "(-)" as already described.
+
+The "translate" Perl script will translate a text file from one
+spelling to another. Its usage is:
+
+translate <options> [<translation array>] <from> <to>
+<options> is any of
+  -?,-h,--help this screen
+  -m,--mark     mark words where the translation is questionable
+  -i,--include  include words where the translation is questionable
+<translation array> is the file name of the translation array,
+                    defaults to "abbc.tab".
+<from> and <to> are one of: american, british, british_z, or canadian.
+british-ise and british-ize can also be used.
+
+Text is read in from standard input and is outputted to standard out.
+Words are marked with a '?' before and after the questionable word
+when the option is enabled.
+
+The file varcon.pm contains some library routines for parsing
+varcon.txt and is used by many of the scripts above.
+
+If you discover any errors in these mappings or have suggestions for
+additions please file a bug report at
+https://github.com/kevina/wordlist/issues, or alternatively email me
+directly at kevina@gnu.org, but I will likely tell you to file a bug
+report so that I don't forget about it.
+
+SOURCE:
+
+These mappings were compiled from numerous sources.
+
+The abc.tab was originally created from the American and British word
+lists found in the Ispell distribution and the Canadian word list
+created by Garst R. Reese <reese@isn.net>:
+
+  What I have discovered is that Canadian is a modification of British.
+  Canadians use ize ization, izing izable like Americans, and gram instead
+  of gramme. The one exception I found was practise. It does not go to
+  practize.  Otherwise they use British spelling. So, what I am currently
+  checking books with is a an edited version of British, where I have
+  changed all occurrences of ise to ize, isab to izab, isation to ization,
+  ising to izing, and gramme to gram except I allow programme, which is
+  sometimes proper unless you are talking about a computer program. I did
+  bunches of greps to be sure these substitutions would work as expected.
+
+Many other words have been added to abc.tab which were not in the
+original Ispell word lists.
+
+Many different web sources were consulted when creating the tables.  They
+include:
+
+  The American-British British-American Dictionary
+    http://www.peak.org/~jeremy/dictionary/dictionary.html
+    American and British Spelling Differences
+      http://www.peak.org/~jeremy/dictionary/spellcat.html
+  Dave (VE7CNV)'s Truly Canadian Dictionary of Canadian Spelling
+    http://www.luther.bc.ca/~dave7cnv/cdnspelling/cdnspelling.html
+  Canadian Spelling Convention
+    http://imej.wfu.edu/articles/1999/1/02/demo/tutorial/canas.html
+  Cornerstone's Canadian English Page
+    http://www.web.net/cornerstone/cdneng.htm
+  Inter-Play Translation: British/Canadian/American Spelling
+    http://www.inter-play.com/translation/spel-ukus.htm
+  Inter-Play Translation: British/Canadian/American Vocabulary
+    http://www.inter-play.com/translation/voc-ukus.htm
+
+As well as several online dictionaries:
+
+  Merriam-Webster: http://www.m-w.com/
+  American Heritage: http://www.bartleby.com/61/
+  Cambridge (ESL): http://dictionary.cambridge.org/
+
+In version 5.0 a massive effort to correct the numerous errors in
+VarCon was done.  The primary sources used for verification were:
+
+  Merriam-Webster: http://www.m-w.com/
+  Free version of Oxford Dictionaries Online: 
+    http://www.oxforddictionaries.com/
+  Oxford dictionaries available via Oxford Reference Online
+    (subscription service, http://www.oxfordreference.com/):
+    The New Oxford American Dictionary (2nd edition, 2006)
+      and sometimes: The Oxford American Dictionary of Current English (2002)
+    The Concise Oxford English Dictionary (11th edition revised, 2008)
+      and sometimes: The Oxford Dictionary of English (2nd edition revised, 2005)
+    The Canadian Oxford Dictionary (2004)
+
+I also used Tysto UK vs US spelling list available at:
+  http://www.tysto.com/articles05/q1/20050324uk-us.shtml
+to make sure I didn't leave out any information in VarCon, however any
+additions from his lists were verified using the dictionaries
+mentioned above as his lists contained numerous errors (such as
+including archaic spellings of words)
+
+I also made indirect use of Luke's Canadian, British and American
+Spelling page available at:
+  http://www.lukemastin.com/testing/spelling/cgi-bin/database.cgi?database=spelling
+but only to perform some initial verification, in the end I rechecked
+his data using the dictionaries above.  (However, his data is, by far,
+more accurate than Tysto's)
+    
+In Version 2016.11.20 Benjamin Titze added support for Australian English.
+The primary sources for this addition were:
+
+  The Macquarie Dictionary: https://www.macquariedictionary.com.au/
+  Style Manual: For Authors, Editors and Printers, 6th Edition. DCITA.
+  University of Technology Sydney Publications Style Guide:
+    http://www.gsu.uts.edu.au/publications/styleguide/spelling.html
+  Style Manual, Department of Treasury and Finance, Tasmania:
+    http://conference.tasa.org.au/wp-content/uploads/2015/03/Style-Manual.pdf
+  Editor Australia - Style Guide: 
+    http://www.editoraustralia.com/styleguide_spelling.html
+  Webster in Australia (history of "our"/"or" spelling variants): 
+    http://blogs.usyd.edu.au/elac/2008/01/webster_in_australia.html
+
+
+CHANGELOG:
+
+From 2017.08.24 to 2018.10.06
+
+   - Added entries for: eukaryote, prokaryote, virtualization, volcanism
+
+From 2016.11.20 to 2017.08.24
+
+   - Typo fixes thanks to Jakub Wilk
+
+From 2016.06.26 to 2016.11.20
+
+   - New Australian spelling category thanks to the work of Benjamin
+     Titze.
+
+   - Various other fixes.
+
+From 2016.01.19 to 2016.06.26
+
+   - Fix plural of "bus".
+
+From 2015.08.24 to 2016.01.19
+
+   - Undo the effects of PERL_UNICODE in the translate script.
+
+   - Other minor fixes and new entries.
+
+From 2015.02.15 to 2015.08.24 (Aug 24, 2015)
+
+   - Added entry for Koran/Koranic.
+
+   - Tweaked "adviser" cluster.
+
+   - Fix formatting problems.
+
+From 2015.01.28 to 2015.02.15 (February 15, 2015)
+
+   - Various new entries
+
+From 2014.11.17 to 2015.01.28 (January 28, 2015)
+
+   - Minor adjustments to a few entries (analytic, amid)
+
+   - Added entry for shareable
+
+   - Remove a junk entry (ted/taed).
+
+From 2014.08.11 to 2014.11.17 (November 17, 2014)
+
+   - Fix typos in README
+
+   - Enhancement to VarCon translate script.  It will now, by default,
+     filter clusters with a SCOWL level > 80.  This behavior can be
+     controlled with the new "--thresh" option.
+
+   - Remove a few junk entries.
+
+From Revision 5.1 to Version 2014.08.11 (August 8, 2014)
+
+   - Various corrections.  Most of them minor.  Two notable exceptions:
+
+       - Added an entry for furor as the correct British spelling is furore
+
+       - Fixed racket entries as Canadians still use racquet even
+         though it is a British English (at least according to the
+         Oxford dictionaries)
+
+   - Other minor changes.
+
+From Revision 5.0 to Revision 5.1 (January 6, 2010)
+
+   - Corrected numerous errors after running various forms
+     of verification on varcon.txt.
+
+   - Reordered the clusters in varcon.txt so that they are
+     mostly in alphabetic order based on the headword.
+  
+From Revision 4.1 to Revision 5.0 (December 27, 2010)
+
+  - Completely new format for the main table which, in addition to
+    providing the preferred spelling of a word for various forms of
+    English, also records variant and other information.  To reflect
+    this change, the name of the file was renamed from abbc.tab to
+    varcon.txt.
+
+  - Massive effort to verify the variant information against
+    authoritative sources (mainly Oxford dictionaries).  Most entries
+    for common words (SCOWL level 35 and below) have been checked
+    against at least a British and Canadian dictionary.
+
+  - Added variant information for numerous other words, even when
+    there is no difference between the various forms of English.
+
+  - Other changes corresponding to the new format.
+
+From Revision 4 to Revision 4.1 (August 10, 2004)
+
+  - Fixed various errors in abbc.tab
+
+  - Removed clause 4 from the Ispell copyright with permission of Geoff
+    Kuenning.
+
+From Revision 3 to Revision 4 (August 7, 2004)
+
+  - Added a column to "abc.tab" for the British "ize" spelling and
+    renamed the file to abbc.tab.
+  - Added verb forms of prize/prise to abbc.tab, removed from
+    variant-also.tab
+
+From Revision 2 to Revision 3 (January 2, 2003)
+
+  - Added an option for not including variant-infl.tab for the
+    make-variant perl script
+  - Added the file variant-wroot.tab
+  - Added a few entries given to me by Francis Bond and Edward Betts
+
+From Revision 1 to Revision 2 (January 27, 2001)
+
+  - Removed all "B" markers because I could not find any evidence for
+    them
+  - Corrected a few Canadian entries, especially those with the "B"
+    markers
+  - Added some more entries by trying fixed changes (such as ize to
+    ise) to words in SCOWL and hand-checking over the ones with semi-common
+    words in them. 
+  - Added variant-infl.tab
+
+COPYRIGHT:
+
+Copyright 2000-2018 by Kevin Atkinson
+
+Permission to use, copy, modify, distribute and sell this array, the
+associated software, and its documentation for any purpose is hereby
+granted without fee, provided that the above copyright notice appears
+in all copies and that both that copyright notice and this permission
+notice appear in supporting documentation. Kevin Atkinson makes no
+representations about the suitability of this array for any
+purpose. It is provided "as is" without express or implied warranty.
+
+Copyright 2016 by Benjamin Titze
+
+Permission to use, copy, modify, distribute and sell this array, the
+associated software, and its documentation for any purpose is hereby
+granted without fee, provided that the above copyright notice appears
+in all copies and that both that copyright notice and this permission
+notice appear in supporting documentation. Benjamin Titze makes no
+representations about the suitability of this array for any
+purpose. It is provided "as is" without express or implied warranty.
+
+Since the original words lists come from the Ispell distribution:
+
+Copyright 1993, Geoff Kuenning, Granada Hills, CA
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. All modifications to the source code must be clearly marked as
+   such.  Binary redistributions based on modified source code
+   must be clearly marked as modified versions in the documentation
+   and/or other materials provided with the distribution.
+(clause 4 removed with permission from Geoff Kuenning)
+5. The name of Geoff Kuenning may not be used to endorse or promote
+   products derived from this software without specific prior
+   written permission.
+
+THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
--- a/crates/varcon/assets/varcon.txt
+++ b/crates/varcon/assets/varcon.txt
--- a/crates/varcon/codegen/Cargo.toml
+++ b/crates/varcon/codegen/Cargo.toml
@ -0,0 +1,20 @@
+[package]
+name = "varcon-codegen"
+version = "1.0.2"
+authors = ["Ed Page <eopage@gmail.com>"]
+description = "Source Code Spelling Correction"
+repository = "https://github.com/crate-ci/typos"
+readme = "../../../README.md"
+categories = ["text-processing"]
+license = "MIT"
+edition = "2018"
+publish = false
+
+[badges]
+azure-devops = { project = "crate-ci", pipeline = "typos" }
+codecov = { repository = "crate-ci/typos" }
+
+[dependencies]
+codegenrs = "0.1"
+structopt = "0.3"
+varcon-parser = { version = "1.0", path = "../../varcon-parser" }
--- a/crates/varcon/codegen/src/main.rs
+++ b/crates/varcon/codegen/src/main.rs
@ -0,0 +1,99 @@
+use structopt::StructOpt;
+
+/// Raw bytes of `assets/varcon.txt`, embedded into the binary at compile time.
+const DICT: &[u8] = include_bytes!("../../assets/varcon.txt");
+
+/// Emits the Rust source for the `VARCON` table into `file`.
+///
+/// The embedded dictionary bytes are decoded (lossily) as UTF-8 and handed to
+/// `varcon_parser::ClusterIter`; every cluster it yields is rendered as a
+/// `Cluster { .. }` literal inside a `pub static VARCON` slice.  The text is
+/// written without any attempt at pretty layout; `run` reformats it afterwards.
+///
+/// # Panics
+///
+/// Panics if any write to `file` fails.
+fn generate<W>(file: &mut W)
+where
+    W: std::io::Write,
+{
+    let text = String::from_utf8_lossy(DICT);
+    let parsed = varcon_parser::ClusterIter::new(&text);
+
+    // File preamble: provenance comment, lint allowance and the crate import.
+    writeln!(
+        file,
+        "// This file is code-genned by {}",
+        env!("CARGO_PKG_NAME")
+    )
+    .unwrap();
+    writeln!(file, "#![allow(clippy::unreadable_literal)]").unwrap();
+    writeln!(file).unwrap();
+    writeln!(file, "use crate::*;").unwrap();
+    writeln!(file).unwrap();
+
+    writeln!(file, "pub static VARCON: &'static [Cluster] = &[").unwrap();
+    for mut group in parsed {
+        // NOTE(review): `infer` is defined by varcon-parser; it is invoked on
+        // each cluster before anything is emitted for it.
+        group.infer();
+        writeln!(file, "Cluster {{").unwrap();
+        writeln!(file, "  header: {:?},", group.header).unwrap();
+        writeln!(file, "  entries: &[").unwrap();
+        for record in &group.entries {
+            writeln!(file, "  Entry {{").unwrap();
+            writeln!(file, "    variants: &[").unwrap();
+            for spelling in &record.variants {
+                writeln!(file, "      Variant {{").unwrap();
+                writeln!(file, "        word: {:?},", spelling.word).unwrap();
+                writeln!(file, "        types: &[").unwrap();
+                for kind in &spelling.types {
+                    write!(file, "          Type {{").unwrap();
+                    write!(file, "category: Category::{:?}, ", kind.category).unwrap();
+                    // A present tag needs the `Tag::` path prefix that the plain
+                    // `Debug` output of the `Option` would not contain.
+                    match kind.tag {
+                        Some(tag) => write!(file, "tag: Some(Tag::{:?}), ", tag).unwrap(),
+                        None => write!(file, "tag: {:?}, ", kind.tag).unwrap(),
+                    }
+                    write!(file, "num: {:?},", kind.num).unwrap();
+                    writeln!(file, "}},").unwrap();
+                }
+                writeln!(file, "        ],").unwrap();
+                writeln!(file, "      }},").unwrap();
+            }
+            writeln!(file, "  ],").unwrap();
+            // Same path-prefix concern as for the tag above.
+            match record.pos {
+                Some(pos) => write!(file, "  pos: Some(Pos::{:?}),", pos).unwrap(),
+                None => write!(file, "  pos: {:?},", record.pos).unwrap(),
+            }
+            writeln!(
+                file,
+                " archaic: {:?}, note: {:?},",
+                record.archaic, record.note
+            )
+            .unwrap();
+            writeln!(file, "  description: {:?},", record.description).unwrap();
+            writeln!(file, "  comment: {:?},", record.comment).unwrap();
+            writeln!(file, "  }},").unwrap();
+        }
+        writeln!(file, "  ],").unwrap();
+        writeln!(file, "  notes: &[").unwrap();
+        for remark in &group.notes {
+            writeln!(file, "    {:?},", remark).unwrap();
+        }
+        writeln!(file, "  ],").unwrap();
+        writeln!(file, "  }},").unwrap();
+    }
+    writeln!(file, "];").unwrap();
+}
+
+// Command-line options of the code generator.
+//
+// Plain `//` comments are used here on purpose: doc comments on a `StructOpt`
+// type are picked up by the derive and would alter the generated help output.
+#[derive(Debug, StructOpt)]
+#[structopt(rename_all = "kebab-case")]
+struct Options {
+    // Arguments provided by `codegenrs`; `run` uses them to write the output.
+    #[structopt(flatten)]
+    codegen: codegenrs::CodeGenArgs,
+    // Arguments provided by `codegenrs`; `run` uses them to reformat the output.
+    // NOTE(review): the field name looks like a transposition of `rustfmt`.
+    #[structopt(flatten)]
+    rustmft: codegenrs::RustfmtArgs,
+}
+
+/// Generates the table source, reformats it and writes it out as directed by
+/// the command-line options.
+///
+/// Returns the process exit code (always `0` on success); any failure while
+/// decoding, reformatting or writing is propagated as an error.
+fn run() -> Result<i32, Box<dyn std::error::Error>> {
+    let options = Options::from_args();
+
+    // Render into memory first so the text can be reformatted before writing.
+    let mut buffer = Vec::new();
+    generate(&mut buffer);
+
+    let generated = String::from_utf8(buffer)?;
+    let formatted = options.rustmft.reformat(&generated)?;
+    options.codegen.write_str(&formatted)?;
+
+    Ok(0)
+}
+
+/// Entry point: runs the generator and exits with the code it returns.
+///
+/// Panics if `run` reports an error.
+fn main() {
+    std::process::exit(run().unwrap());
+}
--- a/crates/varcon/src/codegen.rs
+++ b/crates/varcon/src/codegen.rs
--- a/crates/varcon/src/lib.rs
+++ b/crates/varcon/src/lib.rs
@ -0,0 +1,75 @@
+mod codegen;
+
+pub use codegen::*;
+
+/// A cluster of closely related words, as emitted into the generated `VARCON` table.
+///
+/// NOTE(review): field meanings are inferred from the VarCon README's format
+/// description; confirm against `varcon-parser`.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Cluster {
+    /// Header of the cluster, when one is present.
+    pub header: Option<&'static str>,
+    /// The entries that make up this cluster.
+    pub entries: &'static [Entry],
+    /// Notes attached to the cluster as a whole.
+    pub notes: &'static [&'static str],
+}
+
+/// One entry of a [`Cluster`]: a set of spelling variants plus optional usage information.
+///
+/// NOTE(review): field meanings are inferred from the VarCon README's format
+/// description; confirm against `varcon-parser`.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Entry {
+    /// The tagged spellings listed for this entry.
+    pub variants: &'static [Variant],
+    /// Part-of-speech restriction, if one was given.
+    pub pos: Option<Pos>,
+    /// Presumably set for forms marked `(-)` (rarely used or archaic) — TODO confirm.
+    pub archaic: bool,
+    /// Presumably set when the trailing text is a `--` note rather than a
+    /// definition — TODO confirm.
+    pub note: bool,
+    /// Optional description text for the entry.
+    pub description: Option<&'static str>,
+    /// Optional comment text for the entry.
+    pub comment: Option<&'static str>,
+}
+
+/// A single spelling together with the tags describing where it is used.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Variant {
+    /// The tags attached to `word`.
+    pub types: &'static [Type],
+    /// The spelling itself.
+    pub word: &'static str,
+}
+
+/// One tag on a [`Variant`]: a spelling category with an optional variant indicator.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Type {
+    /// The spelling category the tag refers to.
+    pub category: Category,
+    /// The variant indicator, if the tag carries one.
+    pub tag: Option<Tag>,
+    /// Optional number attached to the tag.
+    /// NOTE(review): its meaning is not described in the VarCon README — confirm
+    /// against `varcon-parser`.
+    pub num: Option<usize>,
+}
+
+/// Spelling category of a tag.
+///
+/// Every discriminant is a distinct single bit; with the `flags` feature enabled
+/// the enum additionally derives `enumflags2::BitFlags`.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+#[cfg_attr(feature = "flags", derive(enumflags2::BitFlags))]
+#[repr(u8)]
+pub enum Category {
+    /// American spelling (`A` in the VarCon README).
+    American = 0x01,
+    /// British "ise" spelling (`B` in the VarCon README).
+    BritishIse = 0x02,
+    /// British "ize" spelling or OED preferred spelling (`Z` in the VarCon README).
+    BritishIze = 0x04,
+    /// Canadian spelling (`C` in the VarCon README).
+    Canadian = 0x08,
+    /// Australian spelling (`D` in the VarCon README).
+    Australian = 0x10,
+    /// Other variant information (`_` in the VarCon README).
+    Other = 0x20,
+}
+
+/// A set of [`Category`] values; only available with the `flags` feature.
+#[cfg(feature = "flags")]
+pub type CategorySet = enumflags2::BitFlags<Category>;
+
+/// Variant indicator of a tag.
+///
+/// Every discriminant is a distinct single bit; with the `flags` feature enabled
+/// the enum additionally derives `enumflags2::BitFlags`.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+#[cfg_attr(feature = "flags", derive(enumflags2::BitFlags))]
+#[repr(u8)]
+pub enum Tag {
+    /// Equal (`.` in the VarCon README).
+    Eq = 0x01,
+    /// Variant (`v` in the VarCon README).
+    Variant = 0x02,
+    /// Seldom used variant (`V` in the VarCon README).
+    Seldom = 0x04,
+    /// Possible variant that should generally not be used (`-` in the VarCon README).
+    Possible = 0x08,
+    /// Improper variant that should not be used (`x` in the VarCon README).
+    Improper = 0x10,
+}
+
+/// A set of [`Tag`] values; only available with the `flags` feature.
+#[cfg(feature = "flags")]
+pub type TagSet = enumflags2::BitFlags<Tag>;
+
+/// Part of speech attached to an entry.
+///
+/// Every discriminant is a distinct single bit; with the `flags` feature enabled
+/// the enum additionally derives `enumflags2::BitFlags`.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+#[cfg_attr(feature = "flags", derive(enumflags2::BitFlags))]
+#[repr(u8)]
+pub enum Pos {
+    /// Noun (`<N>` in the VarCon README).
+    Noun = 0x01,
+    /// Verb (`<V>` in the VarCon README).
+    Verb = 0x02,
+    /// Adjective (`<Adj>` in the VarCon README).
+    Adjective = 0x04,
+    /// Adverb (`<Adv>` in the VarCon README).
+    Adverb = 0x08,
+}
+
+/// A set of [`Pos`] values; only available with the `flags` feature.
+#[cfg(feature = "flags")]
+pub type PosSet = enumflags2::BitFlags<Pos>;