typos/crates/typos-vars/tests/codegen.rs

281 lines
8 KiB
Rust
Raw Normal View History

use std::collections::BTreeMap;
use std::collections::HashSet;
use std::io::Write;
2022-08-01 15:45:58 -04:00
/// Regenerates the variant tables in memory, passes them through
/// `codegenrs::rustfmt`, and compares the result against the checked-in
/// `../src/vars_codegen.rs` snapshot.
///
/// Panics (failing the test) if the generated bytes are not UTF-8, if
/// formatting fails, or if the snapshot comparison fails.
#[test]
fn codegen() {
    let mut content = vec![];
    generate_variations(&mut content);
    let content = String::from_utf8(content).unwrap();
    let content = codegenrs::rustfmt(&content, None).unwrap();
    snapbox::assert_data_eq!(content, snapbox::file!["../src/vars_codegen.rs"].raw());
}
/// Categories that get their own slot in the generated `VariantsMap`.
///
/// The position of a category in this array is the index emitted for it in
/// the generated `corrections()` match arms and the slot order of every
/// generated `VariantsMap` static, so the order here must stay in sync with
/// the checked-in generated file.
static CATEGORIES: [varcon::Category; 4] = [
varcon::Category::American,
varcon::Category::BritishIse,
// For now, only want to support one form of British, so going with -ise as it seems more
// popular (which is why `BritishIze` is not listed).
varcon::Category::Canadian,
varcon::Category::Australian,
// `Other` is not listed either: it basically means all of the categories above.
];
2024-04-26 22:14:01 -04:00
fn generate_variations<W: Write>(file: &mut W) {
let entries = entries();
2022-09-01 08:15:42 -04:00
writeln!(
file,
"// This file is @generated by {}",
file!().replace('\\', "/")
)
.unwrap();
writeln!(
file,
"#![allow(clippy::unreadable_literal, clippy::type_complexity)]",
)
.unwrap();
writeln!(file).unwrap();
writeln!(file, "pub type Variants = &'static [&'static str];",).unwrap();
writeln!(
file,
"pub type VariantsMap = [Variants; {}];",
CATEGORIES.len()
)
.unwrap();
writeln!(file).unwrap();
writeln!(file, "pub fn all_categories() -> crate::CategorySet {{",).unwrap();
writeln!(
file,
" {}",
itertools::join(
2024-07-26 17:08:02 -04:00
CATEGORIES.iter().map(|c| format!("crate::Category::{c:?}")),
" | "
)
)
.unwrap();
writeln!(file, "}}",).unwrap();
writeln!(file).unwrap();
writeln!(
file,
"pub fn corrections(category: crate::Category, options: VariantsMap) -> &'static [&'static str] {{",
)
.unwrap();
writeln!(file, " match category {{").unwrap();
for (index, category) in CATEGORIES.iter().enumerate() {
writeln!(
file,
2024-07-26 17:08:02 -04:00
" crate::Category::{category:?} => options[{index}],"
)
.unwrap();
}
writeln!(
file,
" crate::Category::BritishIze | crate::Category::Other => unreachable!(\"{{:?}} is unused\", category),",
)
.unwrap();
writeln!(file, " }}").unwrap();
writeln!(file, "}}").unwrap();
writeln!(file).unwrap();
let entry_sets = entry_sets(entries.iter());
let mut referenced_symbols: HashSet<&str> = HashSet::new();
dictgen::generate_trie(
file,
"VARS",
"&[(u8, &VariantsMap)]",
2024-04-26 22:14:01 -04:00
entry_sets.iter().filter_map(|kv| {
let (word, data) = kv;
if is_always_valid(data) {
// No need to convert from current form to target form
None
} else {
referenced_symbols.extend(data.iter().map(|(s, _)| s));
2021-11-08 12:36:05 -05:00
let value = generate_link(data);
Some((*word, value))
}
}),
64,
)
.unwrap();
let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data));
writeln!(file).unwrap();
2024-07-26 17:08:02 -04:00
writeln!(file, "pub const NO_INVALID: bool = {no_invalid:?};",).unwrap();
writeln!(file).unwrap();
for (symbol, entry) in entries.iter() {
if !referenced_symbols.contains(symbol.as_str()) {
continue;
}
generate_entry(file, symbol, entry);
}
}
2024-04-26 22:14:01 -04:00
fn generate_entry(file: &mut impl Write, symbol: &str, entry: &varcon_core::Entry) {
2024-07-26 17:08:02 -04:00
writeln!(file, "pub(crate) static {symbol}: VariantsMap = [").unwrap();
for category in &CATEGORIES {
let corrections = collect_correct(entry, *category);
let mut corrections: Vec<_> = corrections.iter().collect();
corrections.sort_unstable();
writeln!(file, " &[").unwrap();
for correction in &corrections {
2024-07-26 17:08:02 -04:00
writeln!(file, " {correction:?},").unwrap();
}
writeln!(file, " ],").unwrap();
}
writeln!(file, "];").unwrap();
writeln!(file).unwrap();
}
fn generate_link(data: &[(&str, varcon::CategorySet)]) -> String {
let mut output = Vec::new();
write!(output, "&[").unwrap();
for (symbol, set) in data.iter() {
write!(output, "(0b{:05b}, &{}), ", set.bits(), symbol).unwrap();
}
write!(output, "]").unwrap();
String::from_utf8(output).unwrap()
}
/// Returns `true` when at least one element of `data` carries a category set
/// equal to the full set returned by `valid_categories()`.
fn is_always_valid(data: &[(&str, varcon::CategorySet)]) -> bool {
    let every_category = valid_categories();
    data.iter().any(|(_symbol, set)| *set == every_category)
}
/// Returns `true` when at least one element of `data` carries an empty
/// category set.
fn is_always_invalid(data: &[(&str, varcon::CategorySet)]) -> bool {
    data.iter().any(|(_symbol, set)| set.is_empty())
}
fn entries() -> BTreeMap<String, varcon_core::Entry> {
varcon::VARCON
.iter()
2024-08-23 15:03:04 -04:00
.filter(|c| c.verified)
.flat_map(|c| c.entries.iter())
.filter(|e| {
e.variants
.iter()
2021-11-08 12:36:05 -05:00
.all(|v| typos::tokens::Word::new(v.word, 0).is_ok())
})
.map(|e| {
let mut e = e.into_owned();
for variant in e.variants.iter_mut() {
variant.word.make_ascii_lowercase();
}
(entry_symbol(&e), e)
})
.collect()
}
/// Builds the identifier used for an entry's generated static:
/// `ENTRY_<FIRST VARIANT, UPPERCASED>_<hash of the entry>`.
///
/// Panics if `entry` has no variants.
fn entry_symbol(entry: &varcon_core::Entry) -> String {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    std::hash::Hash::hash(entry, &mut state);
    let digest = std::hash::Hasher::finish(&state);

    let head = entry.variants[0].word.to_ascii_uppercase();
    format!("ENTRY_{head}_{digest}")
}
/// Inverts `entries` into a word-keyed map: every word found in any entry
/// maps to the list of `(entry symbol, category set)` pairs, as computed by
/// `entry_set`, for the entries that contain it.
fn entry_sets<'e>(
    entries: impl Iterator<Item = (&'e String, &'e varcon_core::Entry)>,
) -> BTreeMap<&'e str, Vec<(&'e str, varcon::CategorySet)>> {
    let mut by_word: BTreeMap<&'e str, Vec<(&'e str, varcon::CategorySet)>> = BTreeMap::new();
    for (symbol, entry) in entries {
        for (word, categories) in entry_set(entry) {
            by_word
                .entry(word)
                .or_default()
                .push((symbol.as_str(), categories));
        }
    }
    by_word
}
/// Maps each variant word of `entry` to the set of categories it is tagged
/// with.
///
/// `Other` expands to every category in `valid_categories()`, and
/// `BritishIze` is skipped. A word that only carries skipped categories
/// still appears, with an empty set.
fn entry_set(entry: &varcon_core::Entry) -> BTreeMap<&str, varcon::CategorySet> {
    let every_category = valid_categories();
    let mut by_word = BTreeMap::new();
    for variant in entry.variants.iter() {
        let categories = by_word
            .entry(variant.word.as_str())
            .or_insert_with(varcon::CategorySet::empty);
        for category in variant.types.iter().map(|t| t.category) {
            if category == varcon::Category::Other {
                *categories |= every_category;
            } else if category != varcon::Category::BritishIze {
                categories.insert(category);
            }
        }
    }
    by_word
}
/// Returns the category set containing exactly the members of `CATEGORIES`.
fn valid_categories() -> varcon::CategorySet {
    CATEGORIES
        .iter()
        .fold(varcon::CategorySet::empty(), |mut all, category| {
            all.insert(*category);
            all
        })
}
/// Picks the spellings of `entry` that count as correct for `category`.
///
/// Only type annotations for `category` itself or for `Other` are considered,
/// and variants rejected by `ignore_variant` are skipped. A missing tag is
/// treated as `Eq`. If exactly one word is tagged `Eq`, that word alone is
/// returned; otherwise (ambiguity or none) every word whose tag is not
/// `Improper` is returned.
fn collect_correct(entry: &varcon_core::Entry, category: varcon::Category) -> HashSet<&str> {
    let mut exact = HashSet::new();
    let mut acceptable = HashSet::new();

    for variant in entry.variants.iter() {
        if ignore_variant(variant) {
            continue;
        }
        let word = variant.word.as_str();
        for t in variant.types.iter() {
            if t.category != category && t.category != varcon::Category::Other {
                continue;
            }
            let tag = t.tag.unwrap_or(varcon::Tag::Eq);
            if tag == varcon::Tag::Eq {
                exact.insert(word);
            }
            if tag != varcon::Tag::Improper {
                acceptable.insert(word);
            }
        }
    }

    // If there is ambiguity, fall back to all potential options.
    match exact.len() {
        1 => exact,
        _ => acceptable,
    }
}
/// Returns `true` for the one special-cased variant that is excluded from
/// corrections: the word "anesthetisation" when its only type annotation is
/// Australian with a `Variant` or `Seldom` tag.
fn ignore_variant(variant: &varcon_core::Variant) -> bool {
    if variant.word != "anesthetisation" || variant.types.len() != 1 {
        return false;
    }
    let only = &variant.types[0];
    only.category == varcon::Category::Australian
        && [Some(varcon::Tag::Variant), Some(varcon::Tag::Seldom)].contains(&only.tag)
}
// dict needs
// all words, with bitflags, pointing to list of entry names
//
// varcon needs
// all entries by name