2020-05-27 20:46:41 -05:00
|
|
|
use std::collections::BTreeMap;
|
|
|
|
use std::collections::HashSet;
|
|
|
|
use std::io::Write;
|
|
|
|
|
2022-08-01 14:45:58 -05:00
|
|
|
/// Regenerates the variations table in memory, formats it with rustfmt, and
/// compares it against the checked-in `../src/vars_codegen.rs` snapshot.
#[test]
fn codegen() {
    let mut buffer = Vec::new();
    generate_variations(&mut buffer);

    let generated = String::from_utf8(buffer).unwrap();
    let formatted = codegenrs::rustfmt(&generated, None).unwrap();

    snapbox::assert_data_eq!(formatted, snapbox::file!["../src/vars_codegen.rs"].raw());
}
|
2020-05-27 20:46:41 -05:00
|
|
|
|
|
|
|
static CATEGORIES: [varcon::Category; 4] = [
|
|
|
|
varcon::Category::American,
|
|
|
|
varcon::Category::BritishIse,
|
|
|
|
// For now, only want to support one form of British, so going with -ise as it seems more
|
|
|
|
// popular.
|
|
|
|
varcon::Category::Canadian,
|
|
|
|
varcon::Category::Australian,
|
|
|
|
// Other basically means all
|
|
|
|
];
|
|
|
|
|
2024-04-26 21:14:01 -05:00
|
|
|
fn generate_variations<W: Write>(file: &mut W) {
|
2020-05-27 20:46:41 -05:00
|
|
|
let entries = entries();
|
|
|
|
|
2022-09-01 07:15:42 -05:00
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
"// This file is @generated by {}",
|
|
|
|
file!().replace('\\', "/")
|
|
|
|
)
|
|
|
|
.unwrap();
|
2021-06-30 16:03:09 -05:00
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
"#![allow(clippy::unreadable_literal, clippy::type_complexity)]",
|
|
|
|
)
|
|
|
|
.unwrap();
|
2020-05-27 20:46:41 -05:00
|
|
|
writeln!(file).unwrap();
|
|
|
|
|
|
|
|
writeln!(file, "pub type Variants = &'static [&'static str];",).unwrap();
|
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
"pub type VariantsMap = [Variants; {}];",
|
|
|
|
CATEGORIES.len()
|
|
|
|
)
|
|
|
|
.unwrap();
|
|
|
|
writeln!(file).unwrap();
|
|
|
|
|
|
|
|
writeln!(file, "pub fn all_categories() -> crate::CategorySet {{",).unwrap();
|
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
" {}",
|
|
|
|
itertools::join(
|
2024-07-26 16:08:02 -05:00
|
|
|
CATEGORIES.iter().map(|c| format!("crate::Category::{c:?}")),
|
2020-05-27 20:46:41 -05:00
|
|
|
" | "
|
|
|
|
)
|
|
|
|
)
|
|
|
|
.unwrap();
|
|
|
|
writeln!(file, "}}",).unwrap();
|
|
|
|
writeln!(file).unwrap();
|
|
|
|
|
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
"pub fn corrections(category: crate::Category, options: VariantsMap) -> &'static [&'static str] {{",
|
|
|
|
)
|
|
|
|
.unwrap();
|
|
|
|
writeln!(file, " match category {{").unwrap();
|
|
|
|
for (index, category) in CATEGORIES.iter().enumerate() {
|
|
|
|
writeln!(
|
|
|
|
file,
|
2024-07-26 16:08:02 -05:00
|
|
|
" crate::Category::{category:?} => options[{index}],"
|
2020-05-27 20:46:41 -05:00
|
|
|
)
|
|
|
|
.unwrap();
|
|
|
|
}
|
|
|
|
writeln!(
|
|
|
|
file,
|
|
|
|
" crate::Category::BritishIze | crate::Category::Other => unreachable!(\"{{:?}} is unused\", category),",
|
|
|
|
)
|
|
|
|
.unwrap();
|
|
|
|
writeln!(file, " }}").unwrap();
|
|
|
|
writeln!(file, "}}").unwrap();
|
|
|
|
writeln!(file).unwrap();
|
|
|
|
|
|
|
|
let entry_sets = entry_sets(entries.iter());
|
|
|
|
let mut referenced_symbols: HashSet<&str> = HashSet::new();
|
2021-06-30 16:03:09 -05:00
|
|
|
dictgen::generate_trie(
|
2021-04-30 21:16:04 -05:00
|
|
|
file,
|
2021-06-30 16:03:09 -05:00
|
|
|
"VARS",
|
2021-06-30 10:12:17 -05:00
|
|
|
"&[(u8, &VariantsMap)]",
|
2024-04-26 21:14:01 -05:00
|
|
|
entry_sets.iter().filter_map(|kv| {
|
2021-06-30 10:12:17 -05:00
|
|
|
let (word, data) = kv;
|
|
|
|
if is_always_valid(data) {
|
|
|
|
// No need to convert from current form to target form
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
referenced_symbols.extend(data.iter().map(|(s, _)| s));
|
2021-11-08 11:36:05 -06:00
|
|
|
let value = generate_link(data);
|
2021-06-30 10:12:17 -05:00
|
|
|
Some((*word, value))
|
|
|
|
}
|
|
|
|
}),
|
2021-06-30 16:03:09 -05:00
|
|
|
64,
|
2021-04-30 21:16:04 -05:00
|
|
|
)
|
|
|
|
.unwrap();
|
2020-11-10 20:50:10 -06:00
|
|
|
|
2021-06-30 10:12:17 -05:00
|
|
|
let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data));
|
2021-05-18 21:05:18 -05:00
|
|
|
writeln!(file).unwrap();
|
2024-07-26 16:08:02 -05:00
|
|
|
writeln!(file, "pub const NO_INVALID: bool = {no_invalid:?};",).unwrap();
|
2021-05-18 21:05:18 -05:00
|
|
|
|
|
|
|
writeln!(file).unwrap();
|
2020-05-27 20:46:41 -05:00
|
|
|
for (symbol, entry) in entries.iter() {
|
|
|
|
if !referenced_symbols.contains(symbol.as_str()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
generate_entry(file, symbol, entry);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-04-26 21:14:01 -05:00
|
|
|
fn generate_entry(file: &mut impl Write, symbol: &str, entry: &varcon_core::Entry) {
|
2024-07-26 16:08:02 -05:00
|
|
|
writeln!(file, "pub(crate) static {symbol}: VariantsMap = [").unwrap();
|
2020-05-27 20:46:41 -05:00
|
|
|
for category in &CATEGORIES {
|
|
|
|
let corrections = collect_correct(entry, *category);
|
|
|
|
let mut corrections: Vec<_> = corrections.iter().collect();
|
|
|
|
corrections.sort_unstable();
|
|
|
|
writeln!(file, " &[").unwrap();
|
|
|
|
for correction in &corrections {
|
2024-07-26 16:08:02 -05:00
|
|
|
writeln!(file, " {correction:?},").unwrap();
|
2020-05-27 20:46:41 -05:00
|
|
|
}
|
|
|
|
writeln!(file, " ],").unwrap();
|
|
|
|
}
|
|
|
|
writeln!(file, "];").unwrap();
|
|
|
|
writeln!(file).unwrap();
|
|
|
|
}
|
|
|
|
|
|
|
|
fn generate_link(data: &[(&str, varcon::CategorySet)]) -> String {
|
|
|
|
let mut output = Vec::new();
|
|
|
|
|
|
|
|
write!(output, "&[").unwrap();
|
|
|
|
for (symbol, set) in data.iter() {
|
|
|
|
write!(output, "(0b{:05b}, &{}), ", set.bits(), symbol).unwrap();
|
|
|
|
}
|
|
|
|
write!(output, "]").unwrap();
|
|
|
|
|
|
|
|
String::from_utf8(output).unwrap()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn is_always_valid(data: &[(&str, varcon::CategorySet)]) -> bool {
|
|
|
|
let valid_categories = valid_categories();
|
|
|
|
for (_symbol, set) in data.iter() {
|
|
|
|
if *set == valid_categories {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
false
|
|
|
|
}
|
|
|
|
|
2021-05-18 21:05:18 -05:00
|
|
|
fn is_always_invalid(data: &[(&str, varcon::CategorySet)]) -> bool {
|
|
|
|
for (_symbol, set) in data.iter() {
|
|
|
|
if set.is_empty() {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
false
|
|
|
|
}
|
|
|
|
|
2020-05-27 20:46:41 -05:00
|
|
|
fn entries() -> BTreeMap<String, varcon_core::Entry> {
|
|
|
|
varcon::VARCON
|
|
|
|
.iter()
|
2024-08-23 14:03:04 -05:00
|
|
|
.filter(|c| c.verified)
|
2020-05-27 20:46:41 -05:00
|
|
|
.flat_map(|c| c.entries.iter())
|
|
|
|
.filter(|e| {
|
|
|
|
e.variants
|
|
|
|
.iter()
|
2021-11-08 11:36:05 -06:00
|
|
|
.all(|v| typos::tokens::Word::new(v.word, 0).is_ok())
|
2020-05-27 20:46:41 -05:00
|
|
|
})
|
|
|
|
.map(|e| {
|
|
|
|
let mut e = e.into_owned();
|
|
|
|
for variant in e.variants.iter_mut() {
|
|
|
|
variant.word.make_ascii_lowercase();
|
|
|
|
}
|
|
|
|
(entry_symbol(&e), e)
|
|
|
|
})
|
|
|
|
.collect()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn entry_symbol(entry: &varcon_core::Entry) -> String {
|
|
|
|
let mut hasher = std::collections::hash_map::DefaultHasher::new();
|
|
|
|
std::hash::Hash::hash(entry, &mut hasher);
|
|
|
|
let hash = std::hash::Hasher::finish(&hasher);
|
|
|
|
format!(
|
|
|
|
"ENTRY_{}_{}",
|
|
|
|
entry.variants[0].word.to_ascii_uppercase(),
|
|
|
|
hash
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn entry_sets<'e>(
|
|
|
|
entries: impl Iterator<Item = (&'e String, &'e varcon_core::Entry)>,
|
|
|
|
) -> BTreeMap<&'e str, Vec<(&'e str, varcon::CategorySet)>> {
|
|
|
|
let mut sets = BTreeMap::new();
|
|
|
|
for (symbol, entry) in entries {
|
|
|
|
for (word, set) in entry_set(entry).iter() {
|
|
|
|
let v = sets.entry(*word).or_insert_with(Vec::new);
|
|
|
|
v.push((symbol.as_str(), *set));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sets
|
|
|
|
}
|
|
|
|
|
|
|
|
fn entry_set(entry: &varcon_core::Entry) -> BTreeMap<&str, varcon::CategorySet> {
|
|
|
|
let mut sets = BTreeMap::new();
|
|
|
|
let valid_categories = valid_categories();
|
|
|
|
for variant in entry.variants.iter() {
|
|
|
|
let set = sets
|
|
|
|
.entry(variant.word.as_str())
|
|
|
|
.or_insert_with(varcon::CategorySet::empty);
|
|
|
|
for t in variant.types.iter() {
|
|
|
|
match t.category {
|
|
|
|
varcon::Category::Other => *set |= valid_categories,
|
|
|
|
varcon::Category::BritishIze => (),
|
|
|
|
_ => set.insert(t.category),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sets
|
|
|
|
}
|
|
|
|
|
|
|
|
fn valid_categories() -> varcon::CategorySet {
|
|
|
|
let mut c = varcon::CategorySet::empty();
|
|
|
|
for cat in CATEGORIES.iter() {
|
|
|
|
c.insert(*cat);
|
|
|
|
}
|
|
|
|
c
|
|
|
|
}
|
|
|
|
|
|
|
|
fn collect_correct(entry: &varcon_core::Entry, category: varcon::Category) -> HashSet<&str> {
|
|
|
|
// If there is ambiguity, collect all potential options.
|
|
|
|
let mut primary = HashSet::new();
|
|
|
|
let mut backup = HashSet::new();
|
|
|
|
for variant in entry.variants.iter().filter(|v| !ignore_variant(v)) {
|
|
|
|
for t in variant
|
|
|
|
.types
|
|
|
|
.iter()
|
|
|
|
.filter(|t| t.category == category || t.category == varcon::Category::Other)
|
|
|
|
{
|
|
|
|
let tag = t.tag.unwrap_or(varcon::Tag::Eq);
|
|
|
|
if tag == varcon::Tag::Eq {
|
|
|
|
primary.insert(variant.word.as_str());
|
|
|
|
}
|
|
|
|
if tag != varcon::Tag::Improper {
|
|
|
|
backup.insert(variant.word.as_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if primary.len() == 1 {
|
|
|
|
primary
|
|
|
|
} else {
|
|
|
|
backup
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn ignore_variant(variant: &varcon_core::Variant) -> bool {
|
|
|
|
if variant.word == "anesthetisation"
|
|
|
|
&& variant.types.len() == 1
|
|
|
|
&& variant.types[0].category == varcon::Category::Australian
|
|
|
|
&& (variant.types[0].tag == Some(varcon::Tag::Variant)
|
|
|
|
|| variant.types[0].tag == Some(varcon::Tag::Seldom))
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
false
|
|
|
|
}
|
|
|
|
|
|
|
|
// dict needs
|
|
|
|
// all words, with bitflags, pointing to list of entry names
|
|
|
|
//
|
|
|
|
// varcon needs
|
|
|
|
// all entries by name
|