use std::collections::BTreeMap; use std::collections::HashSet; use std::io::Write; #[test] fn codegen() { let mut content = vec![]; generate_variations(&mut content); let content = String::from_utf8(content).unwrap(); let content = codegenrs::rustfmt(&content, None).unwrap(); snapbox::assert_data_eq!(content, snapbox::file!["../src/vars_codegen.rs"].raw()); } static CATEGORIES: [varcon::Category; 4] = [ varcon::Category::American, varcon::Category::BritishIse, // For now, only want to support one form of British, so going with -ise as it seems more // popular. varcon::Category::Canadian, varcon::Category::Australian, // Other basically means all ]; fn generate_variations(file: &mut W) { let entries = entries(); writeln!( file, "// This file is @generated by {}", file!().replace('\\', "/") ) .unwrap(); writeln!( file, "#![allow(clippy::unreadable_literal, clippy::type_complexity)]", ) .unwrap(); writeln!(file).unwrap(); writeln!(file, "pub type Variants = &'static [&'static str];",).unwrap(); writeln!( file, "pub type VariantsMap = [Variants; {}];", CATEGORIES.len() ) .unwrap(); writeln!(file).unwrap(); writeln!(file, "pub fn all_categories() -> crate::CategorySet {{",).unwrap(); writeln!( file, " {}", itertools::join( CATEGORIES.iter().map(|c| format!("crate::Category::{c:?}")), " | " ) ) .unwrap(); writeln!(file, "}}",).unwrap(); writeln!(file).unwrap(); writeln!( file, "pub fn corrections(category: crate::Category, options: VariantsMap) -> &'static [&'static str] {{", ) .unwrap(); writeln!(file, " match category {{").unwrap(); for (index, category) in CATEGORIES.iter().enumerate() { writeln!( file, " crate::Category::{category:?} => options[{index}]," ) .unwrap(); } writeln!( file, " crate::Category::BritishIze | crate::Category::Other => unreachable!(\"{{:?}} is unused\", category),", ) .unwrap(); writeln!(file, " }}").unwrap(); writeln!(file, "}}").unwrap(); writeln!(file).unwrap(); let entry_sets = entry_sets(entries.iter()); let mut referenced_symbols: HashSet<&str> = HashSet::new(); dictgen::generate_trie( file, "VARS", "&[(u8, &VariantsMap)]", entry_sets.iter().filter_map(|kv| { let (word, data) = kv; if is_always_valid(data) { // No need to convert from current form to target form None } else { referenced_symbols.extend(data.iter().map(|(s, _)| s)); let value = generate_link(data); Some((*word, value)) } }), 64, ) .unwrap(); let no_invalid = entry_sets.values().all(|data| !is_always_invalid(data)); writeln!(file).unwrap(); writeln!(file, "pub const NO_INVALID: bool = {no_invalid:?};",).unwrap(); writeln!(file).unwrap(); for (symbol, entry) in entries.iter() { if !referenced_symbols.contains(symbol.as_str()) { continue; } generate_entry(file, symbol, entry); } } fn generate_entry(file: &mut impl Write, symbol: &str, entry: &varcon_core::Entry) { writeln!(file, "pub(crate) static {symbol}: VariantsMap = [").unwrap(); for category in &CATEGORIES { let corrections = collect_correct(entry, *category); let mut corrections: Vec<_> = corrections.iter().collect(); corrections.sort_unstable(); writeln!(file, " &[").unwrap(); for correction in &corrections { writeln!(file, " {correction:?},").unwrap(); } writeln!(file, " ],").unwrap(); } writeln!(file, "];").unwrap(); writeln!(file).unwrap(); } fn generate_link(data: &[(&str, varcon::CategorySet)]) -> String { let mut output = Vec::new(); write!(output, "&[").unwrap(); for (symbol, set) in data.iter() { write!(output, "(0b{:05b}, &{}), ", set.bits(), symbol).unwrap(); } write!(output, "]").unwrap(); String::from_utf8(output).unwrap() } fn is_always_valid(data: &[(&str, varcon::CategorySet)]) -> bool { let valid_categories = valid_categories(); for (_symbol, set) in data.iter() { if *set == valid_categories { return true; } } false } fn is_always_invalid(data: &[(&str, varcon::CategorySet)]) -> bool { for (_symbol, set) in data.iter() { if set.is_empty() { return true; } } false } fn entries() -> BTreeMap { varcon::VARCON .iter() .flat_map(|c| c.entries.iter()) .filter(|e| { e.variants .iter() .all(|v| typos::tokens::Word::new(v.word, 0).is_ok()) }) .map(|e| { let mut e = e.into_owned(); for variant in e.variants.iter_mut() { variant.word.make_ascii_lowercase(); } (entry_symbol(&e), e) }) .collect() } fn entry_symbol(entry: &varcon_core::Entry) -> String { let mut hasher = std::collections::hash_map::DefaultHasher::new(); std::hash::Hash::hash(entry, &mut hasher); let hash = std::hash::Hasher::finish(&hasher); format!( "ENTRY_{}_{}", entry.variants[0].word.to_ascii_uppercase(), hash ) } fn entry_sets<'e>( entries: impl Iterator, ) -> BTreeMap<&'e str, Vec<(&'e str, varcon::CategorySet)>> { let mut sets = BTreeMap::new(); for (symbol, entry) in entries { for (word, set) in entry_set(entry).iter() { let v = sets.entry(*word).or_insert_with(Vec::new); v.push((symbol.as_str(), *set)); } } sets } fn entry_set(entry: &varcon_core::Entry) -> BTreeMap<&str, varcon::CategorySet> { let mut sets = BTreeMap::new(); let valid_categories = valid_categories(); for variant in entry.variants.iter() { let set = sets .entry(variant.word.as_str()) .or_insert_with(varcon::CategorySet::empty); for t in variant.types.iter() { match t.category { varcon::Category::Other => *set |= valid_categories, varcon::Category::BritishIze => (), _ => set.insert(t.category), } } } sets } fn valid_categories() -> varcon::CategorySet { let mut c = varcon::CategorySet::empty(); for cat in CATEGORIES.iter() { c.insert(*cat); } c } fn collect_correct(entry: &varcon_core::Entry, category: varcon::Category) -> HashSet<&str> { // If there is ambiguity, collect all potential options. let mut primary = HashSet::new(); let mut backup = HashSet::new(); for variant in entry.variants.iter().filter(|v| !ignore_variant(v)) { for t in variant .types .iter() .filter(|t| t.category == category || t.category == varcon::Category::Other) { let tag = t.tag.unwrap_or(varcon::Tag::Eq); if tag == varcon::Tag::Eq { primary.insert(variant.word.as_str()); } if tag != varcon::Tag::Improper { backup.insert(variant.word.as_str()); } } } if primary.len() == 1 { primary } else { backup } } fn ignore_variant(variant: &varcon_core::Variant) -> bool { if variant.word == "anesthetisation" && variant.types.len() == 1 && variant.types[0].category == varcon::Category::Australian && (variant.types[0].tag == Some(varcon::Tag::Variant) || variant.types[0].tag == Some(varcon::Tag::Seldom)) { return true; } false } // dict needs // all words, with bitfags, pointing to list of entry names // // varcon needs // all entries by name