feat(varcon): Update to Version 2020.12.07

2024-11-22 00:51:11 -05:00 · 2024-08-23 09:51:46 -05:00 · 2024-08-23 09:51:46 -05:00 · dd3e1018f8
commit dd3e1018f8
parent d35e0fe68c
6 changed files with 44593 additions and 42252 deletions
--- a/crates/typos-vars/src/vars_codegen.rs
+++ b/crates/typos-vars/src/vars_codegen.rs
--- a/crates/varcon-core/src/lib.rs
+++ b/crates/varcon-core/src/lib.rs
@ -124,8 +124,9 @@ pub enum Pos {
    Verb = 0x02,
    Adjective = 0x04,
    Adverb = 0x08,
-    Interjection = 0x10,
+    AdjectiveOrAdverb = 0x10,
-    Preposition = 0x20,
+    Interjection = 0x20,
    Preposition = 0x40,
 }
 #[cfg(feature = "flags")]
--- a/crates/varcon-core/src/parser.rs
+++ b/crates/varcon-core/src/parser.rs
@ -970,6 +970,8 @@ impl Entry {
            let comment =
                opt((comment_sep, space1, winnow::ascii::till_line_ending)).parse_next(input)?;
            let _ = winnow::ascii::space0.parse_next(input)?;
            e.variants = variants;
            e.comment = comment.map(|c| c.2.to_owned());
            Ok(e)
@ -1001,6 +1003,13 @@ impl Entry {
                entry.description = opt(preceded(space1, description))
                    .parse_next(input)?
                    .map(|d| d.to_owned());
                if opt((winnow::ascii::space0, '|'))
                    .parse_next(input)?
                    .is_some()
                {
                    entry.note = opt(preceded(space1, note)).parse_next(input)?;
                }
            }
            Ok(entry)
        })
@ -1020,7 +1029,7 @@ fn archaic(input: &mut &str) -> PResult<(), ()> {
 }
 fn description(input: &mut &str) -> PResult<String, ()> {
-    let description = winnow::token::take_till(0.., ('\n', '\r', '#')).parse_next(input)?;
+    let description = winnow::token::take_till(0.., ('\n', '\r', '#', '|')).parse_next(input)?;
    Ok(description.to_owned())
 }
@ -1432,6 +1441,56 @@ Entry {
    comment: None,
 }
 "#]]
        );
    }
    // Snapshot test for an entry line that carries both a description and a note:
    // `A B: wizz | as in "gee whiz" | -- Ox: informal, chiefly N. Amer.`
    // The expected debug output below shows the text after the first `|` stored in
    // `description` (its trailing space is kept) and the text after `| --` stored in `note`,
    // while `pos` and `comment` stay `None` and `archaic` stays `false`.
    #[test]
    fn test_description_and_note() {
        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
        // cases.
        let (input, actual) = Entry::parse_
            .parse_peek("A B: wizz | as in \"gee whiz\" | -- Ox: informal, chiefly N. Amer.\n")
            .unwrap();
        // First tuple element from `parse_peek` (presumably the input left unparsed) is compared
        // against its own snapshot.
        assert_data_eq!(
            input,
            str![[r#"
 "#]]
        );
        // Second tuple element is the parsed entry, compared via its debug rendering.
        assert_data_eq!(
            actual.to_debug(),
            str![[r#"
 Entry {
    variants: [
        Variant {
            types: [
                Type {
                    category: American,
                    tag: None,
                    num: None,
                },
                Type {
                    category: BritishIse,
                    tag: None,
                    num: None,
                },
            ],
            word: "wizz",
        },
    ],
    pos: None,
    archaic: false,
    description: Some(
        "as in /"gee whiz/" ",
    ),
    note: Some(
        "Ox: informal, chiefly N. Amer.",
    ),
    comment: None,
 }
 "#]]
        );
    }
@ -1511,9 +1570,15 @@ impl Variant {
    fn parse_(input: &mut &str) -> PResult<Self, ()> {
        trace("variant", move |input: &mut &str| {
            let types = winnow::combinator::separated(1.., Type::parse_, space1);
            let columns =
                winnow::combinator::separated(0.., winnow::ascii::digit1, space1).map(|()| ());
            let sep = (":", winnow::ascii::space0);
-            let (types, word) =
+            let ((types, _, _columns), word) = winnow::combinator::separated_pair(
-                winnow::combinator::separated_pair(types, sep, word).parse_next(input)?;
+                (types, winnow::ascii::space0, columns),
                sep,
                word,
            )
            .parse_next(input)?;
            let v = Self { types, word };
            Ok(v)
        })
@ -1624,6 +1689,35 @@ Variant {
    word: "air gun",
 }
 "#]]
        );
    }
    // Snapshot test for a variant whose type tags are followed by column numbers
    // (`1 2` in `A B 1 2: aeries`). The expected debug output below contains only the two
    // types (`American`, `BritishIse`, each with `num: None`) and the word `aeries`; the
    // column numbers do not appear in the resulting `Variant`.
    #[test]
    fn test_columns() {
        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
        // cases.
        let (input, actual) = Variant::parse_.parse_peek("A B 1 2: aeries").unwrap();
        // First tuple element from `parse_peek` (presumably the input left unparsed) must be
        // empty for this line.
        assert_data_eq!(input, str![""]);
        // Second tuple element is the parsed variant, compared via its debug rendering.
        assert_data_eq!(
            actual.to_debug(),
            str![[r#"
 Variant {
    types: [
        Type {
            category: American,
            tag: None,
            num: None,
        },
        Type {
            category: BritishIse,
            tag: None,
            num: None,
        },
    ],
    word: "aeries",
 }
 "#]]
        );
    }
@ -1874,6 +1968,7 @@ impl Pos {
                "V".value(Pos::Verb),
                "Adj".value(Pos::Adjective),
                "Adv".value(Pos::Adverb),
                "A".value(Pos::AdjectiveOrAdverb),
                "Inj".value(Pos::Interjection),
                "Prep".value(Pos::Preposition),
            ))
--- a/crates/varcon/assets/README
+++ b/crates/varcon/assets/README
@ -1,8 +1,9 @@
 Variant Conversion Info (VarCon)
 ********************************
-Version 2019.10.06
+Version 2020.12.07
-Copyright 2000-2016 by Kevin Atkinson (kevina@gnu.org) and Benjamin
+Copyright 2000-2020 by Kevin Atkinson (kevina@gnu.org) and Benjamin
 Titze (btitze@protonmail.ch).
 This package contains information to convert between American,
@ -11,9 +12,17 @@ other variant information.
 The latest version can be found at http://wordlist.aspell.net/.
 File Format
 ===========
 The main data file is varcon.txt.  It contains information on the
-preferred American, British, and Canadian spelling of a word as well
+preferred American, British, Canadian and Australian spelling of a
-as other variant information.
+word as well as other variant information.
 Varcon Lines
 ------------
 Each line contains a mapping between the various spellings of a word.
 Words are tagged to indicate where the spelling is used, and each
@ -32,6 +41,7 @@ spelling is sometimes used in America (as indicated the "Av").
 More generally each tag consists of a spelling category (for example
 "A") followed possibly by a variant indicator.  The spelling
 categories are as follows:
  A: American
  B: British "ise" spelling
  Z: British "ize" spelling or OED preferred Spelling
@ -39,7 +49,9 @@ categories are as follows:
  D: Australian
  _: Other (Variant info based on American dictionaries, never used
            with any of the above).
 and the variants tags are as follows:
  .: equal
  v: variant
  V: seldom used variant
@ -66,6 +78,13 @@ If there are no tags with the 'Z' spelling category on the line then
 'B' implies 'Z'.  Similarly if there are no 'C' tags then 'Z' implies
 'C'.  If there are no 'D' tags then 'B' implies 'D'.
 Some entries may have a number after the tags, this is a column
 number and will be explained later.
 Varcon Clusters
 ---------------
 For ease of reading and maintaining the data file, each line is
 grouped in a cluster of closely related words.  Each cluster is
 uniquely identified by a headword, which is generally the American
@ -86,10 +105,26 @@ the headword is found in.  The levels generally mean the following:
         unabridged dictionary
   > 80: May not even be a legal word
-Sometimes the spelling of a word depends on the usage.  If so the word
+Earlier versions of varcon contained numerous errors.  With version
-is listed more than once within a cluster, with any usage information
+5.0 massive effort has been made to correct many of these errors.
-being indicated after a " | ".  For example here is part of the cluster
+Clusters that have undergone some form of verification (and likely
-for prize:
+correction) are marked with "<verified>".  As of version 5.0, most
 clusters with headwords in common usage (SCOWL level 35 and
 below) should now be checked, as well as many others.  No effort was
 made to check clusters with headwords in SCOWL level 80 and above;
 many of those entries are unlikely to be in the dictionary anyway.
 Varcon Groups
 -------------
 Sometimes the spelling of a word depends on the usage in which case a
 cluster is split into multiple groups with each group representing one
 usage of a word.  Usage annotations and/or pos tags are used to
 distinguish one group from another.
 Usage information is given after a " | ".  For example here is part of
 the cluster for prize:
  A B: prize | reward
  A B: prizes | reward
  A C: prize / B: prise | otherwise
@ -102,50 +137,90 @@ consists of a number, for example:
  A B: sake | :1
  A C: sake / Av B Cv: saki | :2
-Sometimes part-of-speech (POS) info is given to help distinguish which
+A part-of-speech (POS) tag may also be given after a " | ", for example:
 form is used.  For example:
  A B C: practice / AV Cv: practise | <N>
  A Cv: practice / AV B C: practise | <V>
-POS info is always given in the form "<POS>" and if a definition
+POS tags are always given in the form "<POS>" and if a definition
 is also given the POS info is always first.  The POS tags used are as
 follows:
  <N>: Noun
  <V>: Verb
  <Adj>: Adjective
  <Adv>: Adverb
  <A>: Adjective or Adverb
  <Inj>
  <Prep>
  <abbr>
 Additional Annotations
 ----------------------
 A "(-)" before the definition indicates a rarely used or archaic form
 of a word, for example:
  A B: bark | :1
  A: bark / Av B: barque | (-) ship
-A "--" indicates a note rather than definition.  This is generally
+A "| -- pl: someword" indicates that the word is a plural and the root
-used to indicate that the spelling of the plural form not depend on
+is someword.
 the spelling of the root word, for example:
  _: cabby / _.: cabbie
  _: cabbies | -- plural
-Misc. notes on a particular form of a word are given after a "#" on
+A plain "| -- pl" indicates that the word is a plural and the root is
-the same line.  Misc. notes for the cluster are given at the end of
+elsewhere within the group.  It is used when one form of the plural is
-the cluster and are prefixed with "##", for example:
+the same as the root word, for example:
  _1: yak | :1
  _ 1: yaks / _V 1: yak | :1 | -- pl
  _ 1: yak's | :1
 A "| --" otherwise indicates a note which gives additional context but
 does not create its own group like a definition does.
 A "#" after a line indicates a comment that is often used to indicate
 why.  A "##" after a cluster indicates that the comment applies to the
 entire cluster, for example:
  # coloration <verified> (level 50)
  A B C: coloration / B. Cv: colouration
  A B C: colorations / B. Cv: colourations
  A B C: coloration's / B. Cv: colouration's
  ## OED has coloration as the preferred spelling and discolouration as a
  ## variant for British Engl or some reason
-In the notes ODE (not to be confused with OED) stands for Oxford
+In the comments ODE (not to be confused with OED) stands for Oxford
 Dictionary of English, "Ox" is used for any Oxford dictionary, and
 "M-W" for Merriam-Webster.
-Earlier versions of varcon contained numerous errors.  With version
+
-5.0 massive effort has been made to correct many of these errors.
+Varcon Columns
-Clusters that have undergone some form of verification (and likely
+--------------
-correction) are marked with "<verified>".  As of version 5.0, most
+
-clusters with headwords word in common usage (SCOWL level 35 and
+Varcon does not directly express the relation of words within a
-below) should now be checked, as well as many others.  No effort was
+group as it is normally easy to derive.  For example given a simple
-made to check clusters with headwords in SCOWL level 80 and above;
+group of:
-many of those entries are unlikely to be in the dictionary anyway.
+  A: acknowledgment / B: acknowledgement
  A: acknowledgments / B: acknowledgements
  A: acknowledgment's / B: acknowledgement's
 it is clear that acknowledgments is the plural form of acknowledgment
 since they are both the American spelling of a word.  While
 acknowledgEments is the plural form of acknowledgEment since they are
 both the British forms of a word.  Within a group each varcon line
 is considered a row in a table and each entry within a line is considered
 a column.  Within this group the first column is the American spelling
 and the second is the British.
 Sometimes the column assignment is unclear; when it is, explicit column
 numbers may be given.  For example:
  A B: caulk / Av: calk / AV Bv 1: caulking / AV 2: calking | <N> :3
  A B: caulks / Av: calks / AV Bv 1: caulkings / AV 2: calkings | <N> :3
  A B: caulk's / Av: calk's / AV Bv 1: caulking's / AV 2: calking's | <N> :3
 Each column must contain exactly one spelling of the base form of a
 word, however a column may contain multiple derived forms for a single
 spelling of the base form, for example:
  A B D 1: amoeba / Av Dv 2: ameba
  A B D 1: amoebas / Av Bv Dv 1: amoebae / Av Dv 2: amebas / Av Dv 2: amebae
  A B D 1: amoeba's / Av Dv 2: ameba's
 Additional Files
 ================
 The file variant-also.tab contains additional mappings between various
 spellings of a word which are not yet in varcon.txt.  No attempt is
@ -155,6 +230,7 @@ automatically from the AGID inflection database.  The file
 variant-wroot.tab is like variant-infl.tab except that it also
 includes the root form of the word.
 The file voc.tab is similar to varcon.txt but converts between
 vocabulary instead of spelling.  Unlike varcon.txt it is a simple tab
 separated file with the fields corresponding to the American, British,
@ -163,11 +239,13 @@ the same thing the words are separated with commas.  The last column
 contains additional notes on when the word is used.  Unlike varcon.txt
 it is generally not suitable for automatic conversion.
 The "make-variant" Perl script will combine varcon.txt,
 variant-also.tab, and variant-infl.tab into one huge mapping and will
 output the result to "variant.tab".  If the "no-infl" option is given
 then variant-infl.tab will not be included.
 The "split" script will split out the information in varcon.txt into
 several word lists named as follows:
  <spelling>[-v<variant level>][-uncommon].lst
@ -182,6 +260,7 @@ as follows:
  - => 2
 "-uncommon" is used for forms marked with "(-)" as already described.
 The "translate" Perl script will translate a text file from one
 spelling to another. Its usage is:
@ -199,16 +278,23 @@ Text is read in from standard input and is outputted to standard out.
 Words are marked with a '?' before and after the questionable word
 when the option is enabled.
 The file varcon.pm contains some library routines for parsing
 varcon.txt and is used by many of the scripts above.
 Feedback
 ========
 If you discover any errors in these mappings or have suggestions for
 additions please file a bug report at
 https://github.com/kevina/wordlist/issues, or alternatively email me
 directly at kevina@gnu.org, but I will likely tell you to file a bug
 report so that I don't forget about it.
-SOURCE:
+
 Sources
 =======
 These mappings were compiled from numerous sources.
@ -296,9 +382,22 @@ The primary sources for this addition were:
    http://blogs.usyd.edu.au/elac/2008/01/webster_in_australia.html
-CHANGELOG:
+Changelog
 =========
-From 2017.08.24 to 2018.10.06
+From 2018.10.06 to 2020.12.07
   - Additional documentation on file format
   - Minor change in file format
   - Fix scripts to work with modern versions of Perl.
   - Various new entries
   - Additional cleanups
 From 2017.08.24 to 2019.10.06
   - Added entries for: eukaryote, prokaryote, virtualization, volcanism
@ -423,9 +522,11 @@ From Revision 1 to Revision 2 (January 27, 2001)
    words in them. 
  - Added variant-infl.tab
 COPYRIGHT:
-Copyright 2000-2018 by Kevin Atkinson
+Copyright
 =========
 Copyright 2000-2019 by Kevin Atkinson
 Permission to use, copy, modify, distribute and sell this array, the
 associated software, and its documentation for any purpose is hereby
--- a/crates/varcon/assets/varcon.txt
+++ b/crates/varcon/assets/varcon.txt
--- a/crates/varcon/src/codegen.rs
+++ b/crates/varcon/src/codegen.rs