feat(varcon): Update to Version 2020.12.07

2024-11-21 16:41:01 -05:00 · 2024-08-23 09:51:46 -05:00 · 2024-08-23 09:51:46 -05:00 · dd3e1018f8
commit dd3e1018f8
parent d35e0fe68c
6 changed files with 44593 additions and 42252 deletions
--- a/crates/typos-vars/src/vars_codegen.rs
+++ b/crates/typos-vars/src/vars_codegen.rs
--- a/crates/varcon-core/src/lib.rs
+++ b/crates/varcon-core/src/lib.rs
@ -124,8 +124,9 @@ pub enum Pos {
    Verb = 0x02,
    Adjective = 0x04,
    Adverb = 0x08,
-    Interjection = 0x10,
-    Preposition = 0x20,
+    AdjectiveOrAdverb = 0x10,
+    Interjection = 0x20,
+    Preposition = 0x40,
 }

 #[cfg(feature = "flags")]
--- a/crates/varcon-core/src/parser.rs
+++ b/crates/varcon-core/src/parser.rs
@ -970,6 +970,8 @@ impl Entry {
            let comment =
                opt((comment_sep, space1, winnow::ascii::till_line_ending)).parse_next(input)?;

+            let _ = winnow::ascii::space0.parse_next(input)?;
+
            e.variants = variants;
            e.comment = comment.map(|c| c.2.to_owned());
            Ok(e)
@ -1001,6 +1003,13 @@ impl Entry {
                entry.description = opt(preceded(space1, description))
                    .parse_next(input)?
                    .map(|d| d.to_owned());
+
+                if opt((winnow::ascii::space0, '|'))
+                    .parse_next(input)?
+                    .is_some()
+                {
+                    entry.note = opt(preceded(space1, note)).parse_next(input)?;
+                }
            }
            Ok(entry)
        })
@ -1020,7 +1029,7 @@ fn archaic(input: &mut &str) -> PResult<(), ()> {
 }

 fn description(input: &mut &str) -> PResult<String, ()> {
-    let description = winnow::token::take_till(0.., ('\n', '\r', '#')).parse_next(input)?;
+    let description = winnow::token::take_till(0.., ('\n', '\r', '#', '|')).parse_next(input)?;
    Ok(description.to_owned())
 }

@ -1432,6 +1441,56 @@ Entry {
    comment: None,
 }

+"#]]
+        );
+    }
+
+    #[test]
+    fn test_description_and_note() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Entry::parse_
+            .parse_peek("A B: wizz | as in \"gee whiz\" | -- Ox: informal, chiefly N. Amer.\n")
+            .unwrap();
+        assert_data_eq!(
+            input,
+            str![[r#"
+
+
+"#]]
+        );
+        assert_data_eq!(
+            actual.to_debug(),
+            str![[r#"
+Entry {
+    variants: [
+        Variant {
+            types: [
+                Type {
+                    category: American,
+                    tag: None,
+                    num: None,
+                },
+                Type {
+                    category: BritishIse,
+                    tag: None,
+                    num: None,
+                },
+            ],
+            word: "wizz",
+        },
+    ],
+    pos: None,
+    archaic: false,
+    description: Some(
+        "as in /"gee whiz/" ",
+    ),
+    note: Some(
+        "Ox: informal, chiefly N. Amer.",
+    ),
+    comment: None,
+}
+
 "#]]
        );
    }
@ -1511,9 +1570,15 @@ impl Variant {
    fn parse_(input: &mut &str) -> PResult<Self, ()> {
        trace("variant", move |input: &mut &str| {
            let types = winnow::combinator::separated(1.., Type::parse_, space1);
+            let columns =
+                winnow::combinator::separated(0.., winnow::ascii::digit1, space1).map(|()| ());
            let sep = (":", winnow::ascii::space0);
-            let (types, word) =
-                winnow::combinator::separated_pair(types, sep, word).parse_next(input)?;
+            let ((types, _, _columns), word) = winnow::combinator::separated_pair(
+                (types, winnow::ascii::space0, columns),
+                sep,
+                word,
+            )
+            .parse_next(input)?;
            let v = Self { types, word };
            Ok(v)
        })
@ -1624,6 +1689,35 @@ Variant {
    word: "air gun",
 }

+"#]]
+        );
+    }
+
+    #[test]
+    fn test_columns() {
+        // Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
+        // cases.
+        let (input, actual) = Variant::parse_.parse_peek("A B 1 2: aeries").unwrap();
+        assert_data_eq!(input, str![""]);
+        assert_data_eq!(
+            actual.to_debug(),
+            str![[r#"
+Variant {
+    types: [
+        Type {
+            category: American,
+            tag: None,
+            num: None,
+        },
+        Type {
+            category: BritishIse,
+            tag: None,
+            num: None,
+        },
+    ],
+    word: "aeries",
+}
+
 "#]]
        );
    }
@ -1874,6 +1968,7 @@ impl Pos {
                "V".value(Pos::Verb),
                "Adj".value(Pos::Adjective),
                "Adv".value(Pos::Adverb),
+                "A".value(Pos::AdjectiveOrAdverb),
                "Inj".value(Pos::Interjection),
                "Prep".value(Pos::Preposition),
            ))
--- a/crates/varcon/assets/README
+++ b/crates/varcon/assets/README
@ -1,8 +1,9 @@
 Variant Conversion Info (VarCon)
+********************************

-Version 2019.10.06
+Version 2020.12.07

-Copyright 2000-2016 by Kevin Atkinson (kevina@gnu.org) and Benjamin
+Copyright 2000-2020 by Kevin Atkinson (kevina@gnu.org) and Benjamin
 Titze (btitze@protonmail.ch).

 This package contains information to convert between American,
@ -11,9 +12,17 @@ other variant information.

 The latest version can be found at http://wordlist.aspell.net/.

+
+File Format
+===========
+
 The main data file is varcon.txt.  It contains information on the
-preferred American, British, and Canadian spelling of a word as well
-as other variant information.
+preferred American, British, Canadian and Australian spelling of a
+word as well as other variant information.
+
+
+Varcon Lines
+------------

 Each line contains a mapping between the various spellings of a word.
 Words are tagged to indicate where the spelling is used, and each
@ -32,6 +41,7 @@ spelling is sometimes used in America (as indicated the "Av").
 More generally each tag consists of a spelling category (for example
 "A") followed possible by a variant indicator.  The spelling
 categories are as follows:
+
  A: American
  B: British "ise" spelling
  Z: British "ize" spelling or OED preferred Spelling
@ -39,7 +49,9 @@ categories are as follows:
  D: Australian
  _: Other (Variant info based on American dictionaries, never used
            with any of the above).
+
 and the variants tags are as follows:
+
  .: equal
  v: variant
  V: seldom used variant
@ -66,6 +78,13 @@ If there are no tags with the 'Z' spelling category on the line then
 'B' implies 'Z'.  Similarly if there are no 'C' tags then 'Z' implies
 'C'.  If there are no 'D' tags then 'B' implies 'D'.

+Some entries may have a number after the tags; this is a column
+number and will be explained later.
+
+
+Varcon Clusters
+---------------
+
 For ease of reading and maintaining the data file, each line is
 grouped in a cluster of closely related words.  Each cluster is
 uniquely identified by a headword, which is generally the American
@ -86,10 +105,26 @@ the headword is found in.  The levels generally mean the following:
         unabridged dictionary
   > 80: May not even be a legal word

-Sometimes the spelling of a word depends on the usage.  If so the word
-is listed more than once within a cluster, with any usage information
-being indicated after a " | ".  For example here is part of the cluster
-for prize:
+Earlier versions of varcon contained numerous errors.  With version
+5.0 massive effort has been made to correct many of these errors.
+Clusters that have undergone some form of verification (and likely
+correction) are marked with "<verified>".  As of version 5.0, most
+clusters with headwords in common usage (SCOWL level 35 and
+below) should now be checked, as well as many others.  No effort was
+made to check clusters with headwords in SCOWL level 80 and above;
+many of those entries are unlikely to be in the dictionary anyway.
+
+
+Varcon Groups
+-------------
+
+Sometimes the spelling of a word depends on the usage in which case a
+cluster is split into multiple groups with each group representing one
+usage of a word.  Usage annotations and/or pos tags are used to
+distinguish one group from another.
+
+Usage information is given after a " | ".  For example here is part of
+the cluster for prize:
  A B: prize | reward
  A B: prizes | reward
  A C: prize / B: prise | otherwise
@ -102,50 +137,90 @@ consists of a number, for example:
  A B: sake | :1
  A C: sake / Av B Cv: saki | :2

-Sometimes part-of-speech (POS) info is given to help distinguish which
-form is used.  For example:
+A part-of-speech (POS) tag may also be given after a " | ", for example:
  A B C: practice / AV Cv: practise | <N>
  A Cv: practice / AV B C: practise | <V>
-POS info is always given in the form "<POS>" and if a definition
+POS tags are always given in the form "<POS>" and if a definition
 is also given the POS info is always first.  The POS tags used are as
 follows:
  <N>: Noun
  <V>: Verb
  <Adj>: Adjective
  <Adv>: Adverb
+  <A>: Adjective or Adverb
+  <Inj>
+  <Prep>
+  <abbr>
+
+
+Additional Annotations
+----------------------

 A "(-)" before the definition indicated a rarely used or archaic form
 of a word, for example:
  A B: bark | :1
  A: bark / Av B: barque | (-) ship

-A "--" indicates a note rather than definition.  This is generally
-used to indicate that the spelling of the plural form not depend on
-the spelling of the root word, for example:
-  _: cabby / _.: cabbie
-  _: cabbies | -- plural
+A "| -- pl: someword" indicates that the word is a plural and the root
+is someword.

-Misc. notes on a particular form of a word are given after a "#" on
-the same line.  Misc. notes for the cluster are given at the end of
-the cluster and are prefixed with "##", for example:
+A plain "| -- pl" indicates that the word is a plural and the root is
+elsewhere within the group.  It is used when one form of the plural is
+the same as the root word, for example:
+  _1: yak | :1
+  _ 1: yaks / _V 1: yak | :1 | -- pl
+  _ 1: yak's | :1
+
+A "| --" otherwise indicates a note which gives additional context but
+does not create its own group like a definition does.
+
+A "#" after a line indicates a comment that is often used to indicate
+why.  A "##" after a cluster indicates that the comment applies to the
+entire cluster, for example:
  # coloration <verified> (level 50)
  A B C: coloration / B. Cv: colouration
  A B C: colorations / B. Cv: colourations
  A B C: coloration's / B. Cv: colouration's
  ## OED has coloration as the preferred spelling and discolouration as a
  ## variant for British Engl or some reason
-In the notes ODE (not to be confused with OED) stands for Oxford
+In the comments ODE (not to be confused with OED) stands for Oxford
 Dictionary of English, "Ox" is used for any Oxford dictionary, and
 "M-W" for Merriam-Webster.

-Earlier versions of varcon contained numerous errors.  With version
-5.0 massive effort has been made to correct many of these errors.
-Clusters that have undergone some form of verification (and likely
-correction) are marked with "<verified>".  As of version 5.0, most
-clusters with headwords word in common usage (SCOWL level 35 and
-below) should now be checked, as well as many others.  No effort was
-made to check clusters with headwords in SCOWL level 80 and above;
-many of those entries are unlikely to be in the dictionary anyway.
+
+Varcon Columns
+--------------
+
+Varcon does not directly express the relation of words within a
+group as it is normally easy to derive.  For example given a simple
+group of:
+  A: acknowledgment / B: acknowledgement
+  A: acknowledgments / B: acknowledgements
+  A: acknowledgment's / B: acknowledgement's
+it is clear that acknowledgments is the plural form of acknowledgment
+since they are both the American spelling of a word.  While
+acknowledgEments is the plural form of acknowledgEment since they are
+both the British forms of a word.  Within a group each varcon line
+is considered a row in a table and each entry within a line is considered
+a column.  Within this group the first column is the American spelling
+and the second is the British.
+
+Sometimes the column assignment is unclear; when it is, explicit column
+numbers may be given.  For example:
+  A B: caulk / Av: calk / AV Bv 1: caulking / AV 2: calking | <N> :3
+  A B: caulks / Av: calks / AV Bv 1: caulkings / AV 2: calkings | <N> :3
+  A B: caulk's / Av: calk's / AV Bv 1: caulking's / AV 2: calking's | <N> :3
+
+Each column must contain exactly one spelling of the base form of a
+word, however a column may contain multiple derived forms for a single
+spelling of the base form, for example:
+  A B D 1: amoeba / Av Dv 2: ameba
+  A B D 1: amoebas / Av Bv Dv 1: amoebae / Av Dv 2: amebas / Av Dv 2: amebae
+  A B D 1: amoeba's / Av Dv 2: ameba's
+
+
+Additional Files
+================

 The file variant-also.tab contains additional mappings between various
 spellings of a word which are not yet in varcon.txt.  No attempt is
@ -155,6 +230,7 @@ automatically from the AGID inflection database.  The file
 variant-wroot.tab is like variant-infl.tab except that it also
 included the root form of the word.

+
 The file voc.tab is similar to varcon.txt but converts between
 vocabulary instead of spelling.  Unlike varcon.tab it is a simple tab
 separated file with the fields corresponding to the American, British,
@ -163,11 +239,13 @@ the same thing the words are separated with commas.  The last column
 contains additional notes on when the word is used.  Unlike varcon.txt
 it is generally not suitable for automatic conversion.

+
 The "make-variant" Perl script will combine varcon.txt,
 variant-also.tab, and variant-infl.tab into one huge mapping and will
 output the result to "variant.tab".  If the "no-infl" option is given
 than variant-infl.tab will not be included.

+
 The "split" script will split out the information in varcon.txt into
 several word lists named as follows:
  <spelling>[-v<variant level>][-uncommon].lst
@ -182,6 +260,7 @@ as follows:
  - => 2
 "-uncommon" is used for forms marked with "(-)" as already described.

+
 The "translate" Perl script will translate a text file from one
 spelling to another. Its usage is:

@ -199,16 +278,23 @@ Text is read in from standard input and is outputted to standard out.
 Words are marked with a '?' before and after the questionable word
 when the option is enabled.

+
 The file varcon.pm contains some library routines for parsing
 varcon.txt and is used by many of the scripts above.

+
+Feedback
+========
+
 If you discover any errors in these mappings or have suggestions for
 additions please file a bug report at
 https://github.com/kevina/wordlist/issues, or alternatively email me
 directly at kevina@gnu.org, but I will likely tell you to file a bug
 report so that I don't forget about it.

-SOURCE:
+
+Sources
+=======

 These mappings were compiled from numerous sources.

@ -296,9 +382,22 @@ The primary sources for this addition were:
    http://blogs.usyd.edu.au/elac/2008/01/webster_in_australia.html


-CHANGELOG:
+Changelog
+=========

-From 2017.08.24 to 2018.10.06
+From 2018.10.06 to 2020.12.07
+
+   - Additional documentation on file format
+
+   - Minor change in file format
+
+   - Fix scripts to work with modern versions of Perl.
+
+   - Various new entries
+
+   - Additional cleanups
+
+From 2017.08.24 to 2019.10.06

   - Added entries for: eukaryote, prokaryote, virtualization, volcanism

@ -423,9 +522,11 @@ From Revision 1 to Revision 2 (January 27, 2001)
    words in them. 
  - Added variant-infl.tab

-COPYRIGHT:

-Copyright 2000-2018 by Kevin Atkinson
+Copyright
+=========
+
+Copyright 2000-2019 by Kevin Atkinson

 Permission to use, copy, modify, distribute and sell this array, the
 associated software, and its documentation for any purpose is hereby
--- a/crates/varcon/assets/varcon.txt
+++ b/crates/varcon/assets/varcon.txt
--- a/crates/varcon/src/codegen.rs
+++ b/crates/varcon/src/codegen.rs