feat(varcon): Update to Version 2020.12.07

This commit is contained in:
Ed Page 2024-08-23 09:51:46 -05:00
parent d35e0fe68c
commit dd3e1018f8
6 changed files with 44593 additions and 42252 deletions

File diff suppressed because it is too large Load diff

View file

@ -124,8 +124,9 @@ pub enum Pos {
Verb = 0x02, Verb = 0x02,
Adjective = 0x04, Adjective = 0x04,
Adverb = 0x08, Adverb = 0x08,
Interjection = 0x10, AdjectiveOrAdverb = 0x10,
Preposition = 0x20, Interjection = 0x20,
Preposition = 0x40,
} }
#[cfg(feature = "flags")] #[cfg(feature = "flags")]

View file

@ -970,6 +970,8 @@ impl Entry {
let comment = let comment =
opt((comment_sep, space1, winnow::ascii::till_line_ending)).parse_next(input)?; opt((comment_sep, space1, winnow::ascii::till_line_ending)).parse_next(input)?;
let _ = winnow::ascii::space0.parse_next(input)?;
e.variants = variants; e.variants = variants;
e.comment = comment.map(|c| c.2.to_owned()); e.comment = comment.map(|c| c.2.to_owned());
Ok(e) Ok(e)
@ -1001,6 +1003,13 @@ impl Entry {
entry.description = opt(preceded(space1, description)) entry.description = opt(preceded(space1, description))
.parse_next(input)? .parse_next(input)?
.map(|d| d.to_owned()); .map(|d| d.to_owned());
if opt((winnow::ascii::space0, '|'))
.parse_next(input)?
.is_some()
{
entry.note = opt(preceded(space1, note)).parse_next(input)?;
}
} }
Ok(entry) Ok(entry)
}) })
@ -1020,7 +1029,7 @@ fn archaic(input: &mut &str) -> PResult<(), ()> {
} }
fn description(input: &mut &str) -> PResult<String, ()> { fn description(input: &mut &str) -> PResult<String, ()> {
let description = winnow::token::take_till(0.., ('\n', '\r', '#')).parse_next(input)?; let description = winnow::token::take_till(0.., ('\n', '\r', '#', '|')).parse_next(input)?;
Ok(description.to_owned()) Ok(description.to_owned())
} }
@ -1432,6 +1441,56 @@ Entry {
comment: None, comment: None,
} }
"#]]
);
}
#[test]
fn test_description_and_note() {
// Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
// cases.
let (input, actual) = Entry::parse_
.parse_peek("A B: wizz | as in \"gee whiz\" | -- Ox: informal, chiefly N. Amer.\n")
.unwrap();
assert_data_eq!(
input,
str![[r#"
"#]]
);
assert_data_eq!(
actual.to_debug(),
str![[r#"
Entry {
variants: [
Variant {
types: [
Type {
category: American,
tag: None,
num: None,
},
Type {
category: BritishIse,
tag: None,
num: None,
},
],
word: "wizz",
},
],
pos: None,
archaic: false,
description: Some(
"as in /"gee whiz/" ",
),
note: Some(
"Ox: informal, chiefly N. Amer.",
),
comment: None,
}
"#]] "#]]
); );
} }
@ -1511,9 +1570,15 @@ impl Variant {
fn parse_(input: &mut &str) -> PResult<Self, ()> { fn parse_(input: &mut &str) -> PResult<Self, ()> {
trace("variant", move |input: &mut &str| { trace("variant", move |input: &mut &str| {
let types = winnow::combinator::separated(1.., Type::parse_, space1); let types = winnow::combinator::separated(1.., Type::parse_, space1);
let columns =
winnow::combinator::separated(0.., winnow::ascii::digit1, space1).map(|()| ());
let sep = (":", winnow::ascii::space0); let sep = (":", winnow::ascii::space0);
let (types, word) = let ((types, _, _columns), word) = winnow::combinator::separated_pair(
winnow::combinator::separated_pair(types, sep, word).parse_next(input)?; (types, winnow::ascii::space0, columns),
sep,
word,
)
.parse_next(input)?;
let v = Self { types, word }; let v = Self { types, word };
Ok(v) Ok(v)
}) })
@ -1624,6 +1689,35 @@ Variant {
word: "air gun", word: "air gun",
} }
"#]]
);
}
#[test]
fn test_columns() {
// Having nothing after `A` causes an incomplete parse. Shouldn't be a problem for my use
// cases.
let (input, actual) = Variant::parse_.parse_peek("A B 1 2: aeries").unwrap();
assert_data_eq!(input, str![""]);
assert_data_eq!(
actual.to_debug(),
str![[r#"
Variant {
types: [
Type {
category: American,
tag: None,
num: None,
},
Type {
category: BritishIse,
tag: None,
num: None,
},
],
word: "aeries",
}
"#]] "#]]
); );
} }
@ -1874,6 +1968,7 @@ impl Pos {
"V".value(Pos::Verb), "V".value(Pos::Verb),
"Adj".value(Pos::Adjective), "Adj".value(Pos::Adjective),
"Adv".value(Pos::Adverb), "Adv".value(Pos::Adverb),
"A".value(Pos::AdjectiveOrAdverb),
"Inj".value(Pos::Interjection), "Inj".value(Pos::Interjection),
"Prep".value(Pos::Preposition), "Prep".value(Pos::Preposition),
)) ))

View file

@ -1,8 +1,9 @@
Variant Conversion Info (VarCon) Variant Conversion Info (VarCon)
********************************
Version 2019.10.06 Version 2020.12.07
Copyright 2000-2016 by Kevin Atkinson (kevina@gnu.org) and Benjamin Copyright 2000-2020 by Kevin Atkinson (kevina@gnu.org) and Benjamin
Titze (btitze@protonmail.ch). Titze (btitze@protonmail.ch).
This package contains information to convert between American, This package contains information to convert between American,
@ -11,9 +12,17 @@ other variant information.
The latest version can be found at http://wordlist.aspell.net/. The latest version can be found at http://wordlist.aspell.net/.
File Format
===========
The main data file is varcon.txt. It contains information on the The main data file is varcon.txt. It contains information on the
preferred American, British, and Canadian spelling of a word as well preferred American, British, Canadian and Australian spelling of a
as other variant information. word as well as other variant information.
Varcon Lines
------------
Each line contains a mapping between the various spellings of a word. Each line contains a mapping between the various spellings of a word.
Words are tagged to indicate where the spelling is used, and each Words are tagged to indicate where the spelling is used, and each
@ -32,6 +41,7 @@ spelling is sometimes used in America (as indicated the "Av").
More generally each tag consists of a spelling category (for example More generally each tag consists of a spelling category (for example
"A") followed possible by a variant indicator. The spelling "A") followed possible by a variant indicator. The spelling
categories are as follows: categories are as follows:
A: American A: American
B: British "ise" spelling B: British "ise" spelling
Z: British "ize" spelling or OED preferred Spelling Z: British "ize" spelling or OED preferred Spelling
@ -39,7 +49,9 @@ categories are as follows:
D: Australian D: Australian
_: Other (Variant info based on American dictionaries, never used _: Other (Variant info based on American dictionaries, never used
with any of the above). with any of the above).
and the variants tags are as follows: and the variants tags are as follows:
.: equal .: equal
v: variant v: variant
V: seldom used variant V: seldom used variant
@ -66,6 +78,13 @@ If there are no tags with the 'Z' spelling category on the line then
'B' implies 'Z'. Similarly if there are no 'C' tags then 'Z' implies 'B' implies 'Z'. Similarly if there are no 'C' tags then 'Z' implies
'C'. If there are no 'D' tags then 'B' implies 'D'. 'C'. If there are no 'D' tags then 'B' implies 'D'.
Some entries may have a number after the tags, this is a column
number and will be explained later.
Varcon Clusters
---------------
For ease of reading and maintaining the data file, each line is For ease of reading and maintaining the data file, each line is
grouped in a cluster of closely related words. Each cluster is grouped in a cluster of closely related words. Each cluster is
uniquely identified by a headword, which is generally the American uniquely identified by a headword, which is generally the American
@ -86,10 +105,26 @@ the headword is found in. The levels generally mean the following:
unabridged dictionary unabridged dictionary
> 80: May not even be a legal word > 80: May not even be a legal word
Sometimes the spelling of a word depends on the usage. If so the word Earlier versions of varcon contained numerous errors. With version
is listed more than once within a cluster, with any usage information 5.0 massive effort has been made to correct many of these errors.
being indicated after a " | ". For example here is part of the cluster Clusters that have undergone some form of verification (and likely
for prize: correction) are marked with "<verified>". As of version 5.0, most
clusters with headwords word in common usage (SCOWL level 35 and
below) should now be checked, as well as many others. No effort was
made to check clusters with headwords in SCOWL level 80 and above;
many of those entries are unlikely to be in the dictionary anyway.
Varcon Groups
-------------
Sometimes the spelling of a word depends on the usage in which case a
cluster is split into multiple groups with each group representing one
usage of a word. Usage annotations and/or pos tags are used to
distinguish one group from another.
Usage information is given after a " | ". For example here is part of
the cluster for prize:
A B: prize | reward A B: prize | reward
A B: prizes | reward A B: prizes | reward
A C: prize / B: prise | otherwise A C: prize / B: prise | otherwise
@ -102,50 +137,90 @@ consists of a number, for example:
A B: sake | :1 A B: sake | :1
A C: sake / Av B Cv: saki | :2 A C: sake / Av B Cv: saki | :2
Sometimes part-of-speech (POS) info is given to help distinguish which A part-of-speech (POS) tag may also be given after a " | ", for example:
form is used. For example:
A B C: practice / AV Cv: practise | <N> A B C: practice / AV Cv: practise | <N>
A Cv: practice / AV B C: practise | <V> A Cv: practice / AV B C: practise | <V>
POS info is always given in the form "<POS>" and if a definition POS tags are always given in the form "<POS>" and if a definition
is also given the POS info is always first. The POS tags used are as is also given the POS info is always first. The POS tags used are as
follows: follows:
<N>: Noun <N>: Noun
<V>: Verb <V>: Verb
<Adj>: Adjective <Adj>: Adjective
<Adv>: Adverb <Adv>: Adverb
<A>: Adjective or Adverb
<Inj>
<Prep>
<abbr>
Additional Annotations
----------------------
A "(-)" before the definition indicated a rarely used or archaic form A "(-)" before the definition indicated a rarely used or archaic form
of a word, for example: of a word, for example:
A B: bark | :1 A B: bark | :1
A: bark / Av B: barque | (-) ship A: bark / Av B: barque | (-) ship
A "--" indicates a note rather than definition. This is generally A "| -- pl: someword" indicates that the word is a plural and the root
used to indicate that the spelling of the plural form not depend on is someword.
the spelling of the root word, for example:
_: cabby / _.: cabbie
_: cabbies | -- plural
Misc. notes on a particular form of a word are given after a "#" on A plain "| -- pl" indicates that the word is a plural and the root is
the same line. Misc. notes for the cluster are given at the end of elsewhere within the group. It is used when one form of the plural is
the cluster and are prefixed with "##", for example: the same as the root word, for example:
_1: yak | :1
_ 1: yaks / _V 1: yak | :1 | -- pl
_ 1: yak's | :1
A "| --" otherwise indicates a note which gives additional context but
does not create its own group like a definition does.
A "#" after a line indicates a comment that is often used to indicate
why. A "##" after a cluster indicates that the comment applies to the
entire cluster, for example:
# coloration <verified> (level 50) # coloration <verified> (level 50)
A B C: coloration / B. Cv: colouration A B C: coloration / B. Cv: colouration
A B C: colorations / B. Cv: colourations A B C: colorations / B. Cv: colourations
A B C: coloration's / B. Cv: colouration's A B C: coloration's / B. Cv: colouration's
## OED has coloration as the preferred spelling and discolouration as a ## OED has coloration as the preferred spelling and discolouration as a
## variant for British Engl or some reason ## variant for British Engl or some reason
In the notes ODE (not to be confused with OED) stands for Oxford In the comments ODE (not to be confused with OED) stands for Oxford
Dictionary of English, "Ox" is used for any Oxford dictionary, and Dictionary of English, "Ox" is used for any Oxford dictionary, and
"M-W" for Merriam-Webster. "M-W" for Merriam-Webster.
Earlier versions of varcon contained numerous errors. With version
5.0 massive effort has been made to correct many of these errors. Varcon Columns
Clusters that have undergone some form of verification (and likely --------------
correction) are marked with "<verified>". As of version 5.0, most
clusters with headwords word in common usage (SCOWL level 35 and Varcon does not directly express the relation of words within a
below) should now be checked, as well as many others. No effort was group as it is normally easy to derive. For example given a simple
made to check clusters with headwords in SCOWL level 80 and above; group of:
many of those entries are unlikely to be in the dictionary anyway. A: acknowledgment / B: acknowledgement
A: acknowledgments / B: acknowledgements
A: acknowledgment's / B: acknowledgement's
it is clear that acknowledgments is the plural form of acknowledgment
since they are both the American spelling of a word. While
acknowledgEments is the plural form of acknowledgEment since they are
both the British forms of a word. Within a group each varcon line
is considered a row in a table and each entry within a line is considered
a column. Within this group the first column is the American spelling
and the second is the British.
Sometimes the column assignment is unclear; when it is, explicit column
numbers may be given. For example:
A B: caulk / Av: calk / AV Bv 1: caulking / AV 2: calking | <N> :3
A B: caulks / Av: calks / AV Bv 1: caulkings / AV 2: calkings | <N> :3
A B: caulk's / Av: calk's / AV Bv 1: caulking's / AV 2: calking's | <N> :3
Each column must contain exactly one spelling of the base form of a
word, however a column may contain multiple derived forms for a single
spelling of the base form, for example:
A B D 1: amoeba / Av Dv 2: ameba
A B D 1: amoebas / Av Bv Dv 1: amoebae / Av Dv 2: amebas / Av Dv 2: amebae
A B D 1: amoeba's / Av Dv 2: ameba's
Additional Files
================
The file variant-also.tab contains additional mappings between various The file variant-also.tab contains additional mappings between various
spellings of a word which are not yet in varcon.txt. No attempt is spellings of a word which are not yet in varcon.txt. No attempt is
@ -155,6 +230,7 @@ automatically from the AGID inflection database. The file
variant-wroot.tab is like variant-infl.tab except that it also variant-wroot.tab is like variant-infl.tab except that it also
included the root form of the word. included the root form of the word.
The file voc.tab is similar to varcon.txt but converts between The file voc.tab is similar to varcon.txt but converts between
vocabulary instead of spelling. Unlike varcon.tab it is a simple tab vocabulary instead of spelling. Unlike varcon.tab it is a simple tab
separated file with the fields corresponding to the American, British, separated file with the fields corresponding to the American, British,
@ -163,11 +239,13 @@ the same thing the words are separated with commas. The last column
contains additional notes on when the word is used. Unlike varcon.txt contains additional notes on when the word is used. Unlike varcon.txt
it is generally not suitable for automatic conversion. it is generally not suitable for automatic conversion.
The "make-variant" Perl script will combine varcon.txt, The "make-variant" Perl script will combine varcon.txt,
variant-also.tab, and variant-infl.tab into one huge mapping and will variant-also.tab, and variant-infl.tab into one huge mapping and will
output the result to "variant.tab". If the "no-infl" option is given output the result to "variant.tab". If the "no-infl" option is given
than variant-infl.tab will not be included. than variant-infl.tab will not be included.
The "split" script will split out the information in varcon.txt into The "split" script will split out the information in varcon.txt into
several word lists named as follows: several word lists named as follows:
<spelling>[-v<variant level>][-uncommon].lst <spelling>[-v<variant level>][-uncommon].lst
@ -182,6 +260,7 @@ as follows:
- => 2 - => 2
"-uncommon" is used for forms marked with "(-)" as already described. "-uncommon" is used for forms marked with "(-)" as already described.
The "translate" Perl script will translate a text file from one The "translate" Perl script will translate a text file from one
spelling to another. Its usage is: spelling to another. Its usage is:
@ -199,16 +278,23 @@ Text is read in from standard input and is outputted to standard out.
Words are marked with a '?' before and after the questionable word Words are marked with a '?' before and after the questionable word
when the option is enabled. when the option is enabled.
The file varcon.pm contains some library routines for parsing The file varcon.pm contains some library routines for parsing
varcon.txt and is used by many of the scripts above. varcon.txt and is used by many of the scripts above.
Feedback
========
If you discover any errors in these mappings or have suggestions for If you discover any errors in these mappings or have suggestions for
additions please file a bug report at additions please file a bug report at
https://github.com/kevina/wordlist/issues, or alternatively email me https://github.com/kevina/wordlist/issues, or alternatively email me
directly at kevina@gnu.org, but I will likely tell you to file a bug directly at kevina@gnu.org, but I will likely tell you to file a bug
report so that I don't forget about it. report so that I don't forget about it.
SOURCE:
Sources
=======
These mappings were compiled from numerous sources. These mappings were compiled from numerous sources.
@ -296,9 +382,22 @@ The primary sources for this addition were:
http://blogs.usyd.edu.au/elac/2008/01/webster_in_australia.html http://blogs.usyd.edu.au/elac/2008/01/webster_in_australia.html
CHANGELOG: Changelog
=========
From 2017.08.24 to 2018.10.06 From 2018.10.06 to 2020.12.07
- Additional documentation on file format
- Minor change in file format
- Fix scripts to work with modern versions of Perl.
- Various new entries
- Additional cleanups
From 2017.08.24 to 2019.10.06
- Added entries for: eukaryote, prokaryote, virtualization, volcanism - Added entries for: eukaryote, prokaryote, virtualization, volcanism
@ -423,9 +522,11 @@ From Revision 1 to Revision 2 (January 27, 2001)
words in them. words in them.
- Added variant-infl.tab - Added variant-infl.tab
COPYRIGHT:
Copyright 2000-2018 by Kevin Atkinson Copyright
=========
Copyright 2000-2019 by Kevin Atkinson
Permission to use, copy, modify, distribute and sell this array, the Permission to use, copy, modify, distribute and sell this array, the
associated software, and its documentation for any purpose is hereby associated software, and its documentation for any purpose is hereby

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff