fix(config): Resolve ambiguous file types

Before, when two file types matched the same glob, the file type that
one was non-deterministic.

Now, "the more specific" file type wins.  What this means is that we
break up the file by its extensions and prioritize the more literal glob
- If its just `*`, then its lowest priority
- If it contains `*` and other logic, then its next
- If it doesn't contain a `*`, then its the highest priority

This leaves out other glob syntax like `{one,two}` as those are
closed-ended and so considered specific still.

Fixes #487
This commit is contained in:
Ed Page 2022-06-15 15:40:49 -05:00
parent 4cf566d25f
commit 0bb32cc473
6 changed files with 202 additions and 30 deletions

6
Cargo.lock generated
View file

@ -651,9 +651,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]]
name = "globset"
version = "0.4.8"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10463d9ff00a2a068db14231982f5132edebad0d7660cd956a1c30292dbcbfbd"
checksum = "0a1e17342619edbc21a964c2afbeb6c820c6a2560032872f397bb97ea127bd0a"
dependencies = [
"aho-corasick",
"bstr",
@ -1541,6 +1541,7 @@ dependencies = [
"difflib",
"encoding",
"env_logger",
"globset",
"human-panic",
"ignore",
"itertools",
@ -1551,6 +1552,7 @@ dependencies = [
"proc-exit",
"serde",
"serde_json",
"thread_local",
"toml_edit",
"trycmd",
"typed-arena",

View file

@ -94,6 +94,8 @@ typed-arena = "2.0.1"
maplit = "1.0"
unicode-width = "0.1.9"
unic-emoji-char = "0.9.0"
thread_local = "1.1.4"
globset = "0.4.9"
[dev-dependencies]
assert_fs = "1.0"

View file

@ -131,13 +131,8 @@ fn run_type_list(args: &args::Args) -> proc_exit::ExitResult {
let stdout = std::io::stdout();
let mut handle = stdout.lock();
for def in definitions {
writeln!(
handle,
"{}: {}",
def.name(),
itertools::join(def.globs(), ", ")
)?;
for (name, globs) in definitions {
writeln!(handle, "{}: {}", name, itertools::join(globs, ", "))?;
}
Ok(())

183
src/file_type.rs Normal file
View file

@ -0,0 +1,183 @@
use std::collections::BTreeMap;
use kstring::KString;
#[derive(Default, Clone, Debug)]
pub struct TypesBuilder {
definitions: BTreeMap<KString, Vec<KString>>,
}
impl TypesBuilder {
pub fn new() -> Self {
Default::default()
}
pub fn add_defaults(&mut self) {
self.definitions.extend(
crate::default_types::DEFAULT_TYPES
.iter()
.map(|(name, glob)| {
let name = KString::from(*name);
let globs = glob.iter().map(|s| KString::from(*s)).collect();
(name, globs)
}),
);
}
pub fn contains_name(&self, name: &str) -> bool {
self.definitions.contains_key(name)
}
pub fn add(&mut self, name: impl Into<KString>, glob: impl Into<KString>) {
let name = name.into();
let glob = glob.into();
self.definitions.entry(name).or_default().push(glob);
}
pub fn build(self) -> Result<Types, anyhow::Error> {
let mut definitions = self
.definitions
.iter()
.flat_map(|(name, globs)| {
globs.iter().map(move |glob| {
let sort = sort_key(glob);
(sort, name, glob)
})
})
.collect::<Vec<_>>();
definitions.sort();
let mut glob_to_name = Vec::new();
let mut build_set = globset::GlobSetBuilder::new();
for (_, name, glob) in definitions {
glob_to_name.push(name.clone());
build_set.add(
globset::GlobBuilder::new(glob)
.literal_separator(true)
.build()?,
);
}
let set = build_set.build()?;
Ok(Types {
definitions: self.definitions,
glob_to_name,
set,
matches: std::sync::Arc::new(thread_local::ThreadLocal::default()),
})
}
}
fn sort_key(glob: &str) -> Vec<GlobPart<'_>> {
let mut key = glob
.split('.')
.map(|s| {
if s == "*" {
GlobPart::Wild(s)
} else if s.contains('*') {
GlobPart::PartialWild(s)
} else {
GlobPart::Literalish(s)
}
})
.collect::<Vec<_>>();
key.reverse();
key
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum GlobPart<'s> {
Wild(&'s str),
PartialWild(&'s str),
Literalish(&'s str),
}
#[derive(Default, Clone, Debug)]
pub struct Types {
definitions: BTreeMap<KString, Vec<KString>>,
glob_to_name: Vec<KString>,
set: globset::GlobSet,
/// Temporary storage for globs that match.
matches: std::sync::Arc<thread_local::ThreadLocal<std::cell::RefCell<Vec<usize>>>>,
}
impl Types {
pub fn definitions(&self) -> &BTreeMap<KString, Vec<KString>> {
&self.definitions
}
pub fn file_matched(&self, path: &std::path::Path) -> Option<&str> {
let file_name = path.file_name()?;
let mut matches = self.matches.get_or_default().borrow_mut();
self.set.matches_into(file_name, &mut *matches);
matches
.last()
.copied()
.map(|i| self.glob_to_name[i].as_str())
}
}
#[cfg(test)]
mod tests {
use super::*;
macro_rules! matched {
($name:ident, $types:expr, $path:expr, $matched:expr) => {
#[test]
fn $name() {
let mut btypes = TypesBuilder::new();
for (name, globs) in $types {
for glob in *globs {
btypes.add(*name, *glob);
}
}
let types = btypes.build().unwrap();
let actual = types.file_matched(std::path::Path::new($path));
let expected: Option<&str> = $matched.into();
assert_eq!(expected, actual, "{}", $path);
}
};
}
fn types() -> &'static [(&'static str, &'static [&'static str])] {
&[
("html", &["*.html", "*.htm"]),
("js", &["*.js"]),
("json", &["*.json"]),
("lock", &["package-lock.json", "*.lock"]),
]
}
matched!(basic_match, types(), "leftpad.js", "js");
matched!(multi_def_1, types(), "index.html", "html");
matched!(multi_def_2, types(), "index.htm", "html");
matched!(no_match, types(), "leftpad.ada", None);
matched!(more_specific, types(), "package-lock.json", "lock");
macro_rules! sort {
($name:ident, $actual:expr, $expected:expr) => {
#[test]
fn $name() {
let expected = $expected.into_iter().collect::<Vec<&str>>();
let mut actual = $actual.into_iter().collect::<Vec<&str>>();
actual.sort_by_key(|s| sort_key(s));
assert_eq!(expected, actual);
}
};
}
sort!(literal_sort, ["b", "c", "a"], ["a", "b", "c"]);
sort!(
basic_glob_sort,
["a_specific", "z_partial*"],
["z_partial*", "a_specific"]
);
sort!(
nested_glob_sort,
["a.specific", "z*.partial", "z.partial*"],
["z.partial*", "z*.partial", "a.specific"]
);
sort!(most_specific, ["*.txt.in", "*.in"], ["*.in", "*.txt.in"]);
}

View file

@ -13,3 +13,4 @@ pub mod policy;
pub mod report;
mod default_types;
mod file_type;

View file

@ -76,7 +76,10 @@ impl<'s> ConfigEngine<'s> {
self.get_walk(dir)
}
pub fn file_types(&self, cwd: &std::path::Path) -> &[ignore::types::FileTypeDef] {
pub fn file_types(
&self,
cwd: &std::path::Path,
) -> &std::collections::BTreeMap<kstring::KString, Vec<kstring::KString>> {
debug_assert!(cwd.is_absolute(), "{} is not absolute", cwd.display());
let dir = self
.configs
@ -176,25 +179,17 @@ impl<'s> ConfigEngine<'s> {
let walk = self.walk.intern(files);
let mut type_matcher = ignore::types::TypesBuilder::new();
for &(name, exts) in crate::default_types::DEFAULT_TYPES {
for ext in exts {
type_matcher.add(name, ext).expect("all defaults are valid");
}
}
let mut type_matcher = crate::file_type::TypesBuilder::new();
type_matcher.add_defaults();
let mut types: std::collections::HashMap<_, _> = Default::default();
for (type_name, type_engine) in type_.patterns() {
if type_engine.extend_glob.is_empty() {
if type_matcher
.definitions()
.iter()
.all(|def| def.name() != type_name.as_str())
{
if !type_matcher.contains_name(&type_name) {
anyhow::bail!("Unknown type definition `{}`, pass `--type-list` to see valid names or set `extend_glob` to add a new one.", type_name);
}
} else {
for glob in type_engine.extend_glob.iter() {
type_matcher.add(type_name.as_str(), glob.as_str())?;
type_matcher.add(type_name.as_ref(), glob.as_ref());
}
}
@ -208,8 +203,6 @@ impl<'s> ConfigEngine<'s> {
default.update(&overrides);
let default = self.init_file_config(default);
type_matcher.select("all");
let dir = DirConfig {
walk,
default,
@ -302,16 +295,12 @@ struct DirConfig {
walk: usize,
default: FileConfig,
types: std::collections::HashMap<kstring::KString, FileConfig>,
type_matcher: ignore::types::Types,
type_matcher: crate::file_type::Types,
}
impl DirConfig {
fn get_file_config(&self, path: &std::path::Path) -> FileConfig {
let match_ = self.type_matcher.matched(path, false);
let name = match_
.inner()
.and_then(|g| g.file_type_def())
.map(|f| f.name());
let name = self.type_matcher.file_matched(path);
name.and_then(|name| {
log::debug!("{}: `{}` policy", path.display(), name);