use std::{
    convert::Infallible,
    env, fs,
    path::{Path, PathBuf},
    str::FromStr,
};

use anyhow::{anyhow, bail};
use indexmap::IndexMap;
use itertools::Itertools;
use once_cell::sync::Lazy;
use regex::Regex;
use serde_derive::Deserialize;
use serde_with::DeserializeFromStr;
use walkdir::WalkDir;

/// Known mapping targets.
///
/// Corresponds to `syntax_mapping::MappingTarget`.
#[allow(clippy::enum_variant_names)]
#[derive(Clone, Debug, Eq, PartialEq, Hash, DeserializeFromStr)]
pub enum MappingTarget {
    MapTo(String),
    MapToUnknown,
    MapExtensionToUnknown,
}

impl FromStr for MappingTarget {
    type Err = Infallible;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "MappingTarget::MapToUnknown" => Ok(Self::MapToUnknown),
            "MappingTarget::MapExtensionToUnknown" => Ok(Self::MapExtensionToUnknown),
            syntax => Ok(Self::MapTo(syntax.into())),
        }
    }
}

impl MappingTarget {
    fn codegen(&self) -> String {
        match self {
            Self::MapTo(syntax) => format!(r###"MappingTarget::MapTo(r#"{syntax}"#)"###),
            Self::MapToUnknown => "MappingTarget::MapToUnknown".into(),
            Self::MapExtensionToUnknown => "MappingTarget::MapExtensionToUnknown".into(),
        }
    }
}
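// A brief illustrative check of the conventions above (compiled only under a
// test harness; the syntax name "TypeScript" is an arbitrary example): the
// sentinel strings parse to the unit variants, anything else becomes `MapTo`,
// and `codegen` renders the corresponding Rust expression.
#[cfg(test)]
#[test]
fn mapping_target_parse_and_codegen_example() {
    let target: MappingTarget = "TypeScript".parse().unwrap();
    assert_eq!(target, MappingTarget::MapTo("TypeScript".into()));
    assert_eq!(target.codegen(), r##"MappingTarget::MapTo(r#"TypeScript"#)"##);

    let unknown: MappingTarget = "MappingTarget::MapToUnknown".parse().unwrap();
    assert_eq!(unknown, MappingTarget::MapToUnknown);
    assert_eq!(unknown.codegen(), "MappingTarget::MapToUnknown");
}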
/// A single matcher.
///
/// Codegen converts this into a `Lazy<Option<GlobMatcher>>`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, DeserializeFromStr)]
struct Matcher(Vec<MatcherSegment>);

/// Parse a matcher.
///
/// Note that this implementation is rather strict: it greedily interprets
/// every valid environment variable replacement as such, then immediately
/// hard-errors if it finds a '$' anywhere in the remaining text segments.
///
/// The reason for this strictness is that I currently cannot think of a valid
/// reason why you would ever need '$' as plaintext in a glob pattern, so any
/// such occurrence is likely a human error.
///
/// If we later discover some edge cases, it's okay to make it more permissive.
///
/// Revision history:
/// - 2024-02-20: allow `{` and `}` (glob brace expansion)
impl FromStr for Matcher {
    type Err = anyhow::Error;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        use MatcherSegment as Seg;
        static VAR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\$\{([\w\d_]+)\}").unwrap());

        let mut segments = vec![];
        let mut text_start = 0;
        for capture in VAR_REGEX.captures_iter(s) {
            let match_0 = capture.get(0).unwrap();

            // text before this var
            let text_end = match_0.start();
            segments.push(Seg::Text(s[text_start..text_end].into()));
            text_start = match_0.end();

            // this var
            segments.push(Seg::Env(capture.get(1).unwrap().as_str().into()));
        }
        // possible trailing text
        segments.push(Seg::Text(s[text_start..].into()));

        // clean up empty text segments
        let non_empty_segments = segments
            .into_iter()
            .filter(|seg| seg.text().map(|t| !t.is_empty()).unwrap_or(true))
            .collect_vec();

        // sanity check
        if non_empty_segments
            .windows(2)
            .any(|segs| segs[0].is_text() && segs[1].is_text())
        {
            unreachable!("Parsed into consecutive text segments: {non_empty_segments:?}");
        }

        // guard against the empty case
        if non_empty_segments.is_empty() {
            bail!(r#"Parsed an empty matcher: "{s}""#);
        }

        // guard against leftover variable-syntax fragments
        if non_empty_segments
            .iter()
            .filter_map(Seg::text)
            .any(|t| t.contains('$'))
        {
            bail!(r#"Invalid matcher: "{s}""#);
        }

        Ok(Self(non_empty_segments))
    }
}

impl Matcher {
    fn codegen(&self) -> String {
        match self.0.len() {
            0 => unreachable!("0-length matcher should never be created"),
            // an if-let guard would be ideal here
            // see: https://github.com/rust-lang/rust/issues/51114
            1 if self.0[0].is_text() => {
                let s = self.0[0].text().unwrap();
                format!(r###"Lazy::new(|| Some(build_matcher_fixed(r#"{s}"#)))"###)
            }
            // parser logic ensures that this case can only happen when there
            // are dynamic segments
            _ => {
                let segs = self.0.iter().map(MatcherSegment::codegen).join(", ");
                format!(r###"Lazy::new(|| build_matcher_dynamic(&[{segs}]))"###)
            }
        }
    }
}

/// A segment in a matcher.
///
/// Corresponds to `syntax_mapping::MatcherSegment`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum MatcherSegment {
    Text(String),
    Env(String),
}

#[allow(dead_code)]
impl MatcherSegment {
    fn is_text(&self) -> bool {
        matches!(self, Self::Text(_))
    }
    fn is_env(&self) -> bool {
        matches!(self, Self::Env(_))
    }
    fn text(&self) -> Option<&str> {
        match self {
            Self::Text(t) => Some(t),
            Self::Env(_) => None,
        }
    }
    fn env(&self) -> Option<&str> {
        match self {
            Self::Text(_) => None,
            Self::Env(t) => Some(t),
        }
    }
    fn codegen(&self) -> String {
        match self {
            Self::Text(s) => format!(r###"MatcherSegment::Text(r#"{s}"#)"###),
            Self::Env(s) => format!(r###"MatcherSegment::Env(r#"{s}"#)"###),
        }
    }
}
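// A small illustrative check of the matcher parser above (compiled only under
// a test harness; the patterns are made up for illustration): a `${...}`
// reference becomes an `Env` segment, surrounding text becomes `Text`
// segments, and a stray '$' outside that syntax is rejected.
#[cfg(test)]
#[test]
fn matcher_parse_example() {
    use MatcherSegment as Seg;

    let matcher: Matcher = "${XDG_CONFIG_HOME}/example/*.conf".parse().unwrap();
    assert_eq!(
        matcher,
        Matcher(vec![
            Seg::Env("XDG_CONFIG_HOME".into()),
            Seg::Text("/example/*.conf".into()),
        ])
    );

    assert!("literal-$-dollar".parse::<Matcher>().is_err());
}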
/// A struct that models a single .toml file in /src/syntax_mapping/builtins/.
#[derive(Clone, Debug, Deserialize)]
struct MappingDefModel {
    mappings: IndexMap<MappingTarget, Vec<Matcher>>,
}

impl MappingDefModel {
    fn into_mapping_list(self) -> MappingList {
        let list = self
            .mappings
            .into_iter()
            .flat_map(|(target, matchers)| {
                matchers
                    .into_iter()
                    .map(|matcher| (matcher, target.clone()))
                    .collect::<Vec<_>>()
            })
            .collect();
        MappingList(list)
    }
}

#[derive(Clone, Debug)]
struct MappingList(Vec<(Matcher, MappingTarget)>);

impl MappingList {
    fn codegen(&self) -> String {
        let array_items: Vec<_> = self
            .0
            .iter()
            .map(|(matcher, target)| {
                format!("({m}, {t})", m = matcher.codegen(), t = target.codegen())
            })
            .collect();
        let len = array_items.len();

        format!(
            "/// Generated by build script from /src/syntax_mapping/builtins/.\n\
            pub(crate) static BUILTIN_MAPPINGS: [(Lazy<Option<GlobMatcher>>, MappingTarget); {len}] = [\n{items}\n];",
            items = array_items.join(",\n")
        )
    }
}

/// Get the list of paths to all mapping definition files that should be
/// included for the current target platform.
fn get_def_paths() -> anyhow::Result<Vec<PathBuf>> {
    let source_subdirs = [
        "common",
        #[cfg(target_family = "unix")]
        "unix-family",
        #[cfg(any(
            target_os = "freebsd",
            target_os = "netbsd",
            target_os = "openbsd",
            target_os = "macos"
        ))]
        "bsd-family",
        #[cfg(target_os = "linux")]
        "linux",
        #[cfg(target_os = "macos")]
        "macos",
        #[cfg(target_os = "windows")]
        "windows",
    ];

    let mut toml_paths = vec![];
    for subdir in source_subdirs {
        let wd = WalkDir::new(Path::new("src/syntax_mapping/builtins").join(subdir));
        let paths = wd
            .into_iter()
            .filter_map_ok(|entry| {
                let path = entry.path();
                (path.is_file() && path.extension().map(|ext| ext == "toml").unwrap_or(false))
                    .then(|| path.to_owned())
            })
            .collect::<Result<Vec<_>, _>>()?;
        toml_paths.extend(paths);
    }

    toml_paths.sort_by_key(|path| {
        path.file_name()
            .expect("file name should not terminate in ..")
            .to_owned()
    });

    Ok(toml_paths)
}

fn read_all_mappings() -> anyhow::Result<MappingList> {
    let mut all_mappings = vec![];

    for path in get_def_paths()? {
        let toml_string = fs::read_to_string(path)?;
        let mappings = toml::from_str::<MappingDefModel>(&toml_string)?.into_mapping_list();
        all_mappings.extend(mappings.0);
    }

    let duplicates = all_mappings
        .iter()
        .duplicates_by(|(matcher, _)| matcher)
        .collect_vec();
    if !duplicates.is_empty() {
        bail!("Rules with duplicate matchers found: {duplicates:?}");
    }

    Ok(MappingList(all_mappings))
}

/// Build the static syntax mappings defined in /src/syntax_mapping/builtins/
/// into a .rs source file, which is to be inserted with `include!`.
pub fn build_static_mappings() -> anyhow::Result<()> {
    println!("cargo:rerun-if-changed=src/syntax_mapping/builtins/");

    let mappings = read_all_mappings()?;

    let codegen_path = Path::new(&env::var_os("OUT_DIR").ok_or(anyhow!("OUT_DIR is unset"))?)
        .join("codegen_static_syntax_mappings.rs");

    fs::write(codegen_path, mappings.codegen())?;

    Ok(())
}
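// A minimal sketch of the definition-file shape that `MappingDefModel`
// expects, using made-up targets and patterns (the real files live in
// /src/syntax_mapping/builtins/): each key under `[mappings]` is a
// `MappingTarget` string and each value is a list of matcher strings.
// Compiled only under a test harness.
#[cfg(test)]
#[test]
fn mapping_def_model_example() {
    let sample = r#"
        [mappings]
        "INI" = ["*.conf", "${XDG_CONFIG_HOME}/example/*.cfg"]
        "MappingTarget::MapToUnknown" = ["*.example-lock"]
    "#;

    let list = toml::from_str::<MappingDefModel>(sample)
        .expect("sample definition should deserialize")
        .into_mapping_list();

    // two "INI" matchers plus one MapToUnknown matcher => three (matcher, target) pairs
    assert_eq!(list.0.len(), 3);
    assert_eq!(list.0[0].1, MappingTarget::MapTo("INI".into()));
    assert_eq!(list.0[2].1, MappingTarget::MapToUnknown);
}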