From 8070a94d84aae06f9968cbe93da73867b7f70dff Mon Sep 17 00:00:00 2001 From: phiresky Date: Tue, 9 Jun 2020 18:27:22 +0200 Subject: [PATCH] make poppler and pandoc internal custom adapters --- Cargo.lock | 34 +++++++++++---- Cargo.toml | 2 +- README.md | 2 +- doc/notes.md | 26 ++++++++++++ src/adapters.rs | 32 +++++++------- src/adapters/custom.rs | 80 ++++++++++++++++++++++++++++++++++- src/adapters/decompress.rs | 1 + src/adapters/ffmpeg.rs | 3 +- src/adapters/pandoc.rs | 86 -------------------------------------- src/adapters/pdfpages.rs | 3 +- src/adapters/poppler.rs | 39 ++--------------- src/adapters/sqlite.rs | 3 +- src/adapters/tar.rs | 3 +- src/adapters/tesseract.rs | 3 +- src/adapters/zip.rs | 3 +- src/args.rs | 32 ++++++++++---- 16 files changed, 187 insertions(+), 165 deletions(-) create mode 100644 doc/notes.md delete mode 100644 src/adapters/pandoc.rs diff --git a/Cargo.lock b/Cargo.lock index 166de5f..2aad29b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -294,6 +294,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "dyn-clone" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ec9c7fb9a2ce708751c98e31ccbae74b6ab194f5c8e30cfb7ed62e38b70866" + [[package]] name = "either" version = "1.5.3" @@ -479,6 +485,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c398b2b113b55809ceb9ee3e753fcbac793f1956663f3c36549c1346015c2afe" +dependencies = [ + "autocfg 1.0.0", + "serde", +] + [[package]] name = "itertools" version = "0.9.0" @@ -862,9 +878,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea" +checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" dependencies = [ "proc-macro2", ] @@ -1173,10 +1189,12 @@ checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" [[package]] name = "schemars" -version = "0.7.6" +version = "0.8.0-alpha-2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be77ed66abed6954aabf6a3e31a84706bedbf93750d267e92ef4a6d90bbd6a61" +checksum = "a0d3111dca36beaa5be680b8d031d2416e5d0e66aac8118893d42792a6ea8996" dependencies = [ + "dyn-clone", + "indexmap", "schemars_derive", "serde", "serde_json", @@ -1184,9 +1202,9 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "0.7.6" +version = "0.8.0-alpha-2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11af7a475c9ee266cfaa9e303a47c830ebe072bf3101ab907a7b7b9d816fa01d" +checksum = "0e066c77ba237124b99881dfb3022cd7f4b477e19abcdfffd264c6693929a0a5" dependencies = [ "proc-macro2", "quote", @@ -1336,9 +1354,9 @@ dependencies = [ [[package]] name = "synstructure" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545" +checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 4c95747..88f8945 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ paste = "0.1.16" tempfile = "3.1.0" glob = "0.3.0" anyhow = "1.0.31" -schemars = "0.7.6" +schemars = {version = "0.8.0-alpha-2", features = ["preserve_order"]} directories-next = "1.0.1" derive_more = "0.99.7" pretty-bytes = "0.2.2" diff --git a/README.md b/README.md index b1f1be4..a5e8ff4 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ On Arch Linux, you can simply install from AUR: `yay -S ripgrep-all`. On Debian-based distributions you can download the [rga binary][latestrelease] and get the dependencies like this: -`apt install ripgrep pandoc poppler-utils ffmpeg cargo` +`apt install ripgrep pandoc poppler-utils ffmpeg` If ripgrep is not included in your package sources, get it from [here](https://github.com/BurntSushi/ripgrep/releases). diff --git a/doc/notes.md b/doc/notes.md new file mode 100644 index 0000000..ce63621 --- /dev/null +++ b/doc/notes.md @@ -0,0 +1,26 @@ +## schema -> ui generation + +https://json-schema.org/implementations.html#web-ui-generation + +- https://github.com/guillotinaweb/ngx-schema-form +- https://github.com/hamzahamidi/ajsf angular igh +- https://github.com/dashjoin/json-schema-form +- https://github.com/json-editor/json-editor +- https://github.com/jsonform/jsonform +- https://github.com/vazco/uniforms + +## json schema is ridiculous + + "mimetypes": { + "description": "if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching", + "type": [ + "array", + "null" + ], + "items": { + "type": "string" + } + }, + + what the fuck???? + this is the only thing required to see that json schema has horrible design diff --git a/src/adapters.rs b/src/adapters.rs index c857119..ebeb2e4 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -1,7 +1,6 @@ pub mod custom; pub mod decompress; pub mod ffmpeg; -pub mod pandoc; pub mod pdfpages; pub mod poppler; pub mod spawning; @@ -12,6 +11,7 @@ pub mod zip; use crate::matching::*; use crate::preproc::PreprocConfig; use anyhow::*; +use custom::builtin_spawning_adapters; use custom::CustomAdapterConfig; use log::*; use regex::Regex; @@ -35,6 +35,8 @@ pub struct AdapterMeta { /// list of matchers when we have mime type detection active (interpreted as ORed) /// warning: this *overrides* the fast matchers pub slow_matchers: Option>, + // if true, adapter is only used when user lists it in `--rga-adapters` + pub disabled_by_default: bool, } impl AdapterMeta { // todo: this is pretty ugly @@ -83,34 +85,32 @@ type AdaptersTuple = (Vec>, Vec>); pub fn get_all_adapters(custom_adapters: Option>) -> AdaptersTuple { // order in descending priority - let mut enabled_adapters: Vec> = vec![]; - let mut disabled_adapters: Vec> = vec![]; + let mut adapters: Vec> = vec![]; if let Some(custom_adapters) = custom_adapters { for adapter_config in custom_adapters { - if adapter_config.default_disabled.unwrap_or(false) { - disabled_adapters.push(Rc::new(adapter_config.to_adapter())); - } else { - enabled_adapters.push(Rc::new(adapter_config.to_adapter())); - } + adapters.push(Rc::new(adapter_config.to_adapter())); } } - let internal_enabled_adapters: Vec> = vec![ + let internal_adapters: Vec> = vec![ Rc::new(ffmpeg::FFmpegAdapter::new()), - Rc::new(pandoc::PandocAdapter::new()), - Rc::new(poppler::PopplerAdapter::new()), Rc::new(zip::ZipAdapter::new()), Rc::new(decompress::DecompressAdapter::new()), Rc::new(tar::TarAdapter::new()), Rc::new(sqlite::SqliteAdapter::new()), - ]; - enabled_adapters.extend(internal_enabled_adapters); - let internal_disabled_adapters: Vec> = vec![ Rc::new(pdfpages::PdfPagesAdapter::new()), Rc::new(tesseract::TesseractAdapter::new()), ]; - disabled_adapters.extend(internal_disabled_adapters); - (enabled_adapters, disabled_adapters) + adapters.extend( + builtin_spawning_adapters + .iter() + .map(|e| -> Rc { Rc::new(e.clone().to_adapter()) }), + ); + adapters.extend(internal_adapters); + + adapters + .into_iter() + .partition(|e| !e.metadata().disabled_by_default) } /** diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs index 732a3cc..7c46d96 100644 --- a/src/adapters/custom.rs +++ b/src/adapters/custom.rs @@ -1,6 +1,6 @@ use super::{spawning::SpawningFileAdapter, AdapterMeta, GetMetadata}; use crate::matching::{FastMatcher, SlowMatcher}; - +use lazy_static::lazy_static; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -12,7 +12,7 @@ pub struct CustomAdapterConfig { /// a description of this adapter. shown in help pub description: String, /// if true, the adapter will be disabled by default - pub default_disabled: Option, + pub disabled_by_default: Option, /// version identifier. used to key cache entries, change if the configuration or program changes pub version: i32, /// the file extensions this adapter supports. For example ["epub", "mobi"] @@ -27,6 +27,81 @@ pub struct CustomAdapterConfig { pub args: Vec, } +fn strs(arr: &[&str]) -> Vec { + arr.iter().map(ToString::to_string).collect() +} + +lazy_static! { + pub static ref builtin_spawning_adapters: Vec = vec![ + // from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs + // excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based) + //"db" -> Just "docbook" + //"adoc" -> Just "asciidoc" + //"asciidoc" -> Just "asciidoc" + //"context" -> Just "context" + //"ctx" -> Just "context" + //"dokuwiki" -> Just "dokuwiki" + //"htm" -> Just "html" + //"html" -> Just "html" + //"json" -> Just "json" + //"latex" -> Just "latex" + //"lhs" -> Just "markdown+lhs" + //"ltx" -> Just "latex" + //"markdown" -> Just "markdown" + //"md" -> Just "markdown" + //"ms" -> Just "ms" + //"muse" -> Just "muse" + //"native" -> Just "native" + //"opml" -> Just "opml" + //"org" -> Just "org" + //"roff" -> Just "ms" + //"rst" -> Just "rst" + //"s5" -> Just "s5" + //"t2t" -> Just "t2t" + //"tei" -> Just "tei" + //"tei.xml" -> Just "tei" + //"tex" -> Just "latex" + //"texi" -> Just "texinfo" + //"texinfo" -> Just "texinfo" + //"textile" -> Just "textile" + //"text" -> Just "markdown" + //"txt" -> Just "markdown" + //"xhtml" -> Just "html" + //"wiki" -> Just "mediawiki" + CustomAdapterConfig { + name: "pandoc".to_string(), + description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text".to_string(), + version: 3, + extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb"]), + binary: "pandoc".to_string(), + mimetypes: None, + // simpler markown (with more information loss but plainer text) + //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") + args: strs(&[ + "--from={file_extension}", + "--to=plain", + "--wrap=none", + "--atx-headers" + ]), + disabled_by_default: None + }, + CustomAdapterConfig { + name: "poppler".to_owned(), + version: 1, + description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" + .to_owned(), + + extensions: strs(&["pdf"]), + mimetypes: Some(strs(&["application/pdf"])), + + binary: "pdftotext".to_string(), + args: strs(&["-", "-"]), + disabled_by_default: None, + // postprocessors: [{name: "add_page_numbers_by_pagebreaks"}] + } + ]; +} + pub struct CustomSpawningFileAdapter { binary: String, args: Vec, @@ -76,6 +151,7 @@ impl CustomAdapterConfig { .map(|s| SlowMatcher::MimeType(s.to_string())) .collect() }), + disabled_by_default: self.disabled_by_default.unwrap_or(false), }, } } diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index 60fc6ac..2b2bf84 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -30,6 +30,7 @@ lazy_static! { .map(|s| SlowMatcher::MimeType(s.to_string())) .collect() ), + disabled_by_default: false }; } #[derive(Default)] diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index f7791ed..e23751b 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -21,7 +21,8 @@ lazy_static! { .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), - slow_matchers: None + slow_matchers: None, + disabled_by_default: false }; } diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs deleted file mode 100644 index 0eaf7ed..0000000 --- a/src/adapters/pandoc.rs +++ /dev/null @@ -1,86 +0,0 @@ -use super::*; -use lazy_static::lazy_static; -use spawning::SpawningFileAdapter; -use std::process::Command; - -// from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs -// excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based) -//"db" -> Just "docbook" -//"adoc" -> Just "asciidoc" -//"asciidoc" -> Just "asciidoc" -//"context" -> Just "context" -//"ctx" -> Just "context" -//"dokuwiki" -> Just "dokuwiki" -//"htm" -> Just "html" -//"html" -> Just "html" -//"json" -> Just "json" -//"latex" -> Just "latex" -//"lhs" -> Just "markdown+lhs" -//"ltx" -> Just "latex" -//"markdown" -> Just "markdown" -//"md" -> Just "markdown" -//"ms" -> Just "ms" -//"muse" -> Just "muse" -//"native" -> Just "native" -//"opml" -> Just "opml" -//"org" -> Just "org" -//"roff" -> Just "ms" -//"rst" -> Just "rst" -//"s5" -> Just "s5" -//"t2t" -> Just "t2t" -//"tei" -> Just "tei" -//"tei.xml" -> Just "tei" -//"tex" -> Just "latex" -//"texi" -> Just "texinfo" -//"texinfo" -> Just "texinfo" -//"textile" -> Just "textile" -//"text" -> Just "markdown" -//"txt" -> Just "markdown" -//"xhtml" -> Just "html" -//"wiki" -> Just "mediawiki" - -static EXTENSIONS: &[&str] = &["epub", "odt", "docx", "fb2", "ipynb"]; - -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "pandoc".to_owned(), - version: 3, - description: - "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text" - .to_owned(), - recurses: false, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None - }; -} -#[derive(Default)] -pub struct PandocAdapter; - -impl PandocAdapter { - pub fn new() -> PandocAdapter { - PandocAdapter - } -} -impl GetMetadata for PandocAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA - } -} -impl SpawningFileAdapter for PandocAdapter { - fn get_exe(&self) -> &str { - "pandoc" - } - fn command(&self, filepath_hint: &Path, mut cmd: Command) -> Command { - cmd.arg("--from") - .arg(filepath_hint.extension().unwrap()) - // simpler markown (with more information loss but plainer text) - //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") - .arg("--to=plain") - .arg("--wrap=none") - .arg("--atx-headers"); - cmd - } -} diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs index 5984dcf..e4bafa4 100644 --- a/src/adapters/pdfpages.rs +++ b/src/adapters/pdfpages.rs @@ -22,7 +22,8 @@ lazy_static! { .collect(), slow_matchers: Some(vec![SlowMatcher::MimeType( "application/pdf".to_owned() - )]) + )]), + disabled_by_default: true }; } #[derive(Default)] diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index 5a474c2..addc9df 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -3,45 +3,11 @@ use lazy_static::lazy_static; use spawning::SpawningFileAdapter; use std::io::BufReader; use std::process::Command; - +/* static EXTENSIONS: &[&str] = &["pdf"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "poppler".to_owned(), - version: 1, - description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" - .to_owned(), - recurses: false, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![SlowMatcher::MimeType("application/pdf".to_owned())]) - }; -} -#[derive(Default)] -pub struct PopplerAdapter; - -impl PopplerAdapter { - pub fn new() -> PopplerAdapter { - PopplerAdapter - } -} -impl GetMetadata for PopplerAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA - } -} -impl SpawningFileAdapter for PopplerAdapter { - fn get_exe(&self) -> &str { - "pdftotext" - } - fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command { - cmd.arg("-").arg("-"); - cmd - } + postproc: "add_lines" fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Result<()> { // prepend Page X to each line let mut page = 1; @@ -60,3 +26,4 @@ impl SpawningFileAdapter for PopplerAdapter { Ok(()) } } +*/ diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index ed89a8c..762a5fa 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -22,7 +22,8 @@ lazy_static! { .collect(), slow_matchers: Some(vec![SlowMatcher::MimeType( "application/x-sqlite3".to_owned() - )]) + )]), + disabled_by_default: false }; } diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 76724cf..9ec6efc 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -18,7 +18,8 @@ lazy_static! { .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), - slow_matchers: None + slow_matchers: None, + disabled_by_default: false }; } #[derive(Default)] diff --git a/src/adapters/tesseract.rs b/src/adapters/tesseract.rs index c1244c7..810feba 100644 --- a/src/adapters/tesseract.rs +++ b/src/adapters/tesseract.rs @@ -15,7 +15,8 @@ lazy_static! { .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), - slow_matchers: None + slow_matchers: None, + disabled_by_default: true }; } #[derive(Default)] diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index c77c782..f43b4c2 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -20,7 +20,8 @@ lazy_static! { .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) .collect(), - slow_matchers: Some(vec![SlowMatcher::MimeType("application/zip".to_owned())]) + slow_matchers: Some(vec![SlowMatcher::MimeType("application/zip".to_owned())]), + disabled_by_default: false }; } #[derive(Default)] diff --git a/src/args.rs b/src/args.rs index 77a633e..73b1258 100644 --- a/src/args.rs +++ b/src/args.rs @@ -5,7 +5,7 @@ use log::*; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::ffi::OsString; -use std::{fs::File, io::Write, iter::IntoIterator, str::FromStr}; +use std::{fs::File, io::Write, iter::IntoIterator, path::PathBuf, str::FromStr}; use structopt::StructOpt; #[derive(Debug, Deserialize, Serialize)] @@ -179,6 +179,10 @@ pub struct RgaConfig { ////////////////////////////////////////// //////////////////////////// CMD line only ////////////////////////////////////////// + #[serde(skip)] + #[structopt(long = "--rga-config-file", require_equals = true)] + pub config_file_path: Option, + /// same as passing path directly, except if argument is empty /// kinda hacky, but if no file is found, fzf calls rga with empty string as path, which causes No such file or directory from rg. So filter those cases and return specially #[serde(skip)] @@ -222,22 +226,31 @@ fn json_merge(a: &mut Value, b: &Value) { } } -fn read_config_file() -> Result<(String, Value)> { +fn read_config_file(path_override: Option) -> Result<(String, Value)> { let proj = project_dirs()?; let config_dir = proj.config_dir(); - let config_filename = config_dir.join("config.json"); + let config_filename = path_override + .as_ref() + .map(|e| PathBuf::from(e)) + .unwrap_or(config_dir.join("config.jsonc")); let config_filename_str = config_filename.to_string_lossy().into_owned(); if config_filename.exists() { let config_file_contents = std::fs::read_to_string(config_filename) .with_context(|| format!("Could not read config file json {}", config_filename_str))?; { // just for error messages - serde_json::from_str(&config_file_contents) - .with_context(|| format!("Error in config file: {}", config_file_contents))?; + serde_json::from_str::(&config_file_contents).with_context(|| { + format!( + "Error in config file {}: {}", + config_filename_str, config_file_contents + ) + })?; } let config_json: serde_json::Value = serde_json::from_str(&config_file_contents).context("Could not parse config json")?; Ok((config_filename_str, config_json)) + } else if let Some(p) = path_override.as_ref() { + Err(anyhow::anyhow!("Config file not found: {}", p))? } else { // write default config std::fs::create_dir_all(config_dir)?; @@ -256,7 +269,7 @@ fn read_config_file() -> Result<(String, Value)> { } _ => panic!("impos"), } - let mut configfile = File::create(config_dir.join("config.json"))?; + let mut configfile = File::create(config_filename)?; configfile.write(serde_json::to_string_pretty(&config_json)?.as_bytes())?; Ok((config_filename_str, config_json)) } @@ -276,7 +289,7 @@ where { // TODO: don't read config file in rga-preproc for performance (called for every file) - let arg_matches = RgaConfig::from_iter(args); + let arg_matches: RgaConfig = RgaConfig::from_iter(args); let args_config = serde_json::to_value(&arg_matches)?; let merged_config = { @@ -288,8 +301,9 @@ where merged_config } else { // read from config file, env and args + let (config_filename, config_file_config) = + read_config_file(arg_matches.config_file_path)?; let env_var_config = read_config_env()?; - let (config_filename, config_file_config) = read_config_file()?; let mut merged_config = config_file_config.clone(); json_merge(&mut merged_config, &env_var_config); json_merge(&mut merged_config, &args_config); @@ -357,7 +371,7 @@ pub fn split_args(is_rga_preproc: bool) -> Result<(RgaConfig, Vec)> { } }); debug!("rga (our) args: {:?}", our_args); - let matches = parse_args(our_args, is_rga_preproc).context("Could not parse args")?; + let matches = parse_args(our_args, is_rga_preproc).context("Could not parse config")?; if matches.rg_help { passthrough_args.insert(0, "--help".into()); }