more documentation

5 years ago · 5f2b5e3399
parent e0bc939b93
commit 5f2b5e3399
7 changed files with 36 additions and 22 deletions
--- a/README.md
+++ b/README.md
@ -10,6 +10,7 @@ rga is a line-oriented search tool that allows you to look for a regex in a mult
 - I wanted to add a photograph adapter (based on object classification / detection) for fun, based on something . It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate.
 - 7z adapter (couldn't find a nice to use Rust library)
 - allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json))
+- there are some more (mostly technical) todos in the code

 ## Examples

@ -40,19 +41,23 @@ On the first run rga is mostly faster because of multithreading, but on subseque
 rga should compile with stable Rust. To install it, simply run (your OSes equivalent of)

 ```bash
-apt install build-essential pandoc poppler-utils
+apt install build-essential pandoc poppler-utils ffmpeg
 cargo install ripgrep_all

 rga --help # works! :)
 ```

+You don't necessarily need to install any dependencies, but then you will see an error when trying to read from the corresponding file type (e.g. poppler-utils for pdf).
+
 ## Technical details

 `rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`.

-`rga-preproc [fname]` will match an adapter to the given file based on either it's filename or it's mime type (if `--accurate` is given).
+`rga-preproc [fname]` will match an "adapter" to the given file based on either its filename or its MIME type (if `--accurate` is given). You can see all adapters currently included in [src/adapters](src/adapters).
+
+Some rga adapters run external binaries to do the actual work (such as pandoc or ffmpeg), usually by writing to stdin and reading from stdout.

-Some rga adapters run external binaries
+Most adapters read the files from a [Read](https://doc.rust-lang.org/std/io/trait.Read.html), so they work completely on streamed data (that can come from anywhere including within nested archives). rga-preproc writes

 ## Development

--- a/src/adapters/ffmpeg.rs
+++ b/src/adapters/ffmpeg.rs
@ -7,14 +7,15 @@ use std::io::BufReader;
 use std::process::*;
 // todo:
 // maybe todo: read list of extensions from
-//ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
+// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
+// but really, the probability of getting useful information from a .flv is low
 static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];

 lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "ffmpeg".to_owned(),
        version: 1,
-        description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
+        description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
        fast_matchers: EXTENSIONS
            .iter()
            .map(|s| FastMatcher::FileExtension(s.to_string()))
@ -55,7 +56,10 @@ impl FileAdapter for FFmpegAdapter {
            ..
        } = ai;
        if !is_real_file {
-            // we *could* probably adapt this to also work based on streams, but really when would you want to search for videos within archives?
+            // we *could* probably adapt this to also work based on streams,
+            // it would require using a BufReader to read at least part of the file to memory
+            // but really when would you want to search for videos within archives?
+            // So instead, we only run this adapter if the file is an actual file on disk for now
            writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?;
            return Ok(());
        }
@ -125,7 +129,7 @@ impl FileAdapter for FFmpegAdapter {
            let stdo = cmd.stdout.as_mut().expect("is piped");
            let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
            let mut time: String = "".to_owned();
-            // rewrite subtitle times so they are prefixed in every line
+            // rewrite subtitle times so they are shown as a prefix in every line
            for line in BufReader::new(stdo).lines() {
                let line = line?;
                // 09:55.195 --> 09:56.730
--- a/src/adapters/pandoc.rs
+++ b/src/adapters/pandoc.rs
@ -45,7 +45,9 @@ lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: "pandoc".to_owned(),
        version: 1,
-        description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
+        description:
+            "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
+                .to_owned(),
        fast_matchers: EXTENSIONS
            .iter()
            .map(|s| FastMatcher::FileExtension(s.to_string()))
--- a/src/adapters/pdfpages.rs
+++ b/src/adapters/pdfpages.rs
@ -7,18 +7,16 @@ use lazy_static::lazy_static;
 use std::fs::File;
 use std::io::BufReader;

-
 use std::path::PathBuf;
 use std::process::Command;

-
 static EXTENSIONS: &[&str] = &["pdf"];

 lazy_static! {
 	static ref METADATA: AdapterMeta = AdapterMeta {
 		name: "pdfpages".to_owned(),
 		version: 1,
-		description: "Converts a pdf to it's individual pages as png files".to_owned(),
+		description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(),
 		fast_matchers: EXTENSIONS
 			.iter()
 			.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -41,12 +39,13 @@ impl GetMetadata for PdfPagesAdapter {
 	}
 }

+/// A pdf is basically converted to a zip that has Page X.png files.
+/// This way, something like tesseract can process the pages individually
 impl FileAdapter for PdfPagesAdapter {
 	fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
 		let AdaptInfo {
 			filepath_hint,
 			is_real_file,
-			inp: _,
 			oup,
 			line_prefix,
 			archive_recursion_depth,
@ -75,7 +74,6 @@ impl FileAdapter for PdfPagesAdapter {
 			map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
 		})?;
 		let args = config.args;
-		// TODO: how to handle this copying better?

 		let status = cmd.wait()?;
 		if status.success() {
--- a/src/adapters/poppler.rs
+++ b/src/adapters/poppler.rs
@ -34,6 +34,13 @@ impl GetMetadata for PopplerAdapter {
    }
 }
 impl SpawningFileAdapter for PopplerAdapter {
+    fn get_exe(&self) -> &str {
+        "pdftotext"
+    }
+    fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
+        cmd.arg("-layout").arg("-").arg("-");
+        cmd
+    }
    fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
        // prepend Page X to each line
        let mut page = 1;
@ -48,11 +55,4 @@ impl SpawningFileAdapter for PopplerAdapter {
        }
        Ok(())
    }
-    fn get_exe(&self) -> &str {
-        "pdftotext"
-    }
-    fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
-        cmd.arg("-layout").arg("-").arg("-");
-        cmd
-    }
 }
--- a/src/adapters/spawning.rs
+++ b/src/adapters/spawning.rs
@ -9,14 +9,15 @@ use std::process::Stdio;
 * Copy a Read to a Write, while prefixing every line with a prefix.
 *
 * Try to detect binary files and ignore them. Does not ensure any encoding in the output.
+ *
+ * This is needed because the rg binary detection does not apply to preprocessed files
 */
 pub fn postproc_line_prefix(
    line_prefix: &str,
    inp: &mut dyn Read,
    oup: &mut dyn Write,
 ) -> Fallible<()> {
-    //std::io::copy(inp, oup)?;
-    //return Ok(());
+    // check for null byte in first 4kB (the BufReader capacity below)
    let mut reader = BufReader::with_capacity(1 << 12, inp);
    let fourk = reader.fill_buf()?;
    if fourk.contains(&0u8) {
@ -45,6 +46,8 @@ pub trait SpawningFileAdapter: GetMetadata {
    }
 }

+/// replace a Command.spawn() error "File not found" with a more readable error
+/// to indicate some program is not installed
 pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
    use std::io::ErrorKind::*;
    match err.kind() {
@ -70,6 +73,7 @@ pub fn pipe_output(
    let mut stdo = cmd.stdout.take().expect("is piped");

    // TODO: how to handle this copying better?
+    // do we really need threads for this?
    crossbeam::scope(|s| -> Fallible<()> {
        s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
        std::io::copy(inp, &mut stdi)?;
--- a/src/adapters/sqlite.rs
+++ b/src/adapters/sqlite.rs
@ -65,6 +65,7 @@ impl FileAdapter for SqliteAdapter {
            ..
        } = ai;
        if !is_real_file {
+            // db is in an archive
            // todo: read to memory and then use that blob if size < max
            writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?;
            return Ok(());