more documentation

pull/11/head
phiresky 5 years ago
parent e0bc939b93
commit 5f2b5e3399

@ -10,6 +10,7 @@ rga is a line-oriented search tool that allows you to look for a regex in a mult
- I wanted to add a photograph adapter (based on object classification / detection) for fun. It worked with [YOLO](https://pjreddie.com/darknet/yolo/), but something more useful and state-of-the-art [like this](https://github.com/aimagelab/show-control-and-tell) proved very hard to integrate.
- 7z adapter (couldn't find a nice to use Rust library)
- allow per-adapter configuration options (probably via env (RGA_ADAPTER_CONF=json))
- there's some more (mostly technical) todos in the code
## Examples
@ -40,19 +41,23 @@ On the first run rga is mostly faster because of multithreading, but on subseque
rga should compile with stable Rust. To install it, simply run (your OS's equivalent of)
```bash
apt install build-essential pandoc poppler-utils
apt install build-essential pandoc poppler-utils ffmpeg
cargo install ripgrep_all
rga --help # works! :)
```
You don't necessarily need to install any dependencies, but then you will see an error when trying to read from the corresponding file type (e.g. poppler-utils for pdf).
## Technical details
`rga` simply runs ripgrep (`rg`) with some options set, especially `--pre=rga-preproc` and `--pre-glob`.
`rga-preproc [fname]` will match an adapter to the given file based on either its filename or its mime type (if `--accurate` is given).
`rga-preproc [fname]` will match an "adapter" to the given file based on either its filename or its mime type (if `--accurate` is given). You can see all adapters currently included in [src/adapters](src/adapters).
Some rga adapters run external binaries to do the actual work (such as pandoc or ffmpeg), usually by writing to stdin and reading from stdout.
Some rga adapters run external binaries
Most adapters read the files from a [Read](https://doc.rust-lang.org/std/io/trait.Read.html), so they work completely on streamed data (that can come from anywhere including within nested archives). rga-preproc writes
## Development

@ -7,14 +7,15 @@ use std::io::BufReader;
use std::process::*;
// todo:
// maybe todo: read list of extensions from
//ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
// ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null
// but really, the probability of getting useful information from a .flv is low
static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi"];
lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "ffmpeg".to_owned(),
version: 1,
description: "Uses ffmpeg to extract video metadata and subtitles".to_owned(),
description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(),
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -55,7 +56,10 @@ impl FileAdapter for FFmpegAdapter {
..
} = ai;
if !is_real_file {
// we *could* probably adapt this to also work based on streams, but really when would you want to search for videos within archives?
// we *could* probably adapt this to also work based on streams,
// it would require using a BufReader to read at least part of the file to memory
// but really when would you want to search for videos within archives?
// So instead, we only run this adapter if the file is an actual file on disk for now
writeln!(oup, "{}[rga: skipping video in archive]", line_prefix,)?;
return Ok(());
}
@ -125,7 +129,7 @@ impl FileAdapter for FFmpegAdapter {
let stdo = cmd.stdout.as_mut().expect("is piped");
let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap();
let mut time: String = "".to_owned();
// rewrite subtitle times so they are prefixed in every line
// rewrite subtitle times so they are shown as a prefix in every line
for line in BufReader::new(stdo).lines() {
let line = line?;
// 09:55.195 --> 09:56.730

@ -45,7 +45,9 @@ lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "pandoc".to_owned(),
version: 1,
description: "Uses pandoc to convert binary/unreadable text documents to plain text markdown-like text".to_owned(),
description:
"Uses pandoc to convert binary/unreadable text documents to plain markdown-like text"
.to_owned(),
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))

@ -7,18 +7,16 @@ use lazy_static::lazy_static;
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;
use std::process::Command;
static EXTENSIONS: &[&str] = &["pdf"];
lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "pdfpages".to_owned(),
version: 1,
description: "Converts a pdf to it's individual pages as png files".to_owned(),
description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(),
fast_matchers: EXTENSIONS
.iter()
.map(|s| FastMatcher::FileExtension(s.to_string()))
@ -41,12 +39,13 @@ impl GetMetadata for PdfPagesAdapter {
}
}
/// A pdf is basically converted to a zip that has Page X.png files.
/// This way, something like tesseract can process the pages individually
impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
let AdaptInfo {
filepath_hint,
is_real_file,
inp: _,
oup,
line_prefix,
archive_recursion_depth,
@ -75,7 +74,6 @@ impl FileAdapter for PdfPagesAdapter {
map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
})?;
let args = config.args;
// TODO: how to handle this copying better?
let status = cmd.wait()?;
if status.success() {

@ -34,6 +34,13 @@ impl GetMetadata for PopplerAdapter {
}
}
impl SpawningFileAdapter for PopplerAdapter {
fn get_exe(&self) -> &str {
"pdftotext"
}
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-layout").arg("-").arg("-");
cmd
}
fn postproc(line_prefix: &str, inp: &mut dyn Read, oup: &mut dyn Write) -> Fallible<()> {
// prepend Page X to each line
let mut page = 1;
@ -48,11 +55,4 @@ impl SpawningFileAdapter for PopplerAdapter {
}
Ok(())
}
fn get_exe(&self) -> &str {
"pdftotext"
}
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
cmd.arg("-layout").arg("-").arg("-");
cmd
}
}

@ -9,14 +9,15 @@ use std::process::Stdio;
* Copy a Read to a Write, while prefixing every line with a prefix.
*
* Try to detect binary files and ignore them. Does not ensure any encoding in the output.
*
* This is needed because the rg binary detection does not apply to preprocessed files
*/
pub fn postproc_line_prefix(
line_prefix: &str,
inp: &mut dyn Read,
oup: &mut dyn Write,
) -> Fallible<()> {
//std::io::copy(inp, oup)?;
//return Ok(());
// check for null byte in first 4kB
let mut reader = BufReader::with_capacity(1 << 12, inp);
let fourk = reader.fill_buf()?;
if fourk.contains(&0u8) {
@ -45,6 +46,8 @@ pub trait SpawningFileAdapter: GetMetadata {
}
}
/// replace a Command.spawn() error "File not found" with a more readable error
/// to indicate some program is not installed
pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error {
use std::io::ErrorKind::*;
match err.kind() {
@ -70,6 +73,7 @@ pub fn pipe_output(
let mut stdo = cmd.stdout.take().expect("is piped");
// TODO: how to handle this copying better?
// do we really need threads for this?
crossbeam::scope(|s| -> Fallible<()> {
s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors?
std::io::copy(inp, &mut stdi)?;

@ -65,6 +65,7 @@ impl FileAdapter for SqliteAdapter {
..
} = ai;
if !is_real_file {
// db is in an archive
// todo: read to memory and then use that blob if size < max
writeln!(oup, "{}[rga: skipping sqlite in archive]", line_prefix,)?;
return Ok(());

Loading…
Cancel
Save