From 1a0bbc798e816c0886128499c28e9356fb090efe Mon Sep 17 00:00:00 2001 From: phiresky Date: Thu, 6 Jun 2019 11:00:13 +0200 Subject: [PATCH] add zip support! --- Cargo.lock | 145 ++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + exampledir/test.zip | Bin 162990 -> 326108 bytes exampledir/wasteland.epub | Bin 102081 -> 102081 bytes src/adapters.rs | 24 ++++-- src/adapters/ffmpeg.rs | 16 ++-- src/adapters/pandoc.rs | 7 +- src/adapters/poppler.rs | 9 ++- src/adapters/spawning.rs | 106 +++++++++++++++++++++++--- src/adapters/zip.rs | 72 ++++++++++++++++++ src/bin/rga-preproc.rs | 122 ++++++------------------------ src/lib.rs | 1 + src/preproc.rs | 151 ++++++++++++++++++++++++++++++++++++++ 13 files changed, 522 insertions(+), 133 deletions(-) create mode 100644 src/adapters/zip.rs create mode 100644 src/preproc.rs diff --git a/Cargo.lock b/Cargo.lock index 84a35f2..6242071 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,10 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "adler32" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "aho-corasick" version = "0.7.3" @@ -67,6 +72,24 @@ name = "byteorder" version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "bzip2" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "cachedir" version = "0.1.1" @@ -93,6 +116,36 @@ dependencies = [ "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crc32fast" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-deque 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-channel" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "smallvec 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "crossbeam-deque" version = "0.2.0" @@ -102,6 +155,15 @@ dependencies = [ "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-deque" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "crossbeam-epoch" version = "0.3.1" @@ -116,6 +178,27 @@ dependencies = [ "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-epoch" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-queue" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "crossbeam-utils" version = "0.2.2" @@ -124,6 +207,15 @@ dependencies = [ "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-utils" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "either" version = "1.5.2" @@ -194,6 +286,16 @@ name = "libc" version = "0.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "libflate" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "lmdb-rkv" version = "0.11.4" @@ -326,6 +428,11 @@ name = "pkg-config" version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "podio" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "proc-macro2" version = "0.4.30" @@ -498,6 +605,7 @@ version = "0.1.0" dependencies = [ "bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", "failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "path-clean 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -506,6 +614,7 @@ dependencies = [ "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", "tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "zstd 0.4.24+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -629,6 +738,16 @@ dependencies = [ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "time" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "tree_magic_fork" version = "0.2.2" @@ -706,6 +825,18 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "zip" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libflate 0.1.23 (registry+https://github.com/rust-lang/crates.io-index)", + "podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "zstd" version = "0.4.24+zstd.1.4.0" @@ -734,6 +865,7 @@ dependencies = [ ] [metadata] +"checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c" "checksum aho-corasick 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e6f484ae0c99fec2e858eb6134949117399f222608d84cadb3f58c1f97c2364c" "checksum arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0d382e583f07208808f6b1249e60848879ba3543f57c32277bf52d69c2f0f0ee" "checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" @@ -743,13 +875,22 @@ dependencies = [ "checksum bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "9f04a5e50dc80b3d5d35320889053637d15011aed5e66b66b37ae798c65da6f7" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb" +"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b" +"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f" "checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8" "checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +"checksum crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b14492071ca110999a20bf90e3833406d5d66bfd93b4e52ec9539025ff43fe0d" +"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b" "checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" +"checksum crossbeam-deque 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b18cd2e169ad86297e6bc0ad9aa679aee9daa4f19e8163860faf7c164e4f5a71" "checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" +"checksum crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "04c9e3102cc2d69cd681412141b390abd55a362afc1540965dad0ad4d34280b4" +"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" "checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" +"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" "checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" "checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2" "checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1" @@ -761,6 +902,7 @@ dependencies = [ "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" "checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" "checksum libc 0.2.57 (registry+https://github.com/rust-lang/crates.io-index)" = "a844cabbd5a77e60403a58af576f0a1baa83c3dd2670be63e615bd24fc58b82d" +"checksum libflate 0.1.23 (registry+https://github.com/rust-lang/crates.io-index)" = "76912aa0196b6f0e06d9c43ee877be45369157c06172ade12fe20ac3ee5ffa15" "checksum lmdb-rkv 0.11.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e25b4069789bf7ac069d6fd58229f18aec20c6f7cc9173cb731d11c10dbb6b6e" "checksum lmdb-rkv-sys 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1470e0168f1832e35afd6d0931ae60db625685332837b97aa156773ec9c5e393" "checksum lock_api 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ed946d4529956a20f2d63ebe1b69996d5a2137c91913fe3ebbeff957f5bca7ff" @@ -779,6 +921,7 @@ dependencies = [ "checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831" "checksum petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3659d1ee90221741f65dd128d9998311b0e40c5d3c23a62445938214abce4f" "checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" +"checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd" "checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" "checksum quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "faf4799c5d274f3868a4aae320a0a182cbd2baee377b378f080e16a23e9d80db" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" @@ -812,6 +955,7 @@ dependencies = [ "checksum syn 0.15.34 (registry+https://github.com/rust-lang/crates.io-index)" = "a1393e4a97a19c01e900df2aec855a29f71cf02c402e2f443b8d2747c25c5dbe" "checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" "checksum tree_magic_fork 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aab921ca9b828f83389f3f3c5e77404612547081e5222eb3a23d06184f6813af" "checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" "checksum unicode-bidi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" @@ -823,6 +967,7 @@ dependencies = [ "checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +"checksum zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c18fc320faf909036e46ac785ea827f72e485304877faf1a3a39538d3714dbc3" "checksum zstd 0.4.24+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2c5a6414958b49ee80f2dd0042023ac8f37cfe1d31fbeec0b9749cf6f2c03683" "checksum zstd-safe 1.4.9+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d98332212af687878b146a6549c188e9b72971972d23089c831472f938e6272" "checksum zstd-sys 1.4.10+zstd.1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "46f433134fbd0c37c9eb5929733df5f34bcdff464722eb93155fcee93eb57652" diff --git a/Cargo.toml b/Cargo.toml index be9a417..910699e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,5 @@ zstd = "0.4.24" lazy_static = "1.3.0" serde_json = "1.0.39" failure = "0.1.5" +zip = "0.5.2" +crossbeam = "0.7.1" diff --git a/exampledir/test.zip b/exampledir/test.zip index 15eba87b6901ce847075c99a3ddaebaaaf44f41e..f80f1fbe74e5acc4c1481818d6b88b2d80d5aa05 100644 GIT binary patch delta 171 zcmZ4Ylk?7J;SGKll(?7~fME6Nqkdk?=iFXb!^8l>96+&>)Z!Aos?36h7RD`1>uQ?4 zE^PO@!1!f#ATBK`Ad`SVpGBP~MR=f(L^UMES hnQmIcq{}Sl%RD``hRK?bnSq%>l#zj9;|~^~J^9E diff --git a/exampledir/wasteland.epub b/exampledir/wasteland.epub index ff9e1046ebbbdef9dc28f8b4d8f0b14f993f95de..3eae7e44bdf4ff6f7c056f6a3c9ccd85f01e054b 100644 GIT binary patch delta 78 zcmX>&m+jzOwuUW?6S_HF`2E|x85kJ+r_bzW43ty5==oD~>dsD8)s?Orf4N;eDcQF1 bm)I{iwqI>wY%$w8dKlB0kn~jaF?s?3y_OwP delta 78 zcmX>&m+jzOwuUW?6S_H1PZYZ3&A`AAJbh+2W1yVd#Z1?qm8v^CRaIBIZgj}=O!M^9 a@{pdeI6oux0PA*+9>#PgBs~>(&'a self) -> &'a AdapterMeta; } pub trait FileAdapter: GetMetadata { - fn adapt(&self, inp_fname: &Path, oup: &mut dyn Write) -> Fallible<()>; + fn adapt(&self, a: AdaptInfo) -> Fallible<()>; +} +pub struct AdaptInfo<'a> { + pub filepath_hint: &'a Path, + pub inp: &'a mut dyn Read, + pub oup: &'a mut (dyn Write + Send), + pub line_prefix: &'a str, + // pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>, } pub fn extension_to_regex(extension: &str) -> Regex { @@ -42,9 +49,10 @@ pub fn extension_to_regex(extension: &str) -> Regex { pub fn get_adapters() -> Vec> { let adapters: Vec> = vec![ - Rc::new(crate::adapters::ffmpeg::FFmpegAdapter::new()), - Rc::new(crate::adapters::pandoc::PandocAdapter::new()), - Rc::new(crate::adapters::poppler::PopplerAdapter::new()), + Rc::new(ffmpeg::FFmpegAdapter::new()), + Rc::new(pandoc::PandocAdapter::new()), + Rc::new(poppler::PopplerAdapter::new()), + Rc::new(zip::ZipAdapter::new()), ]; adapters } diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 6f8c306..bdb450a 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -1,10 +1,10 @@ -use super::*; use super::spawning::map_exe_error; +use super::*; +use failure::*; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; use std::io::BufReader; use std::process::*; -use failure::*; // todo: // maybe todo: read list of extensions from //ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null @@ -43,8 +43,14 @@ struct FFprobeStream { codec_type: String, // video,audio,subtitle } impl FileAdapter for FFmpegAdapter { - fn adapt(&self, inp_fname: &Path, oup: &mut dyn Write) -> Fallible<()> { - let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed."); + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + let AdaptInfo { + filepath_hint, + inp, + oup, + .. + } = ai; + /*let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed."); let has_subtitles = { let probe = Command::new("ffprobe") .args(vec![ @@ -122,7 +128,7 @@ impl FileAdapter for FFmpegAdapter { } } } - } + }*/ Ok(()) } } diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs index 1d373ff..3b4ec0b 100644 --- a/src/adapters/pandoc.rs +++ b/src/adapters/pandoc.rs @@ -67,14 +67,13 @@ impl SpawningFileAdapter for PandocAdapter { fn get_exe(&self) -> &str { "pandoc" } - fn command(&self, inp_fname: &Path, mut cmd: Command) -> Command { + fn command(&self, filepath_hint: &Path, mut cmd: Command) -> Command { cmd + .arg("--from").arg(filepath_hint.extension().unwrap()) // simpler markown (with more information loss but plainer text) .arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") .arg("--wrap=none") - .arg("--atx-headers") - .arg("--") - .arg(inp_fname); + .arg("--atx-headers"); cmd } } diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index 301ffd1..4fd3066 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -9,7 +9,10 @@ lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { name: "poppler".to_owned(), version: 1, - matchers: EXTENSIONS.iter().map(|s| Matcher::FileExtension(s.to_string())).collect(), + matchers: EXTENSIONS + .iter() + .map(|s| Matcher::FileExtension(s.to_string())) + .collect(), }; } pub struct PopplerAdapter; @@ -29,8 +32,8 @@ impl SpawningFileAdapter for PopplerAdapter { fn get_exe(&self) -> &str { "pdftotext" } - fn command(&self, inp_fname: &Path, mut cmd: Command) -> Command { - cmd.arg("-layout").arg("--").arg(inp_fname).arg("-"); + fn command(&self, filepath_hint: &Path, mut cmd: Command) -> Command { + cmd.arg("-layout").arg("-").arg("-"); cmd } } diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs index bf69747..60c72e1 100644 --- a/src/adapters/spawning.rs +++ b/src/adapters/spawning.rs @@ -1,26 +1,93 @@ use super::*; -use std::io::Write; +use failure::*; +use std::io::prelude::*; +use std::io::BufReader; use std::process::Command; use std::process::Stdio; -use failure::*; +use std::thread; pub trait SpawningFileAdapter: GetMetadata { fn get_exe(&self) -> &str; - fn command(&self, inp_fname: &Path, command: Command) -> Command; + fn command(&self, filepath_hint: &Path, command: Command) -> Command; + + fn postproc(line_prefix: &str, inp: &mut Read, oup: &mut Write) -> Fallible<()> { + //std::io::copy(inp, oup)?; + + for line in BufReader::new(inp).lines() { + oup.write_all(format!("{}{}\n", line_prefix, line?).as_bytes())?; + } + Ok(()) + } } pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> Error { use std::io::ErrorKind::*; match err.kind() { NotFound => format_err!("Could not find executable \"{}\". {}", exe_name, help), - _ => Error::from(err) + _ => Error::from(err), } } -pub fn pipe_output(mut cmd: Command, oup: &mut dyn Write, exe_name: &str, help: &str) -> Fallible<()> { - let mut cmd = cmd.stdout(Stdio::piped()).spawn().map_err(|e| map_exe_error(e, exe_name, help))?; - let stdo = cmd.stdout.as_mut().expect("is piped"); - std::io::copy(stdo, oup)?; +/*fn pipe(a: &mut dyn Read, b: &mut dyn Write, c: &mut dyn Read, d: &mut dyn Write) { + let mut buf = vec![0u8; 2 << 13]; + loop { + match a.read(&buf) { + + } + } +}*/ + +/*pub fn copy( + name: &str, + reader: &mut R, + writer: &mut W, +) -> std::io::Result +where + R: Read, + W: Write, +{ + eprintln!("START COPY {}", name); + let mut zz = vec![0; 1 << 13]; + let mut buf: &mut [u8] = zz.as_mut(); + let mut written = 0; + loop { + let r = reader.read(buf); + eprintln!("{}read: {:?}", name, r); + let len = match r { + Ok(0) => return Ok(written), + Ok(len) => len, + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + }; + writer.write_all(&buf[..len])?; + written += len as u64; + } +}*/ + +pub fn pipe_output( + line_prefix: &str, + mut cmd: Command, + inp: &mut (dyn Read), + oup: &mut (dyn Write + Send), + exe_name: &str, + help: &str, + cp: fn(line_prefix: &str, &mut dyn Read, &mut dyn Write) -> Fallible<()>, +) -> Fallible<()> { + let mut cmd = cmd + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .map_err(|e| map_exe_error(e, exe_name, help))?; + let mut stdi = cmd.stdin.take().expect("is piped"); + let mut stdo = cmd.stdout.take().expect("is piped"); + + crossbeam::scope(|s| -> Fallible<()> { + s.spawn(|_| cp(line_prefix, &mut stdo, oup).unwrap()); // errors? + std::io::copy(inp, &mut stdi)?; + drop(stdi); // NEEDED! otherwise deadlock + Ok(()) + }) + .unwrap()?; let status = cmd.wait()?; if status.success() { Ok(()) @@ -30,11 +97,26 @@ pub fn pipe_output(mut cmd: Command, oup: &mut dyn Write, exe_name: &str, help: } impl FileAdapter for T - where - T: SpawningFileAdapter, +where + T: SpawningFileAdapter, { - fn adapt(&self, inp_fname: &Path, oup: &mut dyn Write) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + let AdaptInfo { + filepath_hint, + inp, + oup, + line_prefix, + .. + } = ai; let cmd = Command::new(self.get_exe()); - pipe_output(self.command(inp_fname, cmd), oup, self.get_exe(), "") + pipe_output( + line_prefix, + self.command(filepath_hint, cmd), + inp, + oup, + self.get_exe(), + "", + Self::postproc, + ) } } diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs new file mode 100644 index 0000000..b761687 --- /dev/null +++ b/src/adapters/zip.rs @@ -0,0 +1,72 @@ +use super::*; +use crate::preproc::rga_preproc; +use failure::*; +use lazy_static::lazy_static; +use std::fs::File; +// todo: +// maybe todo: read list of extensions from +//ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null +static EXTENSIONS: &[&str] = &["zip"]; + +lazy_static! { + static ref METADATA: AdapterMeta = AdapterMeta { + name: "zip".to_owned(), + version: 1, + matchers: EXTENSIONS + .iter() + .map(|s| Matcher::FileExtension(s.to_string())) + .collect(), + }; +} + +pub struct ZipAdapter; + +impl ZipAdapter { + pub fn new() -> ZipAdapter { + ZipAdapter + } +} +impl GetMetadata for ZipAdapter { + fn metadata<'a>(&'a self) -> &'a AdapterMeta { + &METADATA + } +} + +impl FileAdapter for ZipAdapter { + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { + use std::io::prelude::*; + let AdaptInfo { + filepath_hint, + mut inp, + oup, + line_prefix, + .. + } = ai; + loop { + match ::zip::read::read_zipfile_from_stream(&mut inp) { + Ok(None) => break, + Ok(Some(mut file)) => { + eprintln!( + "{}|{}: {} bytes ({} bytes packed)", + filepath_hint.to_string_lossy(), + file.name(), + file.size(), + file.compressed_size() + ); + let line_prefix = &format!("{}{}:/", line_prefix, file.name().clone()); + rga_preproc( + AdaptInfo { + filepath_hint: &file.sanitized_name(), + inp: &mut file, + oup: oup, + line_prefix, + }, + None, + )?; + } + Err(e) => return Err(e.into()), + } + } + Ok(()) + } +} diff --git a/src/bin/rga-preproc.rs b/src/bin/rga-preproc.rs index d8d2604..87c012a 100644 --- a/src/bin/rga-preproc.rs +++ b/src/bin/rga-preproc.rs @@ -1,110 +1,30 @@ +use failure::{format_err, Error}; use path_clean::PathClean; use rga::adapters::*; +use rga::preproc::*; use rga::CachingWriter; -use failure::{Error, format_err}; - -// longest compressed conversion output to save in cache -const MAX_DB_BLOB_LEN: usize = 2000000; -const ZSTD_LEVEL: i32 = 12; - -fn open_db() -> Result>, Error> { - let app_cache = cachedir::CacheDirConfig::new("rga").get_cache_dir()?; - - let db_arc = rkv::Manager::singleton() - .write() - .expect("could not write db manager") - .get_or_create(app_cache.as_path(), |p| { - let mut builder = rkv::Rkv::environment_builder(); - builder - .set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable - .set_map_size(2 * 1024 * 1024 * 1024) - .set_max_dbs(100); - rkv::Rkv::from_env(p, builder) - }) - .expect("could not get/create db"); - Ok(db_arc) -} +use std::fs::File; +use std::path::PathBuf; +use std::rc::Rc; fn main() -> Result<(), Error> { - //db. - let adapters = adapter_matcher()?; - let filepath = std::env::args_os() - .skip(1) - .next() - .ok_or(format_err!("No filename specified"))?; - eprintln!("inp fname: {:?}", filepath); - let path = std::env::current_dir()?.join(&filepath); + let path = { + let filepath = std::env::args_os() + .skip(1) + .next() + .ok_or(format_err!("No filename specified"))?; + eprintln!("inp fname: {:?}", filepath); + std::env::current_dir()?.join(&filepath) + }; + eprintln!("abs path: {:?}", path); - eprintln!("clean path: {:?}", path.clean()); - let serialized_path: Vec = - bincode::serialize(&path.clean()).expect("could not serialize path"); // key in the cache database - let filename = path.file_name().ok_or(format_err!("Empty filename"))?; - /*let mimetype = tree_magic::from_filepath(path).ok_or(lerr(format!( - "File {} does not exist", - filename.to_string_lossy() - )))?; - println!("mimetype: {:?}", mimetype);*/ - let adapter = adapters(FileMeta { - // mimetype, - lossy_filename: filename.to_string_lossy().to_string(), - }); - match adapter { - Some(ad) => { - let meta = ad.metadata(); - eprintln!("adapter: {}", &meta.name); - let db_name = format!("{}.v{}", meta.name, meta.version); - let db_arc = open_db()?; - let db_env = db_arc.read().unwrap(); - let db = db_env - .open_single(db_name.as_str(), rkv::store::Options::create()) - .map_err(|p| format_err!("could not open db store: {:?}", p))?; - let reader = db_env.read().expect("could not get reader"); - match db - .get(&reader, &serialized_path) - .map_err(|p| format_err!("could not read from db: {:?}", p))? - { - Some(rkv::Value::Blob(cached)) => { - let stdouti = std::io::stdout(); - zstd::stream::copy_decode(cached, stdouti.lock())?; - Ok(()) - } - Some(_) => Err(format_err!("Integrity: value not blob")), - None => { - let stdouti = std::io::stdout(); - let mut compbuf = - CachingWriter::new(stdouti.lock(), MAX_DB_BLOB_LEN, ZSTD_LEVEL)?; - ad.adapt(&path, &mut compbuf)?; - let compressed = compbuf.finish()?; - if let Some(cached) = compressed { - eprintln!("compressed len: {}", cached.len()); + let ai = AdaptInfo { + inp: &mut File::open(&path)?, + filepath_hint: &path, + oup: &mut std::io::stdout(), + line_prefix: "", + }; - { - let mut writer = db_env.write().map_err(|p| { - format_err!("could not open write handle to cache: {:?}", p) - })?; - db.put(&mut writer, &serialized_path, &rkv::Value::Blob(&cached)) - .map_err(|p| format_err!("could not write to cache: {:?}", p))?; - writer.commit().unwrap(); - } - } - Ok(()) - } - } - } - None => { - let allow_cat = false; - if allow_cat { - eprintln!("no adapter for that file, running cat!"); - let stdini = std::io::stdin(); - let mut stdin = stdini.lock(); - let stdouti = std::io::stdout(); - let mut stdout = stdouti.lock(); - std::io::copy(&mut stdin, &mut stdout)?; - Ok(()) - } else { - Err(format_err!("No adapter found for file {:?}", filename)) - } - } - } + rga_preproc(ai, Some(open_cache_db()?)) } diff --git a/src/lib.rs b/src/lib.rs index c1855a6..60a51c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ pub mod adapters; mod caching_writer; pub mod errors; +pub mod preproc; pub use caching_writer::CachingWriter; diff --git a/src/preproc.rs b/src/preproc.rs new file mode 100644 index 0000000..35f5c5d --- /dev/null +++ b/src/preproc.rs @@ -0,0 +1,151 @@ +use crate::adapters::*; +use crate::CachingWriter; +use failure::{format_err, Error}; +use path_clean::PathClean; +use std::io::Read; +use std::path::Path; +use std::path::PathBuf; +use std::rc::Rc; + +// longest compressed conversion output to save in cache +const MAX_DB_BLOB_LEN: usize = 2000000; +const ZSTD_LEVEL: i32 = 12; + +pub fn open_cache_db() -> Result>, Error> { + let app_cache = cachedir::CacheDirConfig::new("rga").get_cache_dir()?; + + let db_arc = rkv::Manager::singleton() + .write() + .expect("could not write db manager") + .get_or_create(app_cache.as_path(), |p| { + let mut builder = rkv::Rkv::environment_builder(); + builder + .set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable + .set_map_size(2 * 1024 * 1024 * 1024) + .set_max_dbs(100); + rkv::Rkv::from_env(p, builder) + }) + .expect("could not get/create db"); + Ok(db_arc) +} + +pub fn rga_preproc( + ai: AdaptInfo, + mb_db_arc: Option>>, +) -> Result<(), Error> { + let adapters = adapter_matcher()?; + let AdaptInfo { + filepath_hint, + inp, + oup, + line_prefix, + .. + } = ai; + let filename = filepath_hint + .file_name() + .ok_or(format_err!("Empty filename"))?; + + eprintln!("abs path: {:?}", filepath_hint); + + /*let mimetype = tree_magic::from_filepath(path).ok_or(lerr(format!( + "File {} does not exist", + filename.to_string_lossy() + )))?; + println!("mimetype: {:?}", mimetype);*/ + let adapter = adapters(FileMeta { + // mimetype, + lossy_filename: filename.to_string_lossy().to_string(), + }); + match adapter { + Some(ad) => { + let meta = ad.metadata(); + eprintln!("adapter: {}", &meta.name); + let db_name = format!("{}.v{}", meta.name, meta.version); + if let Some(db_arc) = mb_db_arc { + let cache_key: Vec = { + let clean_path = filepath_hint.to_owned().clean(); + eprintln!("clean path: {:?}", clean_path); + let meta = std::fs::metadata(&filepath_hint)?; + + let key = ( + clean_path, + meta.modified().expect("weird OS that can't into mtime"), + ); + eprintln!("cache key: {:?}", key); + + bincode::serialize(&key).expect("could not serialize path") // key in the cache database + }; + let db_env = db_arc.read().unwrap(); + let db = db_env + .open_single(db_name.as_str(), rkv::store::Options::create()) + .map_err(|p| format_err!("could not open db store: {:?}", p))?; + let reader = db_env.read().expect("could not get reader"); + let cached = db + .get(&reader, &cache_key) + .map_err(|p| format_err!("could not read from db: {:?}", p))?; + match cached { + Some(rkv::Value::Blob(cached)) => { + let stdouti = std::io::stdout(); + zstd::stream::copy_decode(cached, stdouti.lock())?; + Ok(()) + } + Some(_) => Err(format_err!("Integrity: value not blob")), + None => { + let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?; + // start dupe + eprintln!("adapting..."); + ad.adapt(AdaptInfo { + line_prefix, + filepath_hint, + inp, + oup: &mut compbuf, + })?; + // end dupe + let compressed = compbuf.finish()?; + if let Some(cached) = compressed { + eprintln!("compressed len: {}", cached.len()); + + { + let mut writer = db_env.write().map_err(|p| { + format_err!("could not open write handle to cache: {:?}", p) + })?; + db.put(&mut writer, &cache_key, &rkv::Value::Blob(&cached)) + .map_err(|p| { + format_err!("could not write to cache: {:?}", p) + })?; + writer.commit().unwrap(); + } + } + Ok(()) + } + } + } else { + // todo: duplicate code + // start dupe + eprintln!("adapting..."); + ad.adapt(AdaptInfo { + line_prefix, + filepath_hint, + inp, + oup, + })?; + // end dupe + Ok(()) + } + } + None => { + let allow_cat = false; + if allow_cat { + eprintln!("no adapter for that file, running cat!"); + let stdini = std::io::stdin(); + let mut stdin = stdini.lock(); + let stdouti = std::io::stdout(); + let mut stdout = stdouti.lock(); + std::io::copy(&mut stdin, &mut stdout)?; + Ok(()) + } else { + Err(format_err!("No adapter found for file {:?}", filename)) + } + } + } +}