From f624a6a29aa9cd70e87f42bd669992a0004e9d4a Mon Sep 17 00:00:00 2001 From: phiresky Date: Mon, 26 Dec 2022 21:51:22 +0100 Subject: [PATCH] partial zip adapter, cleanup --- Cargo.lock | 407 +++++++++++++++++++++------------------ Cargo.toml | 2 +- src/adapters.rs | 4 - src/adapters/pdfpages.rs | 140 -------------- src/adapters/zip.rs | 55 +++++- 5 files changed, 266 insertions(+), 342 deletions(-) delete mode 100644 src/adapters/pdfpages.rs diff --git a/Cargo.lock b/Cargo.lock index cb6def3..0f56b0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,18 +8,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aes" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", - "opaque-debug", -] - [[package]] name = "ahash" version = "0.7.6" @@ -55,6 +43,15 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "ansi_term" version = "0.12.1" @@ -119,6 +116,29 @@ dependencies = [ "syn", ] +[[package]] +name = "async_io_utilities" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b20cffc5590f4bf33f05f97a3ea587feba9c50d20325b401daa096b92ff7da0" +dependencies = [ + "tokio 1.23.0", +] + +[[package]] +name = "async_zip" +version = "0.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a36d43bdefc7215b2b3a97edd03b1553b7969ad76551025eedd3b913c645f6e" +dependencies = [ + "async-compression", + "async_io_utilities", + "chrono", + "crc32fast", + "thiserror", + "tokio 1.23.0", +] + [[package]] name = "atty" version = "0.2.14" @@ -136,12 +156,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" -[[package]] -name = "base64ct" -version = "1.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b645a089122eccb6111b4f81cbc1a49f5900ac4666bb93ac027feaecf15607bf" - [[package]] name = "bincode" version = "1.3.3" @@ -157,15 +171,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "block-buffer" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" -dependencies = [ - "generic-array 0.14.6", -] - [[package]] name = "brotli" version = "3.3.4" @@ -187,6 +192,12 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bumpalo" +version = "3.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" + [[package]] name = "bytecount" version = "0.6.3" @@ -248,12 +259,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] -name = "cipher" -version = "0.3.0" +name = "chrono" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ - "generic-array 0.14.6", + "iana-time-zone", + "js-sys", + "num-integer", + "num-traits", + "time", + "wasm-bindgen", + "winapi", ] [[package]] @@ -295,10 +312,14 @@ dependencies = [ ] [[package]] -name = "constant_time_eq" -version = "0.1.5" +name = "codespan-reporting" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] [[package]] name = "convert_case" @@ -307,13 +328,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" [[package]] -name = "cpufeatures" -version = "0.2.5" +name = "core-foundation-sys" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" -dependencies = [ - "libc", -] +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "crc32fast" @@ -392,21 +410,55 @@ dependencies = [ ] [[package]] -name = "crypto-common" -version = "0.1.6" +name = "ctor" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096" dependencies = [ - "generic-array 0.14.6", - "typenum", + "quote", + "syn", ] [[package]] -name = "ctor" -version = "0.1.26" +name = "cxx" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096" +checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" dependencies = [ + "proc-macro2", "quote", "syn", ] @@ -430,17 +482,6 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" -[[package]] -name = "digest" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - [[package]] name = "directories-next" version = "2.0.0" @@ -663,16 +704,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "generic-array" -version = "0.14.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getopts" version = "0.2.21" @@ -690,7 +721,7 @@ checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] @@ -745,19 +776,34 @@ dependencies = [ ] [[package]] -name = "hmac" -version = "0.12.1" +name = "humantime" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "iana-time-zone" +version = "0.1.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" dependencies = [ - "digest", + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", ] [[package]] -name = "humantime" -version = "2.1.0" +name = "iana-time-zone-haiku" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] [[package]] name = "id-arena" @@ -832,6 +878,15 @@ dependencies = [ "libc", ] +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "json_comments" version = "0.2.1" @@ -861,6 +916,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "link-cplusplus" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +dependencies = [ + "cc", +] + [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -958,7 +1022,7 @@ checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys", ] @@ -1052,12 +1116,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - [[package]] name = "ordered-float" version = "3.4.0" @@ -1105,17 +1163,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - [[package]] name = "paste" version = "1.0.11" @@ -1128,18 +1175,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd" -[[package]] -name = "pbkdf2" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] - [[package]] name = "percent-encoding" version = "2.2.0" @@ -1244,12 +1279,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" - [[package]] name = "redox_syscall" version = "0.2.16" @@ -1303,6 +1332,7 @@ dependencies = [ "anyhow", "async-compression", "async-stream", + "async_zip", "bincode", "bytes 1.3.0", "clap 4.0.32", @@ -1340,7 +1370,6 @@ dependencies = [ "tokio-test", "tokio-util", "tree_magic_mini", - "zip", ] [[package]] @@ -1440,6 +1469,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scratch" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" + [[package]] name = "semver" version = "1.0.16" @@ -1448,18 +1483,18 @@ checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" [[package]] name = "serde" -version = "1.0.151" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fed41fc1a24994d044e6db6935e69511a1153b52c15eb42493b26fa87feba0" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.151" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "255abe9a125a985c05190d687b320c12f9b1f0b99445e608c21ba0782c719ad8" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" dependencies = [ "proc-macro2", "quote", @@ -1488,28 +1523,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -1525,7 +1538,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ed5f6ab2122c6dec69dca18c72fa4590a27e581ad20d44960fe74c032a0b23b" dependencies = [ - "generic-array 0.12.4", + "generic-array", "num", ] @@ -1590,12 +1603,6 @@ dependencies = [ "syn", ] -[[package]] -name = "subtle" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" - [[package]] name = "syn" version = "1.0.107" @@ -1671,29 +1678,13 @@ dependencies = [ [[package]] name = "time" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" -dependencies = [ - "itoa", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - -[[package]] -name = "time-macros" -version = "0.2.6" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" dependencies = [ - "time-core", + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", ] [[package]] @@ -1927,12 +1918,72 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + [[package]] name = "winapi" version = "0.3.9" @@ -2045,26 +2096,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" -[[package]] -name = "zip" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "537ce7411d25e54e8ae21a7ce0b15840e7bfcff15b51d697ec3266cc76bdf080" -dependencies = [ - "aes", - "byteorder", - "bzip2", - "constant_time_eq", - "crc32fast", - "crossbeam-utils", - "flate2", - "hmac", - "pbkdf2", - "sha1", - "time", - "zstd", -] - [[package]] name = "zstd" version = "0.11.2+zstd.1.5.2" diff --git a/Cargo.toml b/Cargo.toml index c1f4ad0..937256c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ version = "0.9.7-alpha.0" anyhow = "1.0.32" async-compression = {version = "0.3.15", features = ["all", "all-algorithms", "tokio"]} async-stream = "0.3.3" +async_zip = "0.0.9" bincode = "1.3.1" bytes = "1.2.1" clap = {version = "4.0.18", features = ["wrap_help"]} @@ -53,7 +54,6 @@ tokio-stream = {version = "0.1.11", features = ["io-util", "tokio-util"]} tokio-tar = { git = "https://github.com/vorot93/tokio-tar", version = "0.3.0" } tokio-util = {version = "0.7.4", features = ["io", "full"]} tree_magic = {package = "tree_magic_mini", version = "3.0.0"} -zip = "0.6.3" [dev-dependencies] ctor = "0.1.20" diff --git a/src/adapters.rs b/src/adapters.rs index 09627d9..581c674 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -2,11 +2,9 @@ pub mod custom; pub mod decompress; // pub mod ffmpeg; pub mod postproc; -// pub mod pdfpages; use std::sync::Arc; // pub mod sqlite; pub mod tar; -// pub mod tesseract; // pub mod writing; // pub mod zip; use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; @@ -119,8 +117,6 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad Arc::new(decompress::DecompressAdapter::new()), Arc::new(tar::TarAdapter::new()), //Rc::new(sqlite::SqliteAdapter::new()), - // Rc::new(pdfpages::PdfPagesAdapter::new()), - // Rc::new(tesseract::TesseractAdapter::new()), ]; adapters.extend( BUILTIN_SPAWNING_ADAPTERS diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs deleted file mode 100644 index 83c40d5..0000000 --- a/src/adapters/pdfpages.rs +++ /dev/null @@ -1,140 +0,0 @@ -use super::*; -use crate::adapters::spawning::map_exe_error; -use crate::preproc::rga_preproc; -use lazy_static::lazy_static; -use log::*; -use std::fs::File; -use std::io::BufReader; -use std::path::PathBuf; -use std::process::Command; - -static EXTENSIONS: &[&str] = &["pdf"]; - -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "pdfpages".to_owned(), - version: 1, - description: "Converts a pdf to its individual pages as png files. Only useful in combination with tesseract".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType( - "application/pdf".to_owned() - )]), - keep_fast_matchers_if_accurate: true, - disabled_by_default: true - }; -} -#[derive(Default)] -pub struct PdfPagesAdapter {} - -impl PdfPagesAdapter { - pub fn new() -> PdfPagesAdapter { - PdfPagesAdapter {} - } -} - -impl GetMetadata for PdfPagesAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA - } -} - -/// A pdf is basically converted to a zip that has Page X.png files. -/// This way, something like tesseract can process the pages individually -impl FileAdapter for PdfPagesAdapter { - fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<()> { - let AdaptInfo { - filepath_hint, - is_real_file, - oup, - line_prefix, - archive_recursion_depth, - config, - .. - } = ai; - if !is_real_file { - // todo: read to memory and then use that blob if size < max - writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?; - return Ok(()); - } - let inp_fname = filepath_hint; - let exe_name = "gm"; - let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?; - let out_fname = out_dir.path().join("out%04d.png"); - debug!("writing to temp dir: {}", out_fname.display()); - let mut cmd = Command::new(exe_name); - cmd.arg("convert") - .arg("-density") - .arg("200") - .arg(inp_fname) - .arg("+adjoin") - .arg(out_fname); - - let mut cmd = cmd - .spawn() - .map_err(|e| map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed."))?; - let args = config.args; - - let status = cmd.wait()?; - if status.success() { - } else { - return Err(format_err!("subprocess failed: {:?}", status)); - } - for (i, filename) in glob::glob( - out_dir - .path() - .join("out*.png") - .to_str() - .expect("temp path has invalid encoding"), - )? - .enumerate() - { - let mut ele = BufReader::new(File::open(filename?)?); - rga_preproc(AdaptInfo { - filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)), - is_real_file: false, - inp: &mut ele, - oup, - line_prefix: &format!("{}Page {}:", line_prefix, i + 1), - archive_recursion_depth: archive_recursion_depth + 1, - config: PreprocConfig { cache: None, args }, - })?; - } - Ok(()) - } -} - -/*// todo: do this in an actually streaming fashion and less slow -// IEND chunk + PDF magic -// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a -let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a"); -let split_seq_inx = 8; -fn split_by_seq<'a>( - split_seq: &'a [u8], - split_inx: usize, - read: &mut Read, -) -> Result + 'a> { - let regex = split_seq - .iter() - .map(|c| format!("\\x{:0>2x}", c)) - .collect::>() - .join(""); - let restr = format!("(?-u){}", regex); - eprintln!("re: {}", restr); - let re = regex::bytes::Regex::new(&restr)?; - - let mut all = Vec::new(); - read.read_to_end(&mut all)?; - let mut out: Vec>> = Vec::new(); - let mut last = 0; - for (i, split) in re.find_iter(&all).enumerate() { - let pos = split.start() + split_inx; - out.push(Cursor::new(Vec::from(&all[last..pos]))); - last = pos; - } - out.push(Cursor::new(Vec::from(&all[last..]))); - Ok(out) -}*/ diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index a5bf4d7..c5d8d50 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -1,6 +1,8 @@ use super::*; use crate::{adapted_iter::AdaptedFilesIter, print_bytes}; use anyhow::*; +use async_stream::stream; +use async_zip::read::stream::ZipFileReader; use lazy_static::lazy_static; use log::*; @@ -36,17 +38,52 @@ impl GetMetadata for ZipAdapter { } impl FileAdapter for ZipAdapter { - fn adapt<'a>( - &self, - inp: AdaptInfo<'a>, - _detection_reason: &FileMatcher, - ) -> Result> { - Ok(Box::new(ZipAdaptIter { inp })) + fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result { + let AdaptInfo { + inp, + filepath_hint, + archive_recursion_depth, + postprocess, + line_prefix, + config, + .. + } = ai; + let mut zip = ZipFileReader::new(inp); + + let s = stream! { + while !zip.finished() { + if let Some(mut reader) = zip.entry_reader().await? { + let file = reader.entry(); + /* if file.is_dir() { + continue; + }*/ + debug!( + "{}{}|{}: {} ({} packed)", + line_prefix, + filepath_hint.display(), + file.filename(), + print_bytes(file.uncompressed_size() as f64), + print_bytes(file.compressed_size() as f64) + ); + let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); + yield Ok(AdaptInfo { + filepath_hint: PathBuf::from(file.filename()), + is_real_file: false, + inp: Box::pin(reader), + line_prefix: new_line_prefix, + archive_recursion_depth: archive_recursion_depth + 1, + postprocess, + config: config.clone(), + }); + } + } + }; + Ok(Box::pin(s)) } } -struct ZipAdaptIter<'a> { - inp: AdaptInfo<'a>, +/*struct ZipAdaptIter { + inp: AdaptInfo, } impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> { fn next<'b>(&'b mut self) -> Option> { @@ -80,7 +117,7 @@ impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> { }) }) } -} +}*/ #[cfg(test)] mod test {