From dedbc6d23fc448553214796b6471d9985deec0a9 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 03:01:32 -0400 Subject: [PATCH 01/22] Use dynamic block sizes --- read.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/read.c b/read.c index 4d89f48..bb2647c 100644 --- a/read.c +++ b/read.c @@ -25,6 +25,7 @@ static void wanted_free(wanted_t *w); typedef struct { uint8_t *input, *output; + size_t incap, outcap; size_t insize, outsize; off_t uoffset; // uncompressed offset } io_block_t; @@ -52,9 +53,8 @@ static void tar_write_last(void); #pragma mark DECLARE UTILS static lzma_vli gFileIndexOffset = 0; -static size_t gBlockInSize = 0, gBlockOutSize = 0; -static void set_block_sizes(void); +static void check_capacity(io_block_t *ib, size_t incap, size_t outcap); #pragma mark MAIN @@ -64,7 +64,6 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { if (verify) gFileIndexOffset = read_file_index(0); wanted_files(nspecs, specs); - set_block_sizes(); #if DEBUG for (wanted_t *w = gWantedFiles; w; w = w->next) @@ -135,8 +134,8 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { static void *block_create(void) { io_block_t *ib = malloc(sizeof(io_block_t)); - ib->input = malloc(gBlockInSize); - ib->output = malloc(gBlockOutSize); + ib->incap = ib->outcap = 0; + ib->input = ib->output = NULL; return ib; } @@ -150,25 +149,6 @@ static void block_free(void* data) { #pragma mark SETUP -static void set_block_sizes() { - lzma_index_iter iter; - lzma_index_iter_init(&iter, gIndex); - while (!lzma_index_iter_next(&iter, LZMA_INDEX_ITER_BLOCK)) { - // exclude the file index block - lzma_vli off = iter.block.compressed_file_offset; - if (gFileIndexOffset && off == gFileIndexOffset) - continue; - - size_t in = iter.block.total_size, - out = iter.block.uncompressed_size; - if (out > gBlockOutSize) - gBlockOutSize = out; - if (in > gBlockInSize) - gBlockInSize = in; - } -} - - static void wanted_free(wanted_t *w) { for (wanted_t *w = gWantedFiles; w; ) { wanted_t *tmp = w->next; @@ -244,6 +224,17 @@ static void wanted_files(size_t count, char **specs) { #pragma mark THREADS +static void check_capacity(io_block_t *ib, size_t incap, size_t outcap) { + if (incap > ib->incap) { + ib->incap = incap; + ib->input = malloc(incap); + } + if (outcap > ib->outcap) { + ib->outcap = outcap; + ib->output = malloc(outcap); + } +} + static void read_thread(void) { off_t offset = ftello(gInFile); wanted_t *w = gWantedFiles; @@ -273,6 +264,8 @@ static void read_thread(void) { pipeline_item_t *pi; queue_pop(gPipelineStartQ, (void**)&pi); io_block_t *ib = (io_block_t*)(pi->data); + check_capacity(ib, iter.block.unpadded_size, + iter.block.uncompressed_size); // Seek if needed, and get the data if (offset != boffset) { @@ -310,7 +303,7 @@ static void decode_thread(size_t thnum) { stream.avail_in = ib->insize - block.header_size; stream.next_in = ib->input + block.header_size; - stream.avail_out = gBlockOutSize; + stream.avail_out = ib->outcap; stream.next_out = ib->output; lzma_ret err = LZMA_OK; From cad2ee95ebaf31a25bbe86ef9b01645c60202f1f Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 06:13:23 -0400 Subject: [PATCH 02/22] We never use the argument to read_file_index --- common.c | 5 ++--- list.c | 2 +- pixz.h | 2 +- read.c | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/common.c b/common.c index b637d2b..ad291d2 100644 --- a/common.c +++ b/common.c @@ -146,10 +146,9 @@ lzma_vli find_file_index(void **bdatap) { return ret; } -lzma_vli read_file_index(lzma_vli offset) { +lzma_vli read_file_index() { void *bdata = NULL; - if (!offset) - offset = find_file_index(&bdata); + lzma_vli offset = find_file_index(&bdata); if (!offset) return 0; diff --git a/list.c b/list.c index e601ce2..9798d29 100644 --- a/list.c +++ b/list.c @@ -7,7 +7,7 @@ void pixz_list(bool tar) { lzma_index_iter iter; lzma_index_iter_init(&iter, gIndex); - if (tar && read_file_index(0)) { + if (tar && read_file_index()) { dump_file_index(stdout, false); free_file_index(); } else { diff --git a/pixz.h b/pixz.h index daacd97..d1e9239 100644 --- a/pixz.h +++ b/pixz.h @@ -71,7 +71,7 @@ bool is_multi_header(const char *name); void decode_index(void); lzma_vli find_file_index(void **bdatap); -lzma_vli read_file_index(lzma_vli offset); +lzma_vli read_file_index(void); void dump_file_index(FILE *out, bool verbose); void free_file_index(void); diff --git a/read.c b/read.c index bb2647c..134a50d 100644 --- a/read.c +++ b/read.c @@ -62,7 +62,7 @@ static void check_capacity(io_block_t *ib, size_t incap, size_t outcap); void pixz_read(bool verify, size_t nspecs, char **specs) { decode_index(); if (verify) - gFileIndexOffset = read_file_index(0); + gFileIndexOffset = read_file_index(); wanted_files(nspecs, specs); #if DEBUG From b13ae91698316ba1e3646b85ebe13cb8004b2df9 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 07:33:33 -0400 Subject: [PATCH 03/22] It's ok to decompress a text file to a TTY --- pixz.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pixz.c b/pixz.c index 22bf3ff..7b318dc 100644 --- a/pixz.c +++ b/pixz.c @@ -110,11 +110,13 @@ int main(int argc, char **argv) { die("Can't open input file"); if (opath && !(gOutFile = fopen(opath, "w"))) die("Can't open output file"); - if (op != OP_LIST && isatty(fileno(gOutFile)) == 1) - usage("Refusing to output to a TTY"); switch (op) { - case OP_WRITE: pixz_write(tar, level); break; + case OP_WRITE: + if (isatty(fileno(gOutFile)) == -1) + usage("Refusing to output to a TTY"); + pixz_write(tar, level); + break; case OP_READ: pixz_read(tar, 0, NULL); break; case OP_EXTRACT: pixz_read(tar, argc, argv); break; case OP_LIST: pixz_list(tar); From ea64c94c2182bb3347ca37d5425ea9d14c1f753f Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 07:38:42 -0400 Subject: [PATCH 04/22] We handle EOF fine now --- write.c | 1 - 1 file changed, 1 deletion(-) diff --git a/write.c b/write.c index e8186de..958dc1a 100644 --- a/write.c +++ b/write.c @@ -126,7 +126,6 @@ static void read_thread() { while (true) { int aerr = archive_read_next_header(ar, &entry); if (aerr == ARCHIVE_EOF) { - // TODO break; } else if (aerr != ARCHIVE_OK && aerr != ARCHIVE_WARN) { // Some charset translations warn spuriously From 91f044e569a9bf5f2ebf574bdaefcb02f185858f Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 07:39:10 -0400 Subject: [PATCH 05/22] Start adding non-indexed reading --- common.c | 6 +++-- pixz.h | 2 +- read.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/common.c b/common.c index ad291d2..a0e0b30 100644 --- a/common.c +++ b/common.c @@ -229,9 +229,10 @@ static void read_file_index_data(void) { } } -void decode_index(void) { +bool decode_index(void) { if (fseek(gInFile, -LZMA_STREAM_HEADER_SIZE, SEEK_END) == -1) - die("Error seeking to stream footer"); + return false; // not seekable + uint8_t hdrbuf[LZMA_STREAM_HEADER_SIZE]; if (fread(hdrbuf, LZMA_STREAM_HEADER_SIZE, 1, gInFile) != 1) die("Error reading stream footer"); @@ -261,6 +262,7 @@ void decode_index(void) { if (err != LZMA_OK && err != LZMA_STREAM_END) die("Error decoding index"); } + return true; } diff --git a/pixz.h b/pixz.h index d1e9239..ee45a2e 100644 --- a/pixz.h +++ b/pixz.h @@ -68,7 +68,7 @@ extern file_index_t *gFileIndex, *gLastFile; extern lzma_check gCheck; bool is_multi_header(const char *name); -void decode_index(void); +bool decode_index(void); // true on success lzma_vli find_file_index(void **bdatap); lzma_vli read_file_index(void); diff --git a/read.c b/read.c index 134a50d..08d7acd 100644 --- a/read.c +++ b/read.c @@ -33,6 +33,7 @@ typedef struct { static void *block_create(void); static void block_free(void *data); static void read_thread(void); +static void read_thread_noindex(void); static void decode_thread(size_t thnum); @@ -60,17 +61,18 @@ static void check_capacity(io_block_t *ib, size_t incap, size_t outcap); #pragma mark MAIN void pixz_read(bool verify, size_t nspecs, char **specs) { - decode_index(); - if (verify) - gFileIndexOffset = read_file_index(); - wanted_files(nspecs, specs); + if (0 && decode_index()) { // FIXME + if (verify) + gFileIndexOffset = read_file_index(); + wanted_files(nspecs, specs); + } #if DEBUG for (wanted_t *w = gWantedFiles; w; w = w->next) debug("want: %s", w->name); #endif - pipeline_create(block_create, block_free, read_thread, decode_thread); + pipeline_create(block_create, block_free, read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { gArWanted = gWantedFiles; wanted_t *w = gWantedFiles, *wlast = NULL; @@ -227,7 +229,7 @@ static void wanted_files(size_t count, char **specs) { static void check_capacity(io_block_t *ib, size_t incap, size_t outcap) { if (incap > ib->incap) { ib->incap = incap; - ib->input = malloc(incap); + ib->input = realloc(ib->input, incap); } if (outcap > ib->outcap) { ib->outcap = outcap; @@ -235,6 +237,66 @@ static void check_capacity(io_block_t *ib, size_t incap, size_t outcap) { } } +static void read_thread_noindex(void) { + size_t bytes; + lzma_ret err; + + // Read the header + uint8_t stream_header[LZMA_STREAM_HEADER_SIZE]; + bytes = fread(stream_header, 1, LZMA_STREAM_HEADER_SIZE, gInFile); + if (bytes != LZMA_STREAM_HEADER_SIZE) + die("Error reading stream header"); + lzma_stream_flags stream_flags; + err = lzma_stream_header_decode(&stream_flags, stream_header); + if (err == LZMA_FORMAT_ERROR) + die("Not an XZ file"); + else if (err != LZMA_OK) + die("Error decoding XZ header"); + gCheck = stream_flags.check; + + lzma_filter filters[LZMA_FILTERS_MAX + 1]; + lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; + while (true) { + // Get pipeline item + pipeline_item_t *pi; + queue_pop(gPipelineStartQ, (void**)&pi); + io_block_t *ib = (io_block_t*)(pi->data); + check_capacity(ib, LZMA_BLOCK_HEADER_SIZE_MAX, 0); + + // Check for index + if (fread(ib->input, 1, 1, gInFile) != 1) + die("Error reading block header size"); + if (ib->input[0] == 0) + break; // Found the index + + // Decode header + block.header_size = lzma_block_header_size_decode(ib->input[0]); + if (block.header_size > LZMA_BLOCK_HEADER_SIZE_MAX) + die("Block header size too large"); + size_t rest = block.header_size - 1; + if (fread(ib->input + 1, 1, rest, gInFile) != rest) + die("Error reading block header"); + if (lzma_block_header_decode(&block, NULL, ib->input) != LZMA_OK) + die("Error decoding block header"); + + lzma_vli comp = block.compressed_size; + ib->insize = lzma_block_total_size(&block); + ib->outsize = block.uncompressed_size; + if (comp == LZMA_VLI_UNKNOWN || ib->outsize == LZMA_VLI_UNKNOWN) + die("No sizes in header!!!"); // FIXME: streaming; file index + check_capacity(ib, ib->insize, ib->outsize); + + rest = ib->insize - block.header_size; + bytes = fread(ib->input + block.header_size, 1, rest, gInFile); + if (bytes != rest) + die("Error reading block contents"); + pipeline_split(pi); + } + + pipeline_stop(); + // FIXME: don't output the pixz file index! heuristic? +} + static void read_thread(void) { off_t offset = ftello(gInFile); wanted_t *w = gWantedFiles; From 7820ec52b9be080a9740659a7ed45031554b7479 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 09:15:42 -0400 Subject: [PATCH 06/22] cleanup --- write.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/write.c b/write.c index 958dc1a..2c291e6 100644 --- a/write.c +++ b/write.c @@ -93,10 +93,9 @@ void pixz_write(bool tar, uint32_t level) { } // file index - if (gTar) { + if (gTar) write_file_index(); - free_file_index(); - } + free_file_index(); // post-block cleanup: index, footer encode_index(); From a6d82ed79edffbc4f2d23e2546c99f71992db685 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 09:25:25 -0400 Subject: [PATCH 07/22] Add FIXMEs --- read.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/read.c b/read.c index 08d7acd..697d726 100644 --- a/read.c +++ b/read.c @@ -74,6 +74,8 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { pipeline_create(block_create, block_free, read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { + // FIXME: verify this works with noindex/streamed reading + // FIXME: don't stop on End Of Archive gArWanted = gWantedFiles; wanted_t *w = gWantedFiles, *wlast = NULL; bool lastmulti = false; From 8e1efb824eebf04a4a10b46d020df3ba44698faa Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 14 Oct 2012 09:48:48 -0400 Subject: [PATCH 08/22] Fix pragma marks --- read.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/read.c b/read.c index 697d726..9eb1384 100644 --- a/read.c +++ b/read.c @@ -226,7 +226,7 @@ static void wanted_files(size_t count, char **specs) { } -#pragma mark THREADS +#pragma mark READ static void check_capacity(io_block_t *ib, size_t incap, size_t outcap) { if (incap > ib->incap) { @@ -266,7 +266,7 @@ static void read_thread_noindex(void) { check_capacity(ib, LZMA_BLOCK_HEADER_SIZE_MAX, 0); // Check for index - if (fread(ib->input, 1, 1, gInFile) != 1) + if (ib->insize < 1 && fread(ib->input, 1, 1, gInFile) != 1) die("Error reading block header size"); if (ib->input[0] == 0) break; // Found the index @@ -348,6 +348,8 @@ static void read_thread(void) { pipeline_stop(); } +#pragma mark DECODE + static void decode_thread(size_t thnum) { lzma_stream stream = LZMA_STREAM_INIT; lzma_filter filters[LZMA_FILTERS_MAX + 1]; From 9694d22dcd2ada24d67e0a6fe696c2b7cc8a0be5 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sat, 20 Oct 2012 21:07:49 -0400 Subject: [PATCH 09/22] Declare start of read buffering infrastructure --- read.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/read.c b/read.c index 9eb1384..dd143df 100644 --- a/read.c +++ b/read.c @@ -51,12 +51,23 @@ static bool tar_next_block(void); static void tar_write_last(void); +#pragma mark DECLARE READ BUFFER + +static pipeline_item_t *gRbufPI = NULL; +static io_block_t *gRbuf = NULL; + +static void block_capacity(io_block_t *ib, size_t incap, size_t outcap); +static void stream_write(pipeline_item_t *pi); + +static ssize_t rbuf_read(size_t bytes); +static void rbuf_consume(size_t bytes); +static void rbuf_dispatch(); + + #pragma mark DECLARE UTILS static lzma_vli gFileIndexOffset = 0; -static void check_capacity(io_block_t *ib, size_t incap, size_t outcap); - #pragma mark MAIN @@ -228,7 +239,7 @@ static void wanted_files(size_t count, char **specs) { #pragma mark READ -static void check_capacity(io_block_t *ib, size_t incap, size_t outcap) { +static void block_capacity(io_block_t *ib, size_t incap, size_t outcap) { if (incap > ib->incap) { ib->incap = incap; ib->input = realloc(ib->input, incap); @@ -263,7 +274,7 @@ static void read_thread_noindex(void) { pipeline_item_t *pi; queue_pop(gPipelineStartQ, (void**)&pi); io_block_t *ib = (io_block_t*)(pi->data); - check_capacity(ib, LZMA_BLOCK_HEADER_SIZE_MAX, 0); + block_capacity(ib, LZMA_BLOCK_HEADER_SIZE_MAX, 0); // Check for index if (ib->insize < 1 && fread(ib->input, 1, 1, gInFile) != 1) @@ -286,7 +297,7 @@ static void read_thread_noindex(void) { ib->outsize = block.uncompressed_size; if (comp == LZMA_VLI_UNKNOWN || ib->outsize == LZMA_VLI_UNKNOWN) die("No sizes in header!!!"); // FIXME: streaming; file index - check_capacity(ib, ib->insize, ib->outsize); + block_capacity(ib, ib->insize, ib->outsize); rest = ib->insize - block.header_size; bytes = fread(ib->input + block.header_size, 1, rest, gInFile); @@ -328,7 +339,7 @@ static void read_thread(void) { pipeline_item_t *pi; queue_pop(gPipelineStartQ, (void**)&pi); io_block_t *ib = (io_block_t*)(pi->data); - check_capacity(ib, iter.block.unpadded_size, + block_capacity(ib, iter.block.unpadded_size, iter.block.uncompressed_size); // Seek if needed, and get the data From 70a3c58520c0254eb58511bc09d76e32c3e53dc1 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sat, 20 Oct 2012 21:54:17 -0400 Subject: [PATCH 10/22] Use the read buffer --- common.c | 8 ++++-- pixz.h | 1 + read.c | 79 +++++++++++++++++++++++++++++++++++++------------------- 3 files changed, 60 insertions(+), 28 deletions(-) diff --git a/common.c b/common.c index a0e0b30..2f0f051 100644 --- a/common.c +++ b/common.c @@ -437,10 +437,14 @@ void pipeline_destroy(void) { free(gPLProcessThreads); } -void pipeline_split(pipeline_item_t *item) { +void pipeline_dispatch(pipeline_item_t *item, queue_t *q) { item->seq = gPLSplitSeq++; item->next = NULL; - queue_push(gPipelineSplitQ, PIPELINE_ITEM, item); + queue_push(q, PIPELINE_ITEM, item); +} + +void pipeline_split(pipeline_item_t *item) { + pipeline_dispatch(item, gPipelineSplitQ); } pipeline_item_t *pipeline_merged() { diff --git a/pixz.h b/pixz.h index ee45a2e..7e7b5fc 100644 --- a/pixz.h +++ b/pixz.h @@ -135,5 +135,6 @@ void pipeline_create( void pipeline_stop(void); void pipeline_destroy(void); +void pipeline_dispatch(pipeline_item_t *item, queue_t *q); void pipeline_split(pipeline_item_t *item); pipeline_item_t *pipeline_merged(); diff --git a/read.c b/read.c index dd143df..9d71005 100644 --- a/read.c +++ b/read.c @@ -57,11 +57,14 @@ static pipeline_item_t *gRbufPI = NULL; static io_block_t *gRbuf = NULL; static void block_capacity(io_block_t *ib, size_t incap, size_t outcap); -static void stream_write(pipeline_item_t *pi); -static ssize_t rbuf_read(size_t bytes); +typedef enum { + RBUF_ERR, RBUF_EOF, RBUF_PART, RBUF_FULL +} rbuf_read_status; + +static rbuf_read_status rbuf_read(size_t bytes); static void rbuf_consume(size_t bytes); -static void rbuf_dispatch(); +static void rbuf_dispatch(void); #pragma mark DECLARE UTILS @@ -250,6 +253,40 @@ static void block_capacity(io_block_t *ib, size_t incap, size_t outcap) { } } +// Ensure at least this many bytes available +// Return 1 on success, zero on EOF, -1 on error +static rbuf_read_status rbuf_read(size_t bytes) { + if (!gRbufPI) { + queue_pop(gPipelineStartQ, (void**)&gRbufPI); + gRbuf = (io_block_t*)(gRbufPI->data); + gRbuf->insize = gRbuf->outsize = 0; + } + + if (gRbuf->insize >= bytes) + return RBUF_FULL; + + block_capacity(gRbuf, bytes, 0); + size_t r = fread(gRbuf->input + gRbuf->insize, 1, bytes - gRbuf->insize, + gInFile); + gRbuf->insize += r; + + if (r) + return (gRbuf->insize == bytes) ? RBUF_FULL : RBUF_PART; + return feof(gInFile) ? RBUF_EOF : RBUF_ERR; +} + +static void rbuf_consume(size_t bytes) { + if (bytes < gRbuf->insize) + memmove(gRbuf->input, gRbuf->input + bytes, gRbuf->insize - bytes); + gRbuf->insize -= bytes; +} + +static void rbuf_dispatch(void) { + pipeline_split(gRbufPI); + gRbufPI = NULL; + gRbuf = NULL; +} + static void read_thread_noindex(void) { size_t bytes; lzma_ret err; @@ -270,40 +307,30 @@ static void read_thread_noindex(void) { lzma_filter filters[LZMA_FILTERS_MAX + 1]; lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; while (true) { - // Get pipeline item - pipeline_item_t *pi; - queue_pop(gPipelineStartQ, (void**)&pi); - io_block_t *ib = (io_block_t*)(pi->data); - block_capacity(ib, LZMA_BLOCK_HEADER_SIZE_MAX, 0); - // Check for index - if (ib->insize < 1 && fread(ib->input, 1, 1, gInFile) != 1) + if (rbuf_read(1) != RBUF_FULL) die("Error reading block header size"); - if (ib->input[0] == 0) - break; // Found the index - + if (gRbuf->input[0] == 0) + break; // Found the index. FIXME: multi-stream? + // Decode header - block.header_size = lzma_block_header_size_decode(ib->input[0]); + block.header_size = lzma_block_header_size_decode(gRbuf->input[0]); if (block.header_size > LZMA_BLOCK_HEADER_SIZE_MAX) die("Block header size too large"); - size_t rest = block.header_size - 1; - if (fread(ib->input + 1, 1, rest, gInFile) != rest) + if (rbuf_read(block.header_size) != RBUF_FULL) die("Error reading block header"); - if (lzma_block_header_decode(&block, NULL, ib->input) != LZMA_OK) + if (lzma_block_header_decode(&block, NULL, gRbuf->input) != LZMA_OK) die("Error decoding block header"); - lzma_vli comp = block.compressed_size; - ib->insize = lzma_block_total_size(&block); - ib->outsize = block.uncompressed_size; - if (comp == LZMA_VLI_UNKNOWN || ib->outsize == LZMA_VLI_UNKNOWN) + size_t comp = block.compressed_size, outsize = block.uncompressed_size; + if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) die("No sizes in header!!!"); // FIXME: streaming; file index - block_capacity(ib, ib->insize, ib->outsize); + block_capacity(gRbuf, 0, outsize); + gRbuf->outsize = outsize; - rest = ib->insize - block.header_size; - bytes = fread(ib->input + block.header_size, 1, rest, gInFile); - if (bytes != rest) + if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) die("Error reading block contents"); - pipeline_split(pi); + rbuf_dispatch(); } pipeline_stop(); From 4474af5419fdf588131a13c70422e8543edb2749 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sat, 20 Oct 2012 22:32:20 -0400 Subject: [PATCH 11/22] Streaming read works --- read.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/read.c b/read.c index 9d71005..b17a5fc 100644 --- a/read.c +++ b/read.c @@ -53,6 +53,8 @@ static void tar_write_last(void); #pragma mark DECLARE READ BUFFER +#define STREAMSIZE (1024 * 1024) + static pipeline_item_t *gRbufPI = NULL; static io_block_t *gRbuf = NULL; @@ -66,6 +68,8 @@ static rbuf_read_status rbuf_read(size_t bytes); static void rbuf_consume(size_t bytes); static void rbuf_dispatch(void); +static void read_streaming(lzma_block *block); + #pragma mark DECLARE UTILS @@ -287,11 +291,57 @@ static void rbuf_dispatch(void) { gRbuf = NULL; } +static void read_streaming(lzma_block *block) { + lzma_stream stream = LZMA_STREAM_INIT; + if (lzma_block_decoder(&stream, block) != LZMA_OK) + die("Error initializing streaming block decode"); + stream.next_in = gRbuf->input + block->header_size; + stream.avail_in = gRbuf->insize - block->header_size; + stream.avail_out = 0; + + pipeline_item_t *pi = NULL; + io_block_t *ib = NULL; + + lzma_ret err = LZMA_OK; + while (err != LZMA_STREAM_END) { + if (err != LZMA_OK) + die("Error decoding streaming block"); + + if (stream.avail_out == 0) { + if (ib) { + ib->outsize = ib->outcap; + pipeline_dispatch(pi, gPipelineMergeQ); + } + queue_pop(gPipelineStartQ, (void**)&pi); + ib = (io_block_t*)pi->data; + block_capacity(ib, 0, STREAMSIZE); + stream.next_out = ib->output; + stream.avail_out = ib->outcap; + } + if (stream.avail_in == 0) { + rbuf_consume(gRbuf->insize); + if (rbuf_read(CHUNKSIZE) < RBUF_PART) + die("Error reading streaming block contents"); + stream.next_in = gRbuf->input; + stream.avail_in = gRbuf->insize; + } + + err = lzma_code(&stream, LZMA_RUN); + } + + if (ib && stream.avail_out != ib->outcap) { + ib->outsize = ib->outcap - stream.avail_out; + pipeline_dispatch(pi, gPipelineMergeQ); + } + rbuf_consume(gRbuf->insize - stream.avail_in); + lzma_end(&stream); +} + static void read_thread_noindex(void) { size_t bytes; lzma_ret err; - // Read the header + // Stream header uint8_t stream_header[LZMA_STREAM_HEADER_SIZE]; bytes = fread(stream_header, 1, LZMA_STREAM_HEADER_SIZE, gInFile); if (bytes != LZMA_STREAM_HEADER_SIZE) @@ -307,13 +357,11 @@ static void read_thread_noindex(void) { lzma_filter filters[LZMA_FILTERS_MAX + 1]; lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; while (true) { - // Check for index if (rbuf_read(1) != RBUF_FULL) die("Error reading block header size"); if (gRbuf->input[0] == 0) break; // Found the index. FIXME: multi-stream? - // Decode header block.header_size = lzma_block_header_size_decode(gRbuf->input[0]); if (block.header_size > LZMA_BLOCK_HEADER_SIZE_MAX) die("Block header size too large"); @@ -323,14 +371,16 @@ static void read_thread_noindex(void) { die("Error decoding block header"); size_t comp = block.compressed_size, outsize = block.uncompressed_size; - if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) - die("No sizes in header!!!"); // FIXME: streaming; file index - block_capacity(gRbuf, 0, outsize); - gRbuf->outsize = outsize; + if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) { + read_streaming(&block); + } else { + block_capacity(gRbuf, 0, outsize); + gRbuf->outsize = outsize; - if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) - die("Error reading block contents"); - rbuf_dispatch(); + if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) + die("Error reading block contents"); + rbuf_dispatch(); + } } pipeline_stop(); From aa79e8795655890e0311868aa2e2454896462e44 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sat, 20 Oct 2012 23:30:07 -0400 Subject: [PATCH 12/22] Multiple streams are supported --- read.c | 169 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 118 insertions(+), 51 deletions(-) diff --git a/read.c b/read.c index b17a5fc..592fc66 100644 --- a/read.c +++ b/read.c @@ -65,10 +65,15 @@ typedef enum { } rbuf_read_status; static rbuf_read_status rbuf_read(size_t bytes); +static bool rbuf_cycle(lzma_stream *stream, bool start, size_t skip); static void rbuf_consume(size_t bytes); static void rbuf_dispatch(void); +static bool read_header(void); +static bool read_block(void); static void read_streaming(lzma_block *block); +static void read_index(void); +static void read_footer(void); #pragma mark DECLARE UTILS @@ -179,6 +184,7 @@ static void wanted_free(wanted_t *w) { } } + static bool spec_match(char *spec, char *name) { bool match = true; for (; *spec; ++spec, ++name) { @@ -279,6 +285,17 @@ static rbuf_read_status rbuf_read(size_t bytes) { return feof(gInFile) ? RBUF_EOF : RBUF_ERR; } +static bool rbuf_cycle(lzma_stream *stream, bool start, size_t skip) { + if (!start) { + rbuf_consume(gRbuf->insize); + if (rbuf_read(CHUNKSIZE) < RBUF_PART) + return false; + } + stream->next_in = gRbuf->input + skip; + stream->avail_in = gRbuf->insize - skip; + return true; +} + static void rbuf_consume(size_t bytes) { if (bytes < gRbuf->insize) memmove(gRbuf->input, gRbuf->input + bytes, gRbuf->insize - bytes); @@ -291,12 +308,60 @@ static void rbuf_dispatch(void) { gRbuf = NULL; } + +static bool read_header(void) { + lzma_stream_flags stream_flags; + rbuf_read_status st = rbuf_read(LZMA_STREAM_HEADER_SIZE); + if (st == RBUF_EOF) + return false; + else if (st != RBUF_FULL) + die("Error reading stream header"); + lzma_ret err = lzma_stream_header_decode(&stream_flags, gRbuf->input); + if (err == LZMA_FORMAT_ERROR) + die("Not an XZ file"); + else if (err != LZMA_OK) + die("Error decoding XZ header"); + gCheck = stream_flags.check; + rbuf_consume(LZMA_STREAM_HEADER_SIZE); + return true; +} + +static bool read_block(void) { + lzma_filter filters[LZMA_FILTERS_MAX + 1]; + lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; + + if (rbuf_read(1) != RBUF_FULL) + die("Error reading block header size"); + if (gRbuf->input[0] == 0) + return false; + + block.header_size = lzma_block_header_size_decode(gRbuf->input[0]); + if (block.header_size > LZMA_BLOCK_HEADER_SIZE_MAX) + die("Block header size too large"); + if (rbuf_read(block.header_size) != RBUF_FULL) + die("Error reading block header"); + if (lzma_block_header_decode(&block, NULL, gRbuf->input) != LZMA_OK) + die("Error decoding block header"); + + size_t comp = block.compressed_size, outsize = block.uncompressed_size; + if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) { + read_streaming(&block); + } else { + block_capacity(gRbuf, 0, outsize); + gRbuf->outsize = outsize; + + if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) + die("Error reading block contents"); + rbuf_dispatch(); + } + return true; +} + static void read_streaming(lzma_block *block) { lzma_stream stream = LZMA_STREAM_INIT; if (lzma_block_decoder(&stream, block) != LZMA_OK) die("Error initializing streaming block decode"); - stream.next_in = gRbuf->input + block->header_size; - stream.avail_in = gRbuf->insize - block->header_size; + rbuf_cycle(&stream, true, block->header_size); stream.avail_out = 0; pipeline_item_t *pi = NULL; @@ -318,13 +383,8 @@ static void read_streaming(lzma_block *block) { stream.next_out = ib->output; stream.avail_out = ib->outcap; } - if (stream.avail_in == 0) { - rbuf_consume(gRbuf->insize); - if (rbuf_read(CHUNKSIZE) < RBUF_PART) - die("Error reading streaming block contents"); - stream.next_in = gRbuf->input; - stream.avail_in = gRbuf->insize; - } + if (stream.avail_in == 0 && !rbuf_cycle(&stream, false, 0)) + die("Error reading streaming block"); err = lzma_code(&stream, LZMA_RUN); } @@ -337,54 +397,61 @@ static void read_streaming(lzma_block *block) { lzma_end(&stream); } -static void read_thread_noindex(void) { - size_t bytes; - lzma_ret err; +static void read_index(void) { + // FIXME: verify it matches the blocks? + lzma_stream stream = LZMA_STREAM_INIT; + lzma_index *index; + if (lzma_index_decoder(&stream, &index, MEMLIMIT) != LZMA_OK) + die("Error initializing index decoder"); + rbuf_cycle(&stream, true, 0); - // Stream header - uint8_t stream_header[LZMA_STREAM_HEADER_SIZE]; - bytes = fread(stream_header, 1, LZMA_STREAM_HEADER_SIZE, gInFile); - if (bytes != LZMA_STREAM_HEADER_SIZE) - die("Error reading stream header"); + lzma_ret err = LZMA_OK; + while (err != LZMA_STREAM_END) { + if (err != LZMA_OK) + die("Error decoding index"); + if (stream.avail_in == 0 && !rbuf_cycle(&stream, false, 0)) + die("Error reading index"); + err = lzma_code(&stream, LZMA_RUN); + } + rbuf_consume(gRbuf->insize - stream.avail_in); + lzma_end(&stream); +} + +static void read_footer(void) { + // FIXME: compare with header? lzma_stream_flags stream_flags; - err = lzma_stream_header_decode(&stream_flags, stream_header); - if (err == LZMA_FORMAT_ERROR) - die("Not an XZ file"); - else if (err != LZMA_OK) - die("Error decoding XZ header"); - gCheck = stream_flags.check; + if (rbuf_read(LZMA_STREAM_HEADER_SIZE) != RBUF_FULL) + die("Error reading stream footer"); + if (lzma_stream_footer_decode(&stream_flags, gRbuf->input) != LZMA_OK) + die("Error decoding XZ footer"); + rbuf_consume(LZMA_STREAM_HEADER_SIZE); - lzma_filter filters[LZMA_FILTERS_MAX + 1]; - lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; + char zeros[4] = "\0\0\0\0"; while (true) { - if (rbuf_read(1) != RBUF_FULL) - die("Error reading block header size"); - if (gRbuf->input[0] == 0) - break; // Found the index. FIXME: multi-stream? - - block.header_size = lzma_block_header_size_decode(gRbuf->input[0]); - if (block.header_size > LZMA_BLOCK_HEADER_SIZE_MAX) - die("Block header size too large"); - if (rbuf_read(block.header_size) != RBUF_FULL) - die("Error reading block header"); - if (lzma_block_header_decode(&block, NULL, gRbuf->input) != LZMA_OK) - die("Error decoding block header"); - - size_t comp = block.compressed_size, outsize = block.uncompressed_size; - if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) { - read_streaming(&block); - } else { - block_capacity(gRbuf, 0, outsize); - gRbuf->outsize = outsize; - - if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) - die("Error reading block contents"); - rbuf_dispatch(); - } + rbuf_read_status st = rbuf_read(4); + if (st == RBUF_EOF) + return; + if (st != RBUF_FULL) + die("Footer must be multiple of four bytes"); + if (memcmp(zeros, gRbuf->input, 4) != 0) + return; + rbuf_consume(4); } - +} + +static void read_thread_noindex(void) { + bool empty = true; + while (read_header()) { + empty = false; + while (read_block()) + ; // pass + read_index(); + read_footer(); + // FIXME: don't output the pixz file index! heuristic? + } + if (empty) + die("Empty input"); pipeline_stop(); - // FIXME: don't output the pixz file index! heuristic? } static void read_thread(void) { From 28e0515d75b1aafd0c8a07e774fadf63ba514484 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 4 Nov 2012 19:48:28 -0500 Subject: [PATCH 13/22] Start factoring out index decoding --- common.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/common.c b/common.c index 2f0f051..a6da1f6 100644 --- a/common.c +++ b/common.c @@ -229,39 +229,112 @@ static void read_file_index_data(void) { } } -bool decode_index(void) { - if (fseek(gInFile, -LZMA_STREAM_HEADER_SIZE, SEEK_END) == -1) - return false; // not seekable + +#define BWCHUNK 512 + +typedef struct { + uint8_t buf[BWCHUNK]; + off_t pos; + size_t size; +} bw; + +static uint32_t *bw_read(bw *b) { + size_t sz = sizeof(uint32_t); + if (b->size < sz) { + if (b->pos < sz) + return NULL; // EOF + b->size = (b->pos > BWCHUNK) ? BWCHUNK : b->pos; + b->pos -= b->size; + if (fseeko(gInFile, b->pos, SEEK_SET) == -1) + return NULL; + if (fread(b->buf, b->size, 1, gInFile) != 1) + return NULL; + } + + b->size -= sz; + return &((uint32_t*)b->buf)[b->size / sz]; +} + +static off_t stream_padding(bw *b, off_t pos) { + b->pos = pos; + b->size = 0; + + for (off_t pad = 0; true; ++pad) { + uint32_t *i = bw_read(b); + if (!i) + die("Error reading stream padding"); + if (*i != 0) { + b->size += sizeof(uint32_t); + return pad; + } + } +} + +static void stream_footer(bw *b, lzma_stream_flags *flags) { + uint8_t ftr[LZMA_STREAM_HEADER_SIZE]; + for (int i = sizeof(ftr) / sizeof(uint32_t) - 1; i >= 0; --i) { + uint32_t *p = bw_read(b); + if (!p) + die("Error reading stream footer"); + *((uint32_t*)ftr + i) = *p; + } - uint8_t hdrbuf[LZMA_STREAM_HEADER_SIZE]; - if (fread(hdrbuf, LZMA_STREAM_HEADER_SIZE, 1, gInFile) != 1) - die("Error reading stream footer"); - lzma_stream_flags flags; - if (lzma_stream_footer_decode(&flags, hdrbuf) != LZMA_OK) + if (lzma_stream_footer_decode(flags, ftr) != LZMA_OK) die("Error decoding stream footer"); - - gCheck = flags.check; - size_t index_seek = -LZMA_STREAM_HEADER_SIZE - flags.backward_size; - if (fseek(gInFile, index_seek, SEEK_CUR) == -1) + gCheck = flags->check; // FIXME: multiple streams +} + +static lzma_index *next_index(off_t *pos) { + bw b; + off_t pad = stream_padding(&b, *pos); + off_t eos = *pos - pad; + + lzma_stream_flags flags; + stream_footer(&b, &flags); + *pos = eos - LZMA_STREAM_HEADER_SIZE - flags.backward_size; + if (fseeko(gInFile, *pos, SEEK_SET) == -1) die("Error seeking to index"); - if (lzma_index_decoder(&gStream, &gIndex, MEMLIMIT) != LZMA_OK) + + lzma_stream strm = LZMA_STREAM_INIT; + lzma_index *index; + if (lzma_index_decoder(&strm, &index, MEMLIMIT) != LZMA_OK) die("Error creating index decoder"); uint8_t ibuf[CHUNKSIZE]; - gStream.avail_in = 0; + strm.avail_in = 0; lzma_ret err = LZMA_OK; while (err != LZMA_STREAM_END) { - if (gStream.avail_in == 0) { - gStream.avail_in = fread(ibuf, 1, CHUNKSIZE, gInFile); + if (strm.avail_in == 0) { + strm.avail_in = fread(ibuf, 1, CHUNKSIZE, gInFile); if (ferror(gInFile)) die("Error reading index"); - gStream.next_in = ibuf; + strm.next_in = ibuf; } - err = lzma_code(&gStream, LZMA_RUN); + err = lzma_code(&strm, LZMA_RUN); if (err != LZMA_OK && err != LZMA_STREAM_END) die("Error decoding index"); } + + *pos = eos - lzma_index_stream_size(index); + if (fseeko(gInFile, *pos, SEEK_SET) == -1) + die("Error seeking to beginning of stream"); + + + if (lzma_index_stream_flags(index, &flags) != LZMA_OK) + die("Error setting stream flags"); + if (lzma_index_stream_padding(index, pad) != LZMA_OK) + die("Error setting stream padding"); + return index; +} + +bool decode_index(void) { + if (fseeko(gInFile, 0, SEEK_END) == -1) + return false; // not seekable + off_t pos = ftello(gInFile); + + gIndex = next_index(&pos); + return true; } From 4ef79b67d6b71b1a5c68048edc5155a888da2604 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 4 Nov 2012 20:11:49 -0500 Subject: [PATCH 14/22] Reading multiple indices ok! --- common.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/common.c b/common.c index a6da1f6..f5e3073 100644 --- a/common.c +++ b/common.c @@ -259,7 +259,7 @@ static off_t stream_padding(bw *b, off_t pos) { b->pos = pos; b->size = 0; - for (off_t pad = 0; true; ++pad) { + for (off_t pad = 0; true; pad += sizeof(uint32_t)) { uint32_t *i = bw_read(b); if (!i) die("Error reading stream padding"); @@ -333,7 +333,13 @@ bool decode_index(void) { return false; // not seekable off_t pos = ftello(gInFile); - gIndex = next_index(&pos); + gIndex = NULL; + while (pos > 0) { + lzma_index *index = next_index(&pos); + if (gIndex && lzma_index_cat(index, gIndex, NULL) != LZMA_OK) + die("Error concatenating indices"); + gIndex = index; + } return true; } From dd86134d64d7fd044319fdec033ac540fdc85878 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 4 Nov 2012 21:15:55 -0500 Subject: [PATCH 15/22] Remove global gCheck --- common.c | 73 ++++++++++++++++++++++++++++---------------------------- pixz.h | 6 ----- read.c | 26 ++++++++++++-------- 3 files changed, 53 insertions(+), 52 deletions(-) diff --git a/common.c b/common.c index f5e3073..334dc0b 100644 --- a/common.c +++ b/common.c @@ -5,16 +5,9 @@ #pragma mark UTILS -typedef struct { - lzma_block block; - lzma_filter filters[LZMA_FILTERS_MAX + 1]; -} block_wrapper_t; - FILE *gInFile = NULL; lzma_stream gStream = LZMA_STREAM_INIT; -lzma_check gCheck = LZMA_CHECK_NONE; - void die(const char *fmt, ...) { va_list args; @@ -36,32 +29,6 @@ char *xstrdup(const char *s) { return memcpy(r, s, len + 1); } -void *decode_block_start(off_t block_seek) { - if (fseeko(gInFile, block_seek, SEEK_SET) == -1) - die("Error seeking to block"); - - // Some memory in which to keep the discovered filters safe - block_wrapper_t *bw = malloc(sizeof(block_wrapper_t)); - bw->block = (lzma_block){ .check = gCheck, .filters = bw->filters, - .version = 0 }; - - int b = fgetc(gInFile); - if (b == EOF || b == 0) - die("Error reading block size"); - bw->block.header_size = lzma_block_header_size_decode(b); - uint8_t hdrbuf[bw->block.header_size]; - hdrbuf[0] = (uint8_t)b; - if (fread(hdrbuf + 1, bw->block.header_size - 1, 1, gInFile) != 1) - die("Error reading block header"); - if (lzma_block_header_decode(&bw->block, NULL, hdrbuf) != LZMA_OK) - die("Error decoding file index block header"); - - if (lzma_block_decoder(&gStream, &bw->block) != LZMA_OK) - die("Error initializing file index stream"); - - return bw; -} - bool is_multi_header(const char *name) { size_t i = strlen(name); while (i != 0 && name[i - 1] != '/') @@ -82,6 +49,9 @@ static lzma_ret gFIBErr = LZMA_OK; static uint8_t gFIBInputBuf[CHUNKSIZE]; static size_t gMoved = 0; +static void *decode_file_index_start(off_t block_seek, lzma_check check); +static lzma_vli find_file_index(void **bdatap); + static char *read_file_index_name(void); static void read_file_index_make_space(void); static void read_file_index_data(void); @@ -109,7 +79,38 @@ void free_file_index(void) { gFileIndex = gLastFile = NULL; } -lzma_vli find_file_index(void **bdatap) { +typedef struct { + lzma_block block; + lzma_filter filters[LZMA_FILTERS_MAX + 1]; +} block_wrapper_t; + +static void *decode_file_index_start(off_t block_seek, lzma_check check) { + if (fseeko(gInFile, block_seek, SEEK_SET) == -1) + die("Error seeking to block"); + + // Some memory in which to keep the discovered filters safe + block_wrapper_t *bw = malloc(sizeof(block_wrapper_t)); + bw->block = (lzma_block){ .check = check, .filters = bw->filters, + .version = 0 }; + + int b = fgetc(gInFile); + if (b == EOF || b == 0) + die("Error reading block size"); + bw->block.header_size = lzma_block_header_size_decode(b); + uint8_t hdrbuf[bw->block.header_size]; + hdrbuf[0] = (uint8_t)b; + if (fread(hdrbuf + 1, bw->block.header_size - 1, 1, gInFile) != 1) + die("Error reading block header"); + if (lzma_block_header_decode(&bw->block, NULL, hdrbuf) != LZMA_OK) + die("Error decoding file index block header"); + + if (lzma_block_decoder(&gStream, &bw->block) != LZMA_OK) + die("Error initializing file index stream"); + + return bw; +} + +static lzma_vli find_file_index(void **bdatap) { if (!gIndex) decode_index(); @@ -119,7 +120,8 @@ lzma_vli find_file_index(void **bdatap) { lzma_vli loc = lzma_index_uncompressed_size(gIndex) - 1; if (lzma_index_iter_locate(&iter, loc)) die("Can't locate file index block"); - void *bdata = decode_block_start(iter.block.compressed_file_offset); + void *bdata = decode_file_index_start(iter.block.compressed_file_offset, + iter.stream.flags->check); gFileIndexBuf = malloc(gFIBSize); gStream.avail_out = gFIBSize; @@ -281,7 +283,6 @@ static void stream_footer(bw *b, lzma_stream_flags *flags) { if (lzma_stream_footer_decode(flags, ftr) != LZMA_OK) die("Error decoding stream footer"); - gCheck = flags->check; // FIXME: multiple streams } static lzma_index *next_index(off_t *pos) { diff --git a/pixz.h b/pixz.h index 7e7b5fc..b7c7d61 100644 --- a/pixz.h +++ b/pixz.h @@ -50,8 +50,6 @@ uint64_t xle64dec(const uint8_t *d); void xle64enc(uint8_t *d, uint64_t n); size_t num_threads(void); -void *decode_block_start(off_t block_seek); - #pragma mark INDEX @@ -64,13 +62,9 @@ struct file_index_t { extern file_index_t *gFileIndex, *gLastFile; -// As discovered from footer -extern lzma_check gCheck; - bool is_multi_header(const char *name); bool decode_index(void); // true on success -lzma_vli find_file_index(void **bdatap); lzma_vli read_file_index(void); void dump_file_index(FILE *out, bool verbose); void free_file_index(void); diff --git a/read.c b/read.c index 592fc66..eb25bfd 100644 --- a/read.c +++ b/read.c @@ -28,6 +28,7 @@ typedef struct { size_t incap, outcap; size_t insize, outsize; off_t uoffset; // uncompressed offset + lzma_check check; } io_block_t; static void *block_create(void); @@ -69,8 +70,8 @@ static bool rbuf_cycle(lzma_stream *stream, bool start, size_t skip); static void rbuf_consume(size_t bytes); static void rbuf_dispatch(void); -static bool read_header(void); -static bool read_block(void); +static bool read_header(lzma_check *check); +static bool read_block(lzma_check check); static void read_streaming(lzma_block *block); static void read_index(void); static void read_footer(void); @@ -309,7 +310,7 @@ static void rbuf_dispatch(void) { } -static bool read_header(void) { +static bool read_header(lzma_check *check) { lzma_stream_flags stream_flags; rbuf_read_status st = rbuf_read(LZMA_STREAM_HEADER_SIZE); if (st == RBUF_EOF) @@ -321,14 +322,14 @@ static bool read_header(void) { die("Not an XZ file"); else if (err != LZMA_OK) die("Error decoding XZ header"); - gCheck = stream_flags.check; + *check = stream_flags.check; rbuf_consume(LZMA_STREAM_HEADER_SIZE); return true; } -static bool read_block(void) { +static bool read_block(lzma_check check) { lzma_filter filters[LZMA_FILTERS_MAX + 1]; - lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; + lzma_block block = { .filters = filters, .check = check, .version = 0 }; if (rbuf_read(1) != RBUF_FULL) die("Error reading block header size"); @@ -349,6 +350,7 @@ static bool read_block(void) { } else { block_capacity(gRbuf, 0, outsize); gRbuf->outsize = outsize; + gRbuf->check = check; if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) die("Error reading block contents"); @@ -441,9 +443,10 @@ static void read_footer(void) { static void read_thread_noindex(void) { bool empty = true; - while (read_header()) { + lzma_check check = LZMA_CHECK_NONE; + while (read_header(&check)) { empty = false; - while (read_block()) + while (read_block(check)) ; // pass read_index(); read_footer(); @@ -496,6 +499,7 @@ static void read_thread(void) { die("Error reading block contents"); offset += bsize; ib->uoffset = iter.block.uncompressed_file_offset; + ib->check = iter.stream.flags->check; pipeline_split(pi); } @@ -508,7 +512,8 @@ static void read_thread(void) { static void decode_thread(size_t thnum) { lzma_stream stream = LZMA_STREAM_INIT; lzma_filter filters[LZMA_FILTERS_MAX + 1]; - lzma_block block = { .filters = filters, .check = gCheck, .version = 0 }; + lzma_block block = { .filters = filters, .check = LZMA_CHECK_NONE, + .version = 0 }; pipeline_item_t *pi; io_block_t *ib; @@ -517,7 +522,8 @@ static void decode_thread(size_t thnum) { ib = (io_block_t*)(pi->data); block.header_size = lzma_block_header_size_decode(*(ib->input)); - if (lzma_block_header_decode(&block, NULL, ib->input) != LZMA_OK) + block.check = ib->check; + if (lzma_block_header_decode(&block, NULL, ib->input) != LZMA_OK) die("Error decoding block header"); if (lzma_block_decoder(&stream, &block) != LZMA_OK) die("Error initializing block decode"); From dd5f6d01e3dad7d82a0478fff0325c34b5535c2b Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 4 Nov 2012 21:23:18 -0500 Subject: [PATCH 16/22] Enable both seekable and non-seekable modes --- list.c | 4 +++- read.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/list.c b/list.c index 9798d29..3dbf614 100644 --- a/list.c +++ b/list.c @@ -3,7 +3,9 @@ #pragma mark FUNCTION DEFINITIONS void pixz_list(bool tar) { - decode_index(); + if (!decode_index()) + die("Can't list non-seekable input"); + lzma_index_iter iter; lzma_index_iter_init(&iter, gIndex); diff --git a/read.c b/read.c index eb25bfd..5058b16 100644 --- a/read.c +++ b/read.c @@ -85,7 +85,7 @@ static lzma_vli gFileIndexOffset = 0; #pragma mark MAIN void pixz_read(bool verify, size_t nspecs, char **specs) { - if (0 && decode_index()) { // FIXME + if (decode_index()) { // FIXME if (verify) gFileIndexOffset = read_file_index(); wanted_files(nspecs, specs); @@ -96,7 +96,8 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { debug("want: %s", w->name); #endif - pipeline_create(block_create, block_free, read_thread_noindex, decode_thread); + pipeline_create(block_create, block_free, + gIndex ? read_thread : read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { // FIXME: verify this works with noindex/streamed reading // FIXME: don't stop on End Of Archive From 3d5d1f1d399df15557098167da9f2feafea66a4a Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 4 Nov 2012 23:27:59 -0500 Subject: [PATCH 17/22] Handle oversized blocks --- read.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/read.c b/read.c index 5058b16..b979783 100644 --- a/read.c +++ b/read.c @@ -55,6 +55,7 @@ static void tar_write_last(void); #pragma mark DECLARE READ BUFFER #define STREAMSIZE (1024 * 1024) +#define MAXSPLITSIZE (64 * 1024 * 1024) // xz -9 blocksize static pipeline_item_t *gRbufPI = NULL; static io_block_t *gRbuf = NULL; @@ -71,7 +72,7 @@ static void rbuf_consume(size_t bytes); static void rbuf_dispatch(void); static bool read_header(lzma_check *check); -static bool read_block(lzma_check check); +static bool read_block(bool force_stream, lzma_check check); static void read_streaming(lzma_block *block); static void read_index(void); static void read_footer(void); @@ -328,7 +329,7 @@ static bool read_header(lzma_check *check) { return true; } -static bool read_block(lzma_check check) { +static bool read_block(bool force_stream, lzma_check check) { lzma_filter filters[LZMA_FILTERS_MAX + 1]; lzma_block block = { .filters = filters, .check = check, .version = 0 }; @@ -346,7 +347,9 @@ static bool read_block(lzma_check check) { die("Error decoding block header"); size_t comp = block.compressed_size, outsize = block.uncompressed_size; - if (comp == LZMA_VLI_UNKNOWN || outsize == LZMA_VLI_UNKNOWN) { + if (force_stream || comp == LZMA_VLI_UNKNOWN + || outsize == LZMA_VLI_UNKNOWN + || outsize > MAXSPLITSIZE) { read_streaming(&block); } else { block_capacity(gRbuf, 0, outsize); @@ -447,7 +450,7 @@ static void read_thread_noindex(void) { lzma_check check = LZMA_CHECK_NONE; while (read_header(&check)) { empty = false; - while (read_block(check)) + while (read_block(false, check)) ; // pass read_index(); read_footer(); @@ -494,15 +497,22 @@ static void read_thread(void) { if (offset != boffset) { fseeko(gInFile, boffset, SEEK_SET); offset = boffset; - } - ib->insize = fread(ib->input, 1, bsize, gInFile); - if (ib->insize < bsize) - die("Error reading block contents"); - offset += bsize; - ib->uoffset = iter.block.uncompressed_file_offset; - ib->check = iter.stream.flags->check; + } + + if (iter.block.uncompressed_size > MAXSPLITSIZE) { // must stream + if (gRbuf) + rbuf_consume(gRbuf->insize); // clear + read_block(true, iter.stream.flags->check); + } else { + ib->insize = fread(ib->input, 1, bsize, gInFile); + if (ib->insize < bsize) + die("Error reading block contents"); + offset += bsize; + ib->uoffset = iter.block.uncompressed_file_offset; + ib->check = iter.stream.flags->check; - pipeline_split(pi); + pipeline_split(pi); + } } pipeline_stop(); From 0aa5ae7d1a2c3b65a948692845be72d700a531af Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Mon, 5 Nov 2012 01:10:32 -0500 Subject: [PATCH 18/22] Output post-tar data (if it won't interfere with wanted-file filtering) --- common.c | 2 +- read.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/common.c b/common.c index 334dc0b..8077541 100644 --- a/common.c +++ b/common.c @@ -342,7 +342,7 @@ bool decode_index(void) { gIndex = index; } - return true; + return (gIndex != NULL); } diff --git a/read.c b/read.c index b979783..a30e41c 100644 --- a/read.c +++ b/read.c @@ -45,6 +45,7 @@ static off_t gArLastOffset; static size_t gArLastSize; static wanted_t *gArWanted = NULL; static bool gArNextItem = false; +static bool gExplicitFiles = false; static int tar_ok(struct archive *ar, void *ref); static ssize_t tar_read(struct archive *ar, void *ref, const void **bufp); @@ -86,10 +87,11 @@ static lzma_vli gFileIndexOffset = 0; #pragma mark MAIN void pixz_read(bool verify, size_t nspecs, char **specs) { - if (decode_index()) { // FIXME + if (decode_index()) { if (verify) gFileIndexOffset = read_file_index(); - wanted_files(nspecs, specs); + wanted_files(nspecs, specs); + gExplicitFiles = nspecs; } #if DEBUG @@ -100,7 +102,6 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { pipeline_create(block_create, block_free, gIndex ? read_thread : read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { - // FIXME: verify this works with noindex/streamed reading // FIXME: don't stop on End Of Archive gArWanted = gWantedFiles; wanted_t *w = gWantedFiles, *wlast = NULL; @@ -146,7 +147,8 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { if (w && w->name) die("File %s missing in archive", w->name); tar_write_last(); // write whatever's left - } else { + } + if (!gExplicitFiles) { pipeline_item_t *pi; while ((pi = pipeline_merged())) { io_block_t *ib = (io_block_t*)(pi->data); @@ -475,7 +477,7 @@ static void read_thread(void) { continue; // Do we need this block? - if (gWantedFiles) { + if (gWantedFiles && gExplicitFiles) { off_t uend = iter.block.uncompressed_file_offset + iter.block.uncompressed_size; if (!w || w->start >= uend) { @@ -565,7 +567,7 @@ static int tar_ok(struct archive *ar, void *ref) { } static bool tar_next_block(void) { - if (gArItem && !gArNextItem && gArWanted) { + if (gArItem && !gArNextItem && gArWanted && gExplicitFiles) { io_block_t *ib = (io_block_t*)(gArItem->data); if (gArWanted->start < ib->uoffset + ib->outsize) return true; // No need @@ -598,7 +600,7 @@ static ssize_t tar_read(struct archive *ar, void *ref, const void **bufp) { off_t off; size_t size; io_block_t *ib = (io_block_t*)(gArItem->data); - if (gWantedFiles) { + if (gWantedFiles && gExplicitFiles) { debug("tar want: %s", gArWanted->name); off = gArWanted->start - ib->uoffset; size = gArWanted->size; From e20c330366cf7e9466296371b25f844af1c42b3f Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Mon, 5 Nov 2012 01:14:59 -0500 Subject: [PATCH 19/22] Makefile cleanup --- Makefile | 19 ++++++++++--------- pixz.h | 4 +++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 4901162..c09e45a 100644 --- a/Makefile +++ b/Makefile @@ -2,22 +2,23 @@ ifneq ($(shell gcc -v 2>&1 | grep 'Apple Inc'),) APPLE=1 endif -ifdef APPLE -ifeq ($(CC),gcc) - LDFLAGS += -search_paths_first -endif -endif OPT = -g -O0 -CFLAGS = $(patsubst %,-I%/include,$(LIBPREFIX)) $(OPT) -std=gnu99 \ +MYCFLAGS = $(patsubst %,-I%/include,$(LIBPREFIX)) $(OPT) -std=gnu99 \ -Wall -Wno-unknown-pragmas -LDFLAGS = $(patsubst %,-L%/lib,$(LIBPREFIX)) $(OPT) -Wall +MYLDFLAGS = $(patsubst %,-L%/lib,$(LIBPREFIX)) $(OPT) -Wall THREADS = -lpthread LIBADD = $(THREADS) -llzma -larchive CC = gcc -COMPILE = $(CC) $(CFLAGS) -c -o -LD = $(CC) $(LDFLAGS) -o +COMPILE = $(CC) $(MYCFLAGS) $(CFLAGS) -c -o +LD = $(CC) $(MYLDFLAGS) $(LDFLAGS) -o + +ifdef APPLE +ifeq ($(CC),gcc) + MYLDFLAGS += -search_paths_first +endif +endif PROGS = pixz COMMON = common.o endian.o cpu.o read.o write.o list.o diff --git a/pixz.h b/pixz.h index b7c7d61..df9a398 100644 --- a/pixz.h +++ b/pixz.h @@ -20,7 +20,9 @@ #define CHUNKSIZE 4096 -#define DEBUG 0 +#ifndef DEBUG + #define DEBUG 0 +#endif #if DEBUG #define debug(str, ...) fprintf(stderr, str "\n", ##__VA_ARGS__) #else From 09c60316cf1b7b0aaa7d5f2aee411ad1d8768dbd Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Mon, 5 Nov 2012 02:39:20 -0500 Subject: [PATCH 20/22] Use heuristic to omit file index --- read.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/read.c b/read.c index a30e41c..b9470b1 100644 --- a/read.c +++ b/read.c @@ -23,12 +23,16 @@ static void wanted_free(wanted_t *w); #pragma mark DECLARE PIPELINE +typedef enum { BLOCK_SIZED, BLOCK_UNSIZED, BLOCK_CONTINUATION } block_type; + typedef struct { uint8_t *input, *output; size_t incap, outcap; size_t insize, outsize; off_t uoffset; // uncompressed offset lzma_check check; + + block_type btype; } io_block_t; static void *block_create(void); @@ -83,6 +87,9 @@ static void read_footer(void); static lzma_vli gFileIndexOffset = 0; +static bool taste_tar(io_block_t *ib); +static bool taste_file_index(io_block_t *ib); + #pragma mark MAIN @@ -99,6 +106,7 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { debug("want: %s", w->name); #endif + bool first = true; pipeline_create(block_create, block_free, gIndex ? read_thread : read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { @@ -144,15 +152,36 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { wlast = w; w = w->next; } + archive_read_finish(ar); if (w && w->name) die("File %s missing in archive", w->name); tar_write_last(); // write whatever's left + first = false; } if (!gExplicitFiles) { - pipeline_item_t *pi; + bool tar = false; + bool all_sized = true; + bool skipping = false; + + pipeline_item_t *pi; while ((pi = pipeline_merged())) { io_block_t *ib = (io_block_t*)(pi->data); - fwrite(ib->output, ib->outsize, 1, gOutFile); + if (first) { + tar = taste_tar(ib); + first = false; + } + if (skipping && ib->btype != BLOCK_CONTINUATION) { + die("File index heuristic failed, retry with -t flag"); + skipping = false; + } + if (verify && !skipping && !first && tar && all_sized + && ib->btype == BLOCK_UNSIZED && taste_file_index(ib)) + skipping = true; + if (ib->btype != BLOCK_SIZED) + all_sized = false; + + if (!skipping) + fwrite(ib->output, ib->outsize, 1, gOutFile); queue_push(gPipelineStartQ, PIPELINE_ITEM, pi); } } @@ -357,6 +386,7 @@ static bool read_block(bool force_stream, lzma_check check) { block_capacity(gRbuf, 0, outsize); gRbuf->outsize = outsize; gRbuf->check = check; + gRbuf->btype = BLOCK_SIZED; if (rbuf_read(lzma_block_total_size(&block)) != RBUF_FULL) die("Error reading block contents"); @@ -372,6 +402,7 @@ static void read_streaming(lzma_block *block) { rbuf_cycle(&stream, true, block->header_size); stream.avail_out = 0; + bool first = true; pipeline_item_t *pi = NULL; io_block_t *ib = NULL; @@ -384,9 +415,11 @@ static void read_streaming(lzma_block *block) { if (ib) { ib->outsize = ib->outcap; pipeline_dispatch(pi, gPipelineMergeQ); + first = false; } queue_pop(gPipelineStartQ, (void**)&pi); ib = (io_block_t*)pi->data; + ib->btype = (first ? BLOCK_UNSIZED : BLOCK_CONTINUATION); block_capacity(ib, 0, STREAMSIZE); stream.next_out = ib->output; stream.avail_out = ib->outcap; @@ -512,7 +545,8 @@ static void read_thread(void) { offset += bsize; ib->uoffset = iter.block.uncompressed_file_offset; ib->check = iter.stream.flags->check; - + ib->btype = BLOCK_SIZED; + pipeline_split(pi); } } @@ -626,3 +660,21 @@ static ssize_t tar_read(struct archive *ar, void *ref, const void **bufp) { *bufp = ib->output + off; return size; } + + +#pragma mark UTILS + +static bool taste_tar(io_block_t *ib) { + struct archive *ar = archive_read_new(); + archive_read_support_compression_none(ar); + archive_read_support_format_tar(ar); + archive_read_open_memory(ar, ib->output, ib->outsize); + struct archive_entry *entry; + bool ok = (archive_read_next_header(ar, &entry) == ARCHIVE_OK); + archive_read_finish(ar); + return ok; +} + +static bool taste_file_index(io_block_t *ib) { + return xle64dec(ib->output) == PIXZ_INDEX_MAGIC; +} From 14f5644bd8c1846e268329c3c59d53d3568950e1 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Mon, 5 Nov 2012 02:45:03 -0500 Subject: [PATCH 21/22] Fixup docs --- TODO | 8 +++++++- read.c | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index 64d4d3b..40ca5b1 100644 --- a/TODO +++ b/TODO @@ -15,7 +15,13 @@ BUGS * performance lags under IO? * slow input -> CPUs idle while waiting for input * safe extraction - * abort if block size exceeded + * sanity checks, from spec: + - CRCs are already tested, i think? + - backward size should match file + - reserved flags must be zero + - header vs footer flags + - uncompressed size field vs actual uncompressed size + - index vs actual blocks EFFICIENCY * more efficient indexing: ranges? sorted? mtree? diff --git a/read.c b/read.c index b9470b1..1085e71 100644 --- a/read.c +++ b/read.c @@ -110,7 +110,6 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { pipeline_create(block_create, block_free, gIndex ? read_thread : read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { - // FIXME: don't stop on End Of Archive gArWanted = gWantedFiles; wanted_t *w = gWantedFiles, *wlast = NULL; bool lastmulti = false; @@ -439,7 +438,6 @@ static void read_streaming(lzma_block *block) { } static void read_index(void) { - // FIXME: verify it matches the blocks? lzma_stream stream = LZMA_STREAM_INIT; lzma_index *index; if (lzma_index_decoder(&stream, &index, MEMLIMIT) != LZMA_OK) @@ -459,7 +457,6 @@ static void read_index(void) { } static void read_footer(void) { - // FIXME: compare with header? lzma_stream_flags stream_flags; if (rbuf_read(LZMA_STREAM_HEADER_SIZE) != RBUF_FULL) die("Error reading stream footer"); @@ -489,7 +486,6 @@ static void read_thread_noindex(void) { ; // pass read_index(); read_footer(); - // FIXME: don't output the pixz file index! heuristic? } if (empty) die("Empty input"); From 267e538c5eedd99a436a542d45d3a9e0754298c8 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sat, 10 Nov 2012 00:00:35 -0500 Subject: [PATCH 22/22] More file-index skipping fixes Don't detect file-index on multi-stream files, that usage is not supported. Handle multi-stream files correctly when using heuristic skipping. --- common.c | 3 +++ read.c | 26 ++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/common.c b/common.c index 8077541..0740d30 100644 --- a/common.c +++ b/common.c @@ -120,6 +120,9 @@ static lzma_vli find_file_index(void **bdatap) { lzma_vli loc = lzma_index_uncompressed_size(gIndex) - 1; if (lzma_index_iter_locate(&iter, loc)) die("Can't locate file index block"); + if (iter.stream.number != 1) + return 0; // Too many streams for one file index + void *bdata = decode_file_index_start(iter.block.compressed_file_offset, iter.stream.flags->check); diff --git a/read.c b/read.c index 1085e71..a496148 100644 --- a/read.c +++ b/read.c @@ -106,7 +106,6 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { debug("want: %s", w->name); #endif - bool first = true; pipeline_create(block_create, block_free, gIndex ? read_thread : read_thread_noindex, decode_thread); if (verify && gFileIndexOffset) { @@ -155,27 +154,30 @@ void pixz_read(bool verify, size_t nspecs, char **specs) { if (w && w->name) die("File %s missing in archive", w->name); tar_write_last(); // write whatever's left - first = false; } if (!gExplicitFiles) { - bool tar = false; - bool all_sized = true; - bool skipping = false; + /* Heuristics for detecting pixz file index: + * - Input must be streaming (otherwise read_thread does this) + * - Data must look tar-like + * - Must have all sized blocks, followed by unsized file index */ + bool start = !gIndex && verify, + tar = false, all_sized = true, skipping = false; pipeline_item_t *pi; while ((pi = pipeline_merged())) { io_block_t *ib = (io_block_t*)(pi->data); - if (first) { - tar = taste_tar(ib); - first = false; - } if (skipping && ib->btype != BLOCK_CONTINUATION) { - die("File index heuristic failed, retry with -t flag"); + fprintf(stderr, + "Warning: File index heuristic failed, use -t flag.\n"); skipping = false; } - if (verify && !skipping && !first && tar && all_sized + if (!skipping && tar && !start && all_sized && ib->btype == BLOCK_UNSIZED && taste_file_index(ib)) skipping = true; + if (start) { + tar = taste_tar(ib); + start = false; + } if (ib->btype != BLOCK_SIZED) all_sized = false; @@ -541,7 +543,7 @@ static void read_thread(void) { offset += bsize; ib->uoffset = iter.block.uncompressed_file_offset; ib->check = iter.stream.flags->check; - ib->btype = BLOCK_SIZED; + ib->btype = BLOCK_SIZED; // Indexed blocks always sized pipeline_split(pi); }