/** * Copyright (c) 2007-2012, Timothy Stack * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Timothy Stack nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @file line_buffer.cc */ #include "config.h" #include #include #include #include #include #ifdef HAVE_BZLIB_H #include #endif #include #include "lnav_util.hh" #include "line_buffer.hh" using namespace std; static const size_t DEFAULT_LINE_BUFFER_SIZE = 256 * 1024; static const size_t MAX_LINE_BUFFER_SIZE = 4 * DEFAULT_LINE_BUFFER_SIZE; static const size_t DEFAULT_INCREMENT = 128 * 1024; static const size_t MAX_COMPRESSED_BUFFER_SIZE = 32 * 1024 * 1024; /* * XXX REMOVE ME * * The stock gzipped file code does not use pread, so we need to use a lock to * get exclusive access to the file. In the future, we should just rewrite * the gzipped file code to use pread. */ class lock_hack { public: class guard { public: guard() : g_lock(lock_hack::singleton()) { this->g_lock.lock(); }; ~guard() { this->g_lock.unlock(); }; private: lock_hack &g_lock; }; static lock_hack &singleton() { static lock_hack retval; return retval; }; void lock() { lockf(this->lh_fd, F_LOCK, 0); }; void unlock() { lockf(this->lh_fd, F_ULOCK, 0); }; private: lock_hack() { char lockname[64]; snprintf(lockname, sizeof(lockname), "/tmp/lnav.%d.lck", getpid()); this->lh_fd = open(lockname, O_CREAT | O_RDWR, 0600); fcntl(this->lh_fd, F_SETFD, FD_CLOEXEC); unlink(lockname); }; auto_fd lh_fd; }; /* XXX END */ line_buffer::line_buffer() : lb_gz_file(NULL), lb_bz_file(false), lb_gz_offset(0), lb_file_size((size_t)-1), lb_file_offset(0), lb_file_time(0), lb_buffer_size(0), lb_buffer_max(DEFAULT_LINE_BUFFER_SIZE), lb_seekable(false), lb_last_line_offset(-1) { if ((this->lb_buffer = (char *)malloc(this->lb_buffer_max)) == NULL) { throw bad_alloc(); } ensure(this->invariant()); } line_buffer::~line_buffer() { auto_fd fd = -1; // Make sure any shared refs take ownership of the data. this->lb_share_manager.invalidate_refs(); this->set_fd(fd); } void line_buffer::set_fd(auto_fd &fd) throw (error) { off_t newoff = 0; if (this->lb_gz_file) { gzclose(this->lb_gz_file); this->lb_gz_file = NULL; } if (this->lb_bz_file) { this->lb_bz_file = false; } if (fd != -1) { /* Sync the fd's offset with the object. */ newoff = lseek(fd, 0, SEEK_CUR); if (newoff == -1) { if (errno != ESPIPE) { throw error(errno); } /* It's a pipe, start with a zero offset. */ newoff = 0; this->lb_seekable = false; } else { char gz_id[2 + 1 + 1 + 4]; if (pread(fd, gz_id, sizeof(gz_id), 0) == sizeof(gz_id)) { if (gz_id[0] == '\037' && gz_id[1] == '\213') { int gzfd = dup(fd); fcntl(gzfd, F_SETFD, FD_CLOEXEC); lseek(fd, 0, SEEK_SET); if ((this->lb_gz_file = gzdopen(gzfd, "r")) == NULL) { if (errno == 0) { throw bad_alloc(); } else{ throw error(errno); } } this->lb_file_time = read_le32( (const unsigned char *)&gz_id[4]); if (this->lb_file_time < 0) { this->lb_file_time = 0; } this->lb_gz_offset = lseek(this->lb_fd, 0, SEEK_CUR); } #ifdef HAVE_BZLIB_H else if (gz_id[0] == 'B' && gz_id[1] == 'Z') { lseek(fd, 0, SEEK_SET); this->lb_bz_file = true; /* * Loading data from a bzip2 file is pretty slow, so we try * to keep as much in memory as possible. */ this->resize_buffer(MAX_COMPRESSED_BUFFER_SIZE); } #endif } this->lb_seekable = true; } } this->lb_file_offset = newoff; this->lb_buffer_size = 0; this->lb_fd = fd; ensure(this->invariant()); } void line_buffer::resize_buffer(size_t new_max) throw (error) { char *tmp, *old; require(this->lb_bz_file || this->lb_gz_file || new_max <= MAX_LINE_BUFFER_SIZE); /* Still need more space, try a realloc. */ old = this->lb_buffer.release(); tmp = (char *)realloc(old, new_max); if (tmp != NULL) { this->lb_buffer = tmp; this->lb_buffer_max = new_max; } else { this->lb_buffer = old; throw error(ENOMEM); } } void line_buffer::ensure_available(off_t start, size_t max_length) throw (error) { size_t prefill, available; require(max_length <= MAX_LINE_BUFFER_SIZE); /* * Check to see if the start is inside the cached range or immediately * after. */ if (start < this->lb_file_offset || start > (off_t)(this->lb_file_offset + this->lb_buffer_size)) { /* * The request is outside the cached range, need to reload the * whole thing. */ prefill = 0; this->lb_buffer_size = 0; if ((this->lb_file_size != (size_t)-1) && (start + this->lb_buffer_max > this->lb_file_size)) { /* * If the start is near the end of the file, move the offset back a * bit so we can get more of the file in the cache. */ this->lb_file_offset = this->lb_file_size - std::min(this->lb_file_size, this->lb_buffer_max); } else { this->lb_file_offset = start; } } else { /* The request is in the cached range. Record how much extra data is in * the buffer before the requested range. */ prefill = start - this->lb_file_offset; } require(this->lb_file_offset <= start); require(prefill <= this->lb_buffer_size); available = this->lb_buffer_max - (start - this->lb_file_offset); require(available <= this->lb_buffer_max); if (max_length > available) { /* * Need more space, move any existing data to the front of the * buffer. */ this->lb_buffer_size -= prefill; this->lb_file_offset += prefill; memmove(&this->lb_buffer[0], &this->lb_buffer[prefill], this->lb_buffer_size); available = this->lb_buffer_max - (start - this->lb_file_offset); if (max_length > available) { this->resize_buffer(this->lb_buffer_max + DEFAULT_LINE_BUFFER_SIZE); } this->lb_share_manager.invalidate_refs(); } } bool line_buffer::fill_range(off_t start, size_t max_length) throw (error) { bool retval = false; require(start >= 0); if (this->in_range(start) && this->in_range(start + max_length - 1)) { /* Cache already has the data, nothing to do. */ retval = true; } else if (this->lb_fd != -1) { ssize_t rc; /* Make sure there is enough space, then */ this->ensure_available(start, max_length); /* ... read in the new data. */ if (this->lb_gz_file) { if (this->lb_file_size != (size_t)-1 && this->in_range(start) && this->in_range(this->lb_file_size - 1)) { rc = 0; } else { lock_hack::guard guard; lseek(this->lb_fd, this->lb_gz_offset, SEEK_SET); gzseek(this->lb_gz_file, this->lb_file_offset + this->lb_buffer_size, SEEK_SET); rc = gzread(this->lb_gz_file, &this->lb_buffer[this->lb_buffer_size], this->lb_buffer_max - this->lb_buffer_size); this->lb_gz_offset = lseek(this->lb_fd, 0, SEEK_CUR); } } #ifdef HAVE_BZLIB_H else if (this->lb_bz_file) { if (this->lb_file_size != (size_t)-1 && (((size_t)start >= this->lb_file_size) || (this->in_range(start) && this->in_range(this->lb_file_size - 1)))) { rc = 0; } else { lock_hack::guard guard; char scratch[32 * 1024]; BZFILE * bz_file; off_t seek_to; /* * Unfortunately, there is no bzseek, so we need to reopen the * file every time we want to do a read. */ lseek(this->lb_fd, 0, SEEK_SET); if ((bz_file = BZ2_bzdopen(dup(this->lb_fd), "r")) == NULL) { if (errno == 0) { throw bad_alloc(); } else{ throw error(errno); } } seek_to = this->lb_file_offset + this->lb_buffer_size; while (seek_to > 0) { int count; count = BZ2_bzread(bz_file, scratch, std::min((size_t)seek_to, sizeof(scratch))); seek_to -= count; } rc = BZ2_bzread(bz_file, &this->lb_buffer[this->lb_buffer_size], this->lb_buffer_max - this->lb_buffer_size); BZ2_bzclose(bz_file); if (rc != -1 && ( rc < (this->lb_buffer_max - this->lb_buffer_size))) { this->lb_file_size = ( this->lb_file_offset + this->lb_buffer_size + rc); } } } #endif else if (this->lb_seekable) { rc = pread(this->lb_fd, &this->lb_buffer[this->lb_buffer_size], this->lb_buffer_max - this->lb_buffer_size, this->lb_file_offset + this->lb_buffer_size); } else { rc = read(this->lb_fd, &this->lb_buffer[this->lb_buffer_size], this->lb_buffer_max - this->lb_buffer_size); } // XXX For some reason, cygwin is giving us a bogus return value when // up to the end of the file. if (rc > (this->lb_buffer_max - this->lb_buffer_size)) { rc = -1; #ifdef ENODATA errno = ENODATA; #else errno = EAGAIN; #endif } switch (rc) { case 0: if (!this->lb_seekable) { this->lb_file_size = this->lb_file_offset + this->lb_buffer_size; } if (start < (off_t) this->lb_file_size) { retval = true; } if (this->lb_gz_file || this->lb_bz_file) { /* * For compressed files, increase the buffer size so we don't * have to spend as much time uncompressing the data. */ this->resize_buffer(MAX_COMPRESSED_BUFFER_SIZE); } break; case (ssize_t)-1: switch (errno) { #ifdef ENODATA /* Cygwin seems to return this when pread reaches the end of the */ /* file. */ case ENODATA: #endif case EINTR: case EAGAIN: break; default: throw error(errno); } break; default: this->lb_buffer_size += rc; retval = true; break; } ensure(this->lb_buffer_size <= this->lb_buffer_max); } return retval; } bool line_buffer::read_line(off_t &offset, line_value &lv, bool include_delim) throw (error) { size_t request_size = DEFAULT_INCREMENT; bool retval = false; require(this->lb_fd != -1); if (this->lb_last_line_offset != -1 && offset > this->lb_last_line_offset) { /* * Don't return anything past the last known line. The caller needs * to try reading at the offset of the last line again. */ return false; } lv.lv_len = 0; lv.lv_partial = false; while (!retval) { char *line_start, *lf; this->fill_range(offset, request_size); /* Find the data in the cache and */ line_start = this->get_range(offset, lv.lv_len); /* ... look for the end-of-line or end-of-file. */ if (((lf = (char *)memchr(line_start, '\n', lv.lv_len)) != NULL) || (lv.lv_len >= MAX_LINE_BUFFER_SIZE) || (request_size == MAX_LINE_BUFFER_SIZE) || ((request_size > lv.lv_len) && lv.lv_len > 0)) { if ((lf != NULL) && ((lf - line_start) >= MAX_LINE_BUFFER_SIZE - 1)) { lf = NULL; } if (lf != NULL) { lv.lv_partial = false; lv.lv_len = lf - line_start; if (include_delim) { lv.lv_len += 1; } else { offset += 1; /* Skip the delimiter. */ } if (offset >= this->lb_last_line_offset) { this->lb_last_line_offset = offset + lv.lv_len; } } else { if (lv.lv_len >= MAX_LINE_BUFFER_SIZE) { lv.lv_len = MAX_LINE_BUFFER_SIZE - 1; lv.lv_partial = false; } else { lv.lv_partial = true; } /* * Be nice and make sure there is room for the caller to * add a NULL-terminator. */ this->ensure_available(offset, lv.lv_len + 1); line_start = this->get_range(offset, lv.lv_len); if (lv.lv_len >= MAX_LINE_BUFFER_SIZE) { lv.lv_len = MAX_LINE_BUFFER_SIZE - 1; } if (lv.lv_partial) { /* * Since no delimiter was seen, we need to remember the offset * of the last line in the file so we don't mistakenly return * two partial lines to the caller. * * 1. read_line() - returns partial line * 2. file is written * 3. read_line() - returns the middle of partial line. */ this->lb_last_line_offset = offset; } else if (offset >= this->lb_last_line_offset) { this->lb_last_line_offset = offset + lv.lv_len; } } lv.lv_start = line_start; offset += lv.lv_len; retval = true; } else { request_size += DEFAULT_INCREMENT; } if (!retval && !this->fill_range(offset, request_size)) { break; } } ensure(lv.lv_len <= this->lb_buffer_size); ensure(!retval || (lv.lv_start >= this->lb_buffer && (lv.lv_start + lv.lv_len) <= (this->lb_buffer + this->lb_buffer_size))); ensure(this->invariant()); return retval; } bool line_buffer::read_line(off_t &offset_inout, shared_buffer_ref &sbr, line_value *lv) throw (error) { line_value lv_tmp; bool retval; if (lv == NULL) { lv = &lv_tmp; } // Clear the incoming ref right away so that an invalidate // does not cause a wasted malloc/copy. sbr.disown(); if ((retval = this->read_line(offset_inout, *lv))) { sbr.share(this->lb_share_manager, lv->lv_start, lv->lv_len); } return retval; } bool line_buffer::read_range(off_t offset, size_t len, shared_buffer_ref &sbr) throw (error) { char *line_start; size_t avail; sbr.disown(); if (this->lb_last_line_offset != -1 && offset > this->lb_last_line_offset) { /* * Don't return anything past the last known line. The caller needs * to try reading at the offset of the last line again. */ return false; } this->fill_range(offset, len); line_start = this->get_range(offset, avail); sbr.share(this->lb_share_manager, line_start, len); return true; }