|
|
|
@ -1,12 +1,12 @@
|
|
|
|
|
//! Reading index files linearly from disk, a capability needed for merging
|
|
|
|
|
//! index files.
|
|
|
|
|
|
|
|
|
|
use crate::write::IndexFileWriter;
|
|
|
|
|
use byteorder::{LittleEndian, ReadBytesExt};
|
|
|
|
|
use std::fs::{self, File};
|
|
|
|
|
use std::io::prelude::*;
|
|
|
|
|
use std::io::{self, BufReader, SeekFrom};
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
use byteorder::{LittleEndian, ReadBytesExt};
|
|
|
|
|
use crate::write::IndexFileWriter;
|
|
|
|
|
|
|
|
|
|
/// An `IndexFileReader` does a single linear pass over an index file from
|
|
|
|
|
/// beginning to end. Needless to say, this is not how an index is normally
|
|
|
|
@ -30,7 +30,7 @@ pub struct IndexFileReader {
|
|
|
|
|
/// The next entry in the table of contents, if any; or `None` if we've
|
|
|
|
|
/// reached the end of the table. `IndexFileReader` always reads ahead one
|
|
|
|
|
/// entry in the contents and stores it here.
|
|
|
|
|
next: Option<Entry>
|
|
|
|
|
next: Option<Entry>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// An entry in the table of contents of an index file.
|
|
|
|
@ -51,7 +51,7 @@ pub struct Entry {
|
|
|
|
|
pub offset: u64,
|
|
|
|
|
|
|
|
|
|
/// Length of the index data for this term, in bytes.
|
|
|
|
|
pub nbytes: u64
|
|
|
|
|
pub nbytes: u64,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl IndexFileReader {
|
|
|
|
@ -62,31 +62,35 @@ impl IndexFileReader {
|
|
|
|
|
/// from its directory, but it'll still take up space on disk until the
|
|
|
|
|
/// file is closed, which normally happens when the `IndexFileReader` is
|
|
|
|
|
/// dropped.
|
|
|
|
|
pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
|
|
|
|
|
pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<Self> {
|
|
|
|
|
let filename = filename.as_ref();
|
|
|
|
|
let mut main_raw = File::open(filename)?;
|
|
|
|
|
|
|
|
|
|
// Read the file header.
|
|
|
|
|
let contents_offset = main_raw.read_u64::<LittleEndian>()?;
|
|
|
|
|
println!("opened {}, table of contents starts at {}", filename.display(), contents_offset);
|
|
|
|
|
println!(
|
|
|
|
|
"opened {}, table of contents starts at {}",
|
|
|
|
|
filename.display(),
|
|
|
|
|
contents_offset
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Open again so we have two read heads;
|
|
|
|
|
// move the contents read head to its starting position.
|
|
|
|
|
// Set up buffering.
|
|
|
|
|
let mut contents_raw = File::open(filename)?;
|
|
|
|
|
contents_raw.seek(SeekFrom::Start(contents_offset))?;
|
|
|
|
|
let _start = contents_raw.seek(SeekFrom::Start(contents_offset))?;
|
|
|
|
|
let main = BufReader::new(main_raw);
|
|
|
|
|
let mut contents = BufReader::new(contents_raw);
|
|
|
|
|
|
|
|
|
|
// We always read ahead one entry, so load the first entry right away.
|
|
|
|
|
let first = IndexFileReader::read_entry(&mut contents)?;
|
|
|
|
|
let first = Self::read_entry(&mut contents)?;
|
|
|
|
|
|
|
|
|
|
fs::remove_file(filename)?; // YOLO
|
|
|
|
|
fs::remove_file(filename)?; // YOLO
|
|
|
|
|
|
|
|
|
|
Ok(IndexFileReader {
|
|
|
|
|
main: main,
|
|
|
|
|
contents: contents,
|
|
|
|
|
next: first
|
|
|
|
|
Ok(Self {
|
|
|
|
|
main,
|
|
|
|
|
contents,
|
|
|
|
|
next: first,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -98,30 +102,30 @@ impl IndexFileReader {
|
|
|
|
|
// that's considered a success, with no entry read.
|
|
|
|
|
let offset = match f.read_u64::<LittleEndian>() {
|
|
|
|
|
Ok(value) => value,
|
|
|
|
|
Err(err) =>
|
|
|
|
|
Err(err) => {
|
|
|
|
|
if err.kind() == io::ErrorKind::UnexpectedEof {
|
|
|
|
|
return Ok(None)
|
|
|
|
|
return Ok(None);
|
|
|
|
|
} else {
|
|
|
|
|
return Err(err)
|
|
|
|
|
return Err(err);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let nbytes = f.read_u64::<LittleEndian>()?;
|
|
|
|
|
let df = f.read_u32::<LittleEndian>()?;
|
|
|
|
|
let term_len = f.read_u32::<LittleEndian>()? as usize;
|
|
|
|
|
let mut bytes = Vec::with_capacity(term_len);
|
|
|
|
|
bytes.resize(term_len, 0);
|
|
|
|
|
let mut bytes = vec![0; term_len];
|
|
|
|
|
f.read_exact(&mut bytes)?;
|
|
|
|
|
let term = match String::from_utf8(bytes) {
|
|
|
|
|
Ok(s) => s,
|
|
|
|
|
Err(_) => return Err(io::Error::new(io::ErrorKind::Other, "unicode fail"))
|
|
|
|
|
Err(_) => return Err(io::Error::new(io::ErrorKind::Other, "unicode fail")),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Ok(Some(Entry {
|
|
|
|
|
term: term,
|
|
|
|
|
df: df,
|
|
|
|
|
offset: offset,
|
|
|
|
|
nbytes: nbytes
|
|
|
|
|
term,
|
|
|
|
|
df,
|
|
|
|
|
offset,
|
|
|
|
|
nbytes,
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -129,13 +133,15 @@ impl IndexFileReader {
|
|
|
|
|
/// (Since we always read ahead one entry, this method can't fail.)
|
|
|
|
|
///
|
|
|
|
|
/// Returns `None` if we've reached the end of the file.
|
|
|
|
|
pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }
|
|
|
|
|
pub fn peek(&self) -> Option<&Entry> {
|
|
|
|
|
self.next.as_ref()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// True if the next entry is for the given term.
|
|
|
|
|
pub fn is_at(&self, term: &str) -> bool {
|
|
|
|
|
match self.next {
|
|
|
|
|
Some(ref e) => e.term == term,
|
|
|
|
|
None => false
|
|
|
|
|
None => false,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -148,11 +154,12 @@ impl IndexFileReader {
|
|
|
|
|
let e = self.next.as_ref().expect("no entry to move");
|
|
|
|
|
if e.nbytes > usize::max_value() as u64 {
|
|
|
|
|
// This can only happen on 32-bit platforms.
|
|
|
|
|
return Err(io::Error::new(io::ErrorKind::Other,
|
|
|
|
|
"computer not big enough to hold index entry"));
|
|
|
|
|
return Err(io::Error::new(
|
|
|
|
|
io::ErrorKind::Other,
|
|
|
|
|
"computer not big enough to hold index entry",
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
let mut buf = Vec::with_capacity(e.nbytes as usize);
|
|
|
|
|
buf.resize(e.nbytes as usize, 0);
|
|
|
|
|
let mut buf = vec![0; e.nbytes as usize];
|
|
|
|
|
self.main.read_exact(&mut buf)?;
|
|
|
|
|
out.write_main(&buf)?;
|
|
|
|
|
}
|
|
|
|
|