You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

164 lines
6.2 KiB
Rust

//! Reading index files linearly from disk, a capability needed for merging
//! index files.
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::{self, BufReader, SeekFrom};
use std::path::Path;
use byteorder::{LittleEndian, ReadBytesExt};
use crate::write::IndexFileWriter;
/// An `IndexFileReader` makes one linear pass over an index file, from the
/// first byte to the last.
///
/// This is not how an index is normally consulted; the type exists solely to
/// support merging several index files into one.
///
/// Calling `.move_entry_to()` is the only way to advance through the file.
pub struct IndexFileReader {
    /// Read head over the index data proper.
    ///
    /// The file is opened twice, giving two independent read heads: the index
    /// data makes up most of the file, while the table of contents is stored
    /// separately at the end, and the two have to be read in tandem.
    main: BufReader<File>,

    /// Read head over the table of contents. Because the table sits at the
    /// end of the file, this reader must first be `seek`ed to it; see the
    /// code in `IndexFileReader::open_and_delete`.
    contents: BufReader<File>,

    /// The upcoming table-of-contents entry, or `None` once the table has
    /// been exhausted. The reader always stays one entry ahead in the
    /// contents and parks that entry here.
    next: Option<Entry>,
}
/// An entry in the table of contents of an index file.
///
/// Each entry in the table of contents is small. It consists of a string, the
/// `term`; summary information about that term, as used in the corpus (`df`);
/// and a pointer to bulkier data that tells more (`offset` and `nbytes`).
///
/// The type is plain data, so it derives the common traits: `Debug` for
/// diagnostics, `Clone` for callers that need an owned copy of a peeked
/// entry, and `PartialEq`/`Eq` for comparisons.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry {
    /// The term is a word that appears in one or more documents in the corpus.
    /// The index file contains information about the documents that use this
    /// word.
    pub term: String,

    /// Total number of documents in the corpus that contain this term.
    pub df: u32,

    /// Offset of the index data for this term from the beginning of the file,
    /// in bytes.
    pub offset: u64,

    /// Length of the index data for this term, in bytes.
    pub nbytes: u64,
}
impl IndexFileReader {
    /// Open an index file to read it from beginning to end.
    ///
    /// The file starts with a little-endian `u64` header holding the byte
    /// offset of the table of contents. The file is opened twice: one handle
    /// continues right after the header (the index data), the other is
    /// `seek`ed to the table of contents. The first table entry is read
    /// immediately so that `peek()` and `is_at()` work right away.
    ///
    /// This deletes the file, which may not work properly on Windows. Patches
    /// welcome! On Unix, it works like this: the file immediately disappears
    /// from its directory, but it'll still take up space on disk until the
    /// file is closed, which normally happens when the `IndexFileReader` is
    /// dropped.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from opening, reading, seeking or removing the
    /// file, and any error from parsing the first table-of-contents entry.
    /// The file is only removed after all of the reads have succeeded.
    pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
        let filename = filename.as_ref();
        let mut main_raw = File::open(filename)?;

        // Read the file header.
        let contents_offset = main_raw.read_u64::<LittleEndian>()?;
        println!("opened {}, table of contents starts at {}", filename.display(), contents_offset);

        // Open again so we have two read heads;
        // move the contents read head to its starting position.
        let mut contents_raw = File::open(filename)?;
        contents_raw.seek(SeekFrom::Start(contents_offset))?;

        // Set up buffering. `main_raw` has already consumed the header, so
        // `main` starts at the first byte after it.
        let main = BufReader::new(main_raw);
        let mut contents = BufReader::new(contents_raw);

        // We always read ahead one entry, so load the first entry right away.
        let first = IndexFileReader::read_entry(&mut contents)?;

        // Both handles are open at this point, so the directory entry can go.
        fs::remove_file(filename)?;

        Ok(IndexFileReader { main, contents, next: first })
    }

    /// Read the next entry from the table of contents.
    ///
    /// An entry is laid out as: `offset` (`u64`), `nbytes` (`u64`), `df`
    /// (`u32`), the term length in bytes (`u32`), then the term itself as
    /// UTF-8. All integers are little-endian.
    ///
    /// Returns `Ok(None)` if we have reached the end of the file, i.e. there
    /// is not a single byte left to read.
    ///
    /// # Errors
    ///
    /// A file that ends in the middle of an entry (including in the middle of
    /// the leading `offset` field) is reported as `UnexpectedEof` rather than
    /// being mistaken for a clean end of the table. A term that is not valid
    /// UTF-8 yields an `ErrorKind::Other` error.
    fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> {
        // Only a completely exhausted reader counts as "no more entries".
        // `fill_buf` returns an empty slice exactly at end of file; unlike
        // `read_exact`, it does not retry on `Interrupted`, so do that here.
        let at_eof = loop {
            match f.fill_buf() {
                Ok(buffered) => break buffered.is_empty(),
                Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                Err(err) => return Err(err),
            }
        };
        if at_eof {
            return Ok(None);
        }

        // From here on, running out of bytes means the table is truncated,
        // so every failure is propagated.
        let offset = f.read_u64::<LittleEndian>()?;
        let nbytes = f.read_u64::<LittleEndian>()?;
        let df = f.read_u32::<LittleEndian>()?;
        let term_len = f.read_u32::<LittleEndian>()? as usize;

        let mut bytes = vec![0u8; term_len];
        f.read_exact(&mut bytes)?;
        let term = String::from_utf8(bytes)
            .map_err(|_| io::Error::new(io::ErrorKind::Other, "unicode fail"))?;

        Ok(Some(Entry { term, df, offset, nbytes }))
    }

    /// Borrow a reference to the next entry in the table of contents.
    /// (Since we always read ahead one entry, this method can't fail.)
    ///
    /// Returns `None` if we've reached the end of the file.
    pub fn peek(&self) -> Option<&Entry> {
        self.next.as_ref()
    }

    /// True if the next entry is for the given term.
    ///
    /// Returns `false` when there is no next entry.
    pub fn is_at(&self, term: &str) -> bool {
        self.next.as_ref().map_or(false, |e| e.term == term)
    }

    /// Copy the current entry to the specified output stream, then read the
    /// header for the next entry.
    ///
    /// The entry's `nbytes` bytes are read sequentially from the main read
    /// head and handed to `out.write_main`. The entry's `offset` field is not
    /// consulted here.
    /// NOTE(review): this presumably relies on the index data being stored in
    /// table-of-contents order; confirm against the writer.
    ///
    /// # Errors
    ///
    /// Returns an error if the entry does not fit in memory on this platform,
    /// if reading the data or writing it to `out` fails, or if the following
    /// table-of-contents entry cannot be read.
    ///
    /// # Panics
    ///
    /// Panics if there is no current entry, i.e. `peek()` returns `None`.
    pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> {
        // Copy the length out so no borrow of `self.next` is held while the
        // readers are used and `self.next` is reassigned below.
        let nbytes = self.next.as_ref().expect("no entry to move").nbytes;

        if nbytes > usize::max_value() as u64 {
            // This can only happen on 32-bit platforms.
            return Err(io::Error::new(io::ErrorKind::Other,
                                      "computer not big enough to hold index entry"));
        }

        let mut buf = vec![0u8; nbytes as usize];
        self.main.read_exact(&mut buf)?;
        out.write_main(&buf)?;

        self.next = Self::read_entry(&mut self.contents)?;
        Ok(())
    }
}