|
|
|
@ -10,21 +10,58 @@ use write::IndexFileWriter;
|
|
|
|
|
|
|
|
|
|
/// A `IndexFileReader` does a single linear pass over an index file from
|
|
|
|
|
/// beginning to end. Needless to say, this is not how an index is normally
|
|
|
|
|
/// used! This is only used when merging multiple index files.
|
|
|
|
|
/// used! This is used only when merging multiple index files.
|
|
|
|
|
///
|
|
|
|
|
/// The only way to advance through the file is to use the `.move_entry_to()`
|
|
|
|
|
/// method.
|
|
|
|
|
pub struct IndexFileReader {
|
|
|
|
|
/// Reader that reads the actual index data.
|
|
|
|
|
///
|
|
|
|
|
/// We have two readers. The index data is most of the file. There's also a
|
|
|
|
|
/// table of contents, stored separately at the end. We have to read them
|
|
|
|
|
/// in tandem, so we open the file twice.
|
|
|
|
|
main: BufReader<File>,
|
|
|
|
|
|
|
|
|
|
/// Reader that reads the table of contents. (Since this table is stored at
|
|
|
|
|
/// the end of the file, we have to begin by `seek`ing to it; see the code
|
|
|
|
|
/// in `IndexFileReader::open_and_delete`.)
|
|
|
|
|
contents: BufReader<File>,
|
|
|
|
|
|
|
|
|
|
/// The next entry in the table of contents, if any; or `None` if we've
|
|
|
|
|
/// reached the end of the table. `IndexFileReader` always reads ahead one
|
|
|
|
|
/// entry in the contents and stores it here.
|
|
|
|
|
next: Option<Entry>
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// An entry in the table of contents of an index file.
///
/// Each entry in the table of contents is small. It consists of a string, the
/// `term`; summary information about that term, as used in the corpus (`df`);
/// and a pointer to bulkier data that tells more (`offset` and `nbytes`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry {
    /// The term is a word that appears in one or more documents in the corpus.
    /// The index file contains information about the documents that use this
    /// word.
    pub term: String,

    /// Total number of documents in the corpus that contain this term.
    pub df: u32,

    /// Offset of the index data for this term from the beginning of the file,
    /// in bytes.
    pub offset: u64,

    /// Length of the index data for this term, in bytes.
    pub nbytes: u64,
}
|
|
|
|
|
|
|
|
|
|
impl IndexFileReader {
|
|
|
|
|
/// Open an index file to read it from beginning to end.
|
|
|
|
|
///
|
|
|
|
|
/// This deletes the file, which may not work properly on Windows. Patches
|
|
|
|
|
/// welcome! On Unix, it works like this: the file immediately disappears
|
|
|
|
|
/// from its directory, but it'll still take up space on disk until the
|
|
|
|
|
/// file is closed, which normally happens when the `IndexFileReader` is
|
|
|
|
|
/// dropped.
|
|
|
|
|
pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
|
|
|
|
|
let filename = filename.as_ref();
|
|
|
|
|
let mut main_raw = File::open(filename)?;
|
|
|
|
@ -53,6 +90,9 @@ impl IndexFileReader {
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Read the next entry from the table of contents.
|
|
|
|
|
///
|
|
|
|
|
/// Returns `Ok(None)` if we have reached the end of the file.
|
|
|
|
|
fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> {
|
|
|
|
|
// If the first read here fails with `UnexpectedEof`,
|
|
|
|
|
// that's considered a success, with no entry read.
|
|
|
|
@ -85,8 +125,13 @@ impl IndexFileReader {
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Borrow a reference to the next entry in the table of contents.
|
|
|
|
|
/// (Since we always read ahead one entry, this method can't fail.)
|
|
|
|
|
///
|
|
|
|
|
/// Returns `None` if we've reached the end of the file.
|
|
|
|
|
pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }
|
|
|
|
|
|
|
|
|
|
/// True if the next entry is for the given term.
|
|
|
|
|
pub fn is_at(&self, term: &str) -> bool {
|
|
|
|
|
match self.next {
|
|
|
|
|
Some(ref e) => e.term == term,
|
|
|
|
@ -94,8 +139,8 @@ impl IndexFileReader {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Copy the current entry to the specified output stream,
|
|
|
|
|
/// then read the header for the next entry.
|
|
|
|
|
/// Copy the current entry to the specified output stream, then read the
|
|
|
|
|
/// header for the next entry.
|
|
|
|
|
pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> {
|
|
|
|
|
// This block limits the scope of borrowing `self.next` (for `e`),
|
|
|
|
|
// because after this block is over we'll want to assign to `self.next`.
|
|
|
|
|