diff --git a/src/read.rs b/src/read.rs index 50ece07..30f50df 100644 --- a/src/read.rs +++ b/src/read.rs @@ -10,21 +10,58 @@ use write::IndexFileWriter; /// A `IndexFileReader` does a single linear pass over an index file from /// beginning to end. Needless to say, this is not how an index is normally -/// used! This is only used when merging multiple index files. +/// used! This is used only when merging multiple index files. +/// +/// The only way to advance through the file is to use the `.move_entry_to()` +/// method. pub struct IndexFileReader { + /// Reader that reads the actual index data. + /// + /// We have two readers. The index data is most of the file. There's also a + /// table of contents, stored separately at the end. We have to read them + /// in tandem, so we open the file twice. main: BufReader, + + /// Reader that reads the table of contents. (Since this table is stored at + /// the end of the file, we have to begin by `seek`ing to it; see the code + /// in `IndexFileReader::open_and_delete`.) contents: BufReader, + + /// The next entry in the table of contents, if any; or `None` if we've + /// reached the end of the table. `IndexFileReader` always reads ahead one + /// entry in the contents and stores it here. next: Option } +/// An entry in the table of contents of an index file. +/// +/// Each entry in the table of contents is small. It consists of a string, the +/// `term`; summary information about that term, as used in the corpus (`df`); +/// and a pointer to bulkier data that tells more (`offset` and `nbytes`). pub struct Entry { + /// The term is a word that appears in one or more documents in the corpus. + /// The index file contains information about the documents that use this + /// word. pub term: String, + + /// Total number of documents in the corpus that contain this term. pub df: u32, + + /// Offset of the index data for this term from the beginning of the file, in bytes. pub offset: u64, + + /// Length of the index data for this term, in bytes. pub nbytes: u64 } impl IndexFileReader { + /// Open an index file to read it from beginning to end. + /// + /// This deletes the file, which may not work properly on Windows. Patches + /// welcome! On Unix, it works like this: the file immediately disappears + /// from its directory, but it'll still take up space on disk until the + /// file is closed, which normally happens when the `IndexFileReader` is + /// dropped. pub fn open_and_delete>(filename: P) -> io::Result { let filename = filename.as_ref(); let mut main_raw = File::open(filename)?; @@ -53,6 +90,9 @@ impl IndexFileReader { }) } + /// Read the next entry from the table of contents. + /// + /// Returns `Ok(None)` if we have reached the end of the file. fn read_entry(f: &mut BufReader) -> io::Result> { // If the first read here fails with `UnexpectedEof`, // that's considered a success, with no entry read. @@ -85,8 +125,13 @@ impl IndexFileReader { })) } + /// Borrow a reference to the next entry in the table of contents. + /// (Since we always read ahead one entry, this method can't fail.) + /// + /// Returns `None` if we've reached the end of the file. pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() } + /// True if the next entry is for the given term. pub fn is_at(&self, term: &str) -> bool { match self.next { Some(ref e) => e.term == term, @@ -94,8 +139,8 @@ impl IndexFileReader { } } - /// Copy the current entry to the specified output stream, - /// then read the header for the next entry. + /// Copy the current entry to the specified output stream, then read the + /// header for the next entry. pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> { // This block limits the scope of borrowing `self.next` (for `e`), // because after this block is over we'll want to assign to `self.next`.