diff --git a/src/index.rs b/src/index.rs index e0c6b9f..68b1b6b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -64,7 +64,7 @@ impl InMemoryIndex { let text = text.to_lowercase(); let tokens = tokenize(&text); for (i, token) in tokens.iter().enumerate() { - let mut hits = + let hits = index.map .entry(token.to_string()) .or_insert_with(|| { diff --git a/src/main.rs b/src/main.rs index 2b83054..eec7757 100644 --- a/src/main.rs +++ b/src/main.rs @@ -134,6 +134,19 @@ fn start_file_indexing_thread(texts: Receiver<String>) (receiver, handle) } +/// Start a thread that merges in-memory indexes. +/// +/// `file_indexes` receives a stream of indexes from the file indexing thread. +/// These indexes typically vary a lot in size, since the input documents will +/// typically be all different sizes. +/// +/// The thread created by this function merges those indexes into "large" +/// indexes and passes these large indexes on to a new channel. +/// +/// This returns a pair: a receiver, the sequence of large indexes produced by +/// merging the input indexes; and a `JoinHandle` that can be used to wait for +/// this thread to exit. This stage of the pipeline is infallible (it performs +/// no I/O). fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>) -> (Receiver<InMemoryIndex>, JoinHandle<()>) { @@ -158,6 +171,14 @@ fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>) (receiver, handle) } +/// Start a thread that saves large indexes to temporary files. +/// +/// This thread generates a meaningless unique filename for each index in +/// `big_indexes`, saves the data, and passes the filename on to a new channel. +/// +/// This returns a pair: a receiver that receives the filenames; and a +/// `JoinHandle` that can be used to wait for this thread to exit and receive +/// any I/O errors it encountered. 
fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>, output_dir: &Path) -> (Receiver<PathBuf>, JoinHandle<io::Result<()>>) @@ -178,6 +199,8 @@ fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>, (receiver, handle) } +/// Given a sequence of filenames of index data files, merge all the files +/// into a single index data file. fn merge_index_files(files: Receiver<PathBuf>, output_dir: &Path) -> io::Result<()> { @@ -218,6 +241,13 @@ fn run_pipeline(documents: Vec<PathBuf>, output_dir: PathBuf) result } +/// Given some paths, generate the complete list of text files to index. We check +/// on disk whether the path is the name of a file or a directory; for +/// directories, all .txt files immediately under the directory are indexed. +/// Relative paths are fine. +/// +/// It's an error if any of the `args` is not a valid path to an existing file +/// or directory. fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> { let mut filenames = vec![]; for arg in args { @@ -236,6 +266,7 @@ fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> { Ok(filenames) } +/// Generate an index for a bunch of text files. fn run(filenames: Vec<String>, single_threaded: bool) -> io::Result<()> { let output_dir = PathBuf::from("."); let documents = expand_filename_arguments(filenames)?; diff --git a/src/read.rs b/src/read.rs index 50ece07..30f50df 100644 --- a/src/read.rs +++ b/src/read.rs @@ -10,21 +10,58 @@ use write::IndexFileWriter; /// A `IndexFileReader` does a single linear pass over an index file from /// beginning to end. Needless to say, this is not how an index is normally -/// used! This is only used when merging multiple index files. +/// used! This is used only when merging multiple index files. +/// +/// The only way to advance through the file is to use the `.move_entry_to()` +/// method. pub struct IndexFileReader { + /// Reader that reads the actual index data. + /// + /// We have two readers. The index data is most of the file. There's also a + /// table of contents, stored separately at the end. 
We have to read them + /// in tandem, so we open the file twice. main: BufReader<File>, + + /// Reader that reads the table of contents. (Since this table is stored at + /// the end of the file, we have to begin by `seek`ing to it; see the code + /// in `IndexFileReader::open_and_delete`.) contents: BufReader<File>, + + /// The next entry in the table of contents, if any; or `None` if we've + /// reached the end of the table. `IndexFileReader` always reads ahead one + /// entry in the contents and stores it here. next: Option<Entry> } +/// An entry in the table of contents of an index file. +/// +/// Each entry in the table of contents is small. It consists of a string, the +/// `term`; summary information about that term, as used in the corpus (`df`); +/// and a pointer to bulkier data that tells more (`offset` and `nbytes`). pub struct Entry { + /// The term is a word that appears in one or more documents in the corpus. + /// The index file contains information about the documents that use this + /// word. pub term: String, + + /// Total number of documents in the corpus that contain this term. pub df: u32, + + /// Offset of the index data for this term from the beginning of the file, in bytes. pub offset: u64, + + /// Length of the index data for this term, in bytes. pub nbytes: u64 } impl IndexFileReader { + /// Open an index file to read it from beginning to end. + /// + /// This deletes the file, which may not work properly on Windows. Patches + /// welcome! On Unix, it works like this: the file immediately disappears + /// from its directory, but it'll still take up space on disk until the + /// file is closed, which normally happens when the `IndexFileReader` is + /// dropped. pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> { let filename = filename.as_ref(); let mut main_raw = File::open(filename)?; @@ -53,6 +90,9 @@ impl IndexFileReader { }) } + /// Read the next entry from the table of contents. + /// + /// Returns `Ok(None)` if we have reached the end of the file. 
fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> { // If the first read here fails with `UnexpectedEof`, // that's considered a success, with no entry read. @@ -85,8 +125,13 @@ })) } + /// Borrow a reference to the next entry in the table of contents. + /// (Since we always read ahead one entry, this method can't fail.) + /// + /// Returns `None` if we've reached the end of the file. pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() } + /// True if the next entry is for the given term. pub fn is_at(&self, term: &str) -> bool { match self.next { Some(ref e) => e.term == term, @@ -94,8 +139,8 @@ } } - /// Copy the current entry to the specified output stream, - /// then read the header for the next entry. + /// Copy the current entry to the specified output stream, then read the + /// header for the next entry. pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> { // This block limits the scope of borrowing `self.next` (for `e`), // because after this block is over we'll want to assign to `self.next`. diff --git a/src/write.rs b/src/write.rs index 100f388..ee30c60 100644 --- a/src/write.rs +++ b/src/write.rs @@ -6,9 +6,23 @@ use index::InMemoryIndex; use tmp::TmpDir; use byteorder::{LittleEndian, WriteBytesExt}; +/// Writer for saving an index to a binary file. +/// +/// The first 8 bytes of the index file contain the offset of the table of +/// contents, in bytes. Then come the main entries, all stored back-to-back +/// with no particular metadata. +/// + +/// An index file has two parts. The main part of the file is a sequence of +/// entries, stored back-to-back; the table of contents is stored at the end. pub struct IndexFileWriter { + /// The number of bytes written so far. offset: u64, + + /// The open file we're writing to. writer: BufWriter<File>, + + /// The table of contents for this file. 
contents_buf: Vec<u8> } @@ -38,6 +52,7 @@ impl IndexFileWriter { self.contents_buf.extend(bytes); } + /// Finish writing the index file and close it. pub fn finish(mut self) -> io::Result<()> { let contents_start = self.offset; self.writer.write_all(&self.contents_buf)?;