Merge branch 'master' of github.com:ProgrammingRust/fingertips

master
Jim Blandy 6 years ago
commit fecdce2521

@ -64,7 +64,7 @@ impl InMemoryIndex {
let text = text.to_lowercase();
let tokens = tokenize(&text);
for (i, token) in tokens.iter().enumerate() {
let mut hits =
let hits =
index.map
.entry(token.to_string())
.or_insert_with(|| {

@ -134,6 +134,19 @@ fn start_file_indexing_thread(texts: Receiver<String>)
(receiver, handle)
}
/// Start a thread that merges in-memory indexes.
///
/// `file_indexes` receives a stream of indexes from the file indexing thread.
/// These indexes typically vary a lot in size, since the input documents will
/// typically be all different sizes.
///
/// The thread created by this function merges those indexes into "large"
/// indexes and passes these large indexes on to a new channel.
///
/// This returns a pair: a receiver, the sequence of large indexes produced by
/// merging the input indexes; and a `JoinHandle` that can be used to wait for
/// this thread to exit. This stage of the pipeline is infallible (it performs
/// no I/O).
fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>)
-> (Receiver<InMemoryIndex>, JoinHandle<()>)
{
@ -158,6 +171,14 @@ fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>)
(receiver, handle)
}
/// Start a thread that saves large indexes to temporary files.
///
/// This thread generates a meaningless unique filename for each index in
/// `big_indexes`, saves the data, and passes the filename on to a new channel.
///
/// This returns a pair: a receiver that receives the filenames; and a
/// `JoinHandle` that can be used to wait for this thread to exit and receive
/// any I/O errors it encountered.
fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>,
output_dir: &Path)
-> (Receiver<PathBuf>, JoinHandle<io::Result<()>>)
@ -178,6 +199,8 @@ fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>,
(receiver, handle)
}
/// Given a sequence of filenames of index data files, merge all the files
/// into a single index data file.
fn merge_index_files(files: Receiver<PathBuf>, output_dir: &Path)
-> io::Result<()>
{
@ -218,6 +241,13 @@ fn run_pipeline(documents: Vec<PathBuf>, output_dir: PathBuf)
result
}
/// Given some paths, generate the complete list of text files to index. We check
/// on disk whether the path is the name of a file or a directory; for
/// directories, all .txt files immediately under the directory are indexed.
/// Relative paths are fine.
///
/// It's an error if any of the `args` is not a valid path to an existing file
/// or directory.
fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> {
let mut filenames = vec![];
for arg in args {
@ -236,6 +266,7 @@ fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> {
Ok(filenames)
}
/// Generate an index for a bunch of text files.
fn run(filenames: Vec<String>, single_threaded: bool) -> io::Result<()> {
let output_dir = PathBuf::from(".");
let documents = expand_filename_arguments(filenames)?;

@ -10,21 +10,58 @@ use write::IndexFileWriter;
/// A `IndexFileReader` does a single linear pass over an index file from
/// beginning to end. Needless to say, this is not how an index is normally
/// used! This is only used when merging multiple index files.
/// used! This is used only when merging multiple index files.
///
/// The only way to advance through the file is to use the `.move_entry_to()`
/// method.
pub struct IndexFileReader {
/// Reader that reads the actual index data.
///
/// We have two readers. The index data is most of the file. There's also a
/// table of contents, stored separately at the end. We have to read them
/// in tandem, so we open the file twice.
main: BufReader<File>,
/// Reader that reads the table of contents. (Since this table is stored at
/// the end of the file, we have to begin by `seek`ing to it; see the code
/// in `IndexFileReader::open_and_delete`.)
contents: BufReader<File>,
/// The next entry in the table of contents, if any; or `None` if we've
/// reached the end of the table. `IndexFileReader` always reads ahead one
/// entry in the contents and stores it here.
next: Option<Entry>
}
/// An entry in the table of contents of an index file.
///
/// Each entry in the table of contents is small. It consists of a string, the
/// `term`; summary information about that term, as used in the corpus (`df`);
/// and a pointer to bulkier data that tells more (`offset` and `nbytes`).
pub struct Entry {
/// The term is a word that appears in one or more documents in the corpus.
/// The index file contains information about the documents that use this
/// word.
pub term: String,
/// Total number of documents in the corpus that contain this term.
pub df: u32,
/// Offset of the index data for this term from the beginning of the file, in bytes.
pub offset: u64,
/// Length of the index data for this term, in bytes.
pub nbytes: u64
}
impl IndexFileReader {
/// Open an index file to read it from beginning to end.
///
/// This deletes the file, which may not work properly on Windows. Patches
/// welcome! On Unix, it works like this: the file immediately disappears
/// from its directory, but it'll still take up space on disk until the
/// file is closed, which normally happens when the `IndexFileReader` is
/// dropped.
pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
let filename = filename.as_ref();
let mut main_raw = File::open(filename)?;
@ -53,6 +90,9 @@ impl IndexFileReader {
})
}
/// Read the next entry from the table of contents.
///
/// Returns `Ok(None)` if we have reached the end of the file.
fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> {
// If the first read here fails with `UnexpectedEof`,
// that's considered a success, with no entry read.
@ -85,8 +125,13 @@ impl IndexFileReader {
}))
}
/// Borrow a reference to the next entry in the table of contents.
/// (Since we always read ahead one entry, this method can't fail.)
///
/// Returns `None` if we've reached the end of the file.
pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }
/// True if the next entry is for the given term.
pub fn is_at(&self, term: &str) -> bool {
match self.next {
Some(ref e) => e.term == term,
@ -94,8 +139,8 @@ impl IndexFileReader {
}
}
/// Copy the current entry to the specified output stream,
/// then read the header for the next entry.
/// Copy the current entry to the specified output stream, then read the
/// header for the next entry.
pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> {
// This block limits the scope of borrowing `self.next` (for `e`),
// because after this block is over we'll want to assign to `self.next`.

@ -6,9 +6,23 @@ use index::InMemoryIndex;
use tmp::TmpDir;
use byteorder::{LittleEndian, WriteBytesExt};
/// Writer for saving an index to a binary file.
///
/// The first 8 bytes of the index file contain the offset of the table of
/// contents, in bytes. Then come the main entries, all stored back-to-back
/// with no particular metadata.
///
/// An index file has two parts. The main part of the file is a sequence of
/// entries, stored back-to-back; the second part is a table of contents,
/// stored separately at the end of the file.
pub struct IndexFileWriter {
/// The number of bytes written so far.
offset: u64,
/// The open file we're writing to.
writer: BufWriter<File>,
/// The table of contents for this file.
contents_buf: Vec<u8>
}
@ -38,6 +52,7 @@ impl IndexFileWriter {
self.contents_buf.extend(bytes);
}
/// Finish writing the index file and close it.
pub fn finish(mut self) -> io::Result<()> {
let contents_start = self.offset;
self.writer.write_all(&self.contents_buf)?;

Loading…
Cancel
Save