Merge branch 'master' of github.com:ProgrammingRust/fingertips

master
Jim Blandy 6 years ago
commit fecdce2521

@ -64,7 +64,7 @@ impl InMemoryIndex {
let text = text.to_lowercase();
let tokens = tokenize(&text);
for (i, token) in tokens.iter().enumerate() {
let mut hits =
let hits =
index.map
.entry(token.to_string())
.or_insert_with(|| {

@ -134,6 +134,19 @@ fn start_file_indexing_thread(texts: Receiver<String>)
(receiver, handle)
}
/// Start a thread that merges in-memory indexes.
///
/// `file_indexes` receives a stream of indexes from the file indexing thread.
/// These indexes typically vary a lot in size, since the input documents will
/// typically be all different sizes.
///
/// The thread created by this function merges those indexes into "large"
/// indexes and passes these large indexes on to a new channel.
///
/// This returns a pair: a receiver, the sequence of large indexes produced by
/// merging the input indexes; and a `JoinHandle` that can be used to wait for
/// this thread to exit. This stage of the pipeline is infallible (it performs
/// no I/O).
fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>)
-> (Receiver<InMemoryIndex>, JoinHandle<()>)
{
@ -158,6 +171,14 @@ fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>)
(receiver, handle)
}
/// Start a thread that saves large indexes to temporary files.
///
/// This thread generates a meaningless unique filename for each index in
/// `big_indexes`, saves the data, and passes the filename on to a new channel.
///
/// This returns a pair: a receiver that receives the filenames; and a
/// `JoinHandle` that can be used to wait for this thread to exit and receive
/// any I/O errors it encountered.
fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>,
output_dir: &Path)
-> (Receiver<PathBuf>, JoinHandle<io::Result<()>>)
@ -178,6 +199,8 @@ fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>,
(receiver, handle)
}
/// Given a sequence of filenames of index data files, merge all the files
/// into a single index data file.
fn merge_index_files(files: Receiver<PathBuf>, output_dir: &Path)
-> io::Result<()>
{
@ -218,6 +241,13 @@ fn run_pipeline(documents: Vec<PathBuf>, output_dir: PathBuf)
result
}
/// Given some paths, generate the complete list of text files to index. We check
/// on disk whether the path is the name of a file or a directory; for
/// directories, all .txt files immediately under the directory are indexed.
/// Relative paths are fine.
///
/// It's an error if any of the `args` is not a valid path to an existing file
/// or directory.
fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> {
let mut filenames = vec![];
for arg in args {
@ -236,6 +266,7 @@ fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> {
Ok(filenames)
}
/// Generate an index for a bunch of text files.
fn run(filenames: Vec<String>, single_threaded: bool) -> io::Result<()> {
let output_dir = PathBuf::from(".");
let documents = expand_filename_arguments(filenames)?;

@ -10,21 +10,58 @@ use write::IndexFileWriter;
/// A `IndexFileReader` does a single linear pass over an index file from
/// beginning to end. Needless to say, this is not how an index is normally
/// used! This is only used when merging multiple index files.
/// used! This is used only when merging multiple index files.
///
/// The only way to advance through the file is to use the `.move_entry_to()`
/// method.
pub struct IndexFileReader {
/// Reader that reads the actual index data.
///
/// We have two readers. The index data is most of the file. There's also a
/// table of contents, stored separately at the end. We have to read them
/// in tandem, so we open the file twice.
main: BufReader<File>,
/// Reader that reads the table of contents. (Since this table is stored at
/// the end of the file, we have to begin by `seek`ing to it; see the code
/// in `IndexFileReader::open_and_delete`.)
contents: BufReader<File>,
/// The next entry in the table of contents, if any; or `None` if we've
/// reached the end of the table. `IndexFileReader` always reads ahead one
/// entry in the contents and stores it here.
next: Option<Entry>
}
/// An entry in the table of contents of an index file.
///
/// Each entry in the table of contents is small. It consists of a string, the
/// `term`; summary information about that term, as used in the corpus (`df`);
/// and a pointer to bulkier data that tells more (`offset` and `nbytes`).
pub struct Entry {
/// The term is a word that appears in one or more documents in the corpus.
/// The index file contains information about the documents that use this
/// word.
pub term: String,
/// Total number of documents in the corpus that contain this term.
pub df: u32,
/// Offset of the index data for this term from the beginning of the file, in bytes.
pub offset: u64,
/// Length of the index data for this term, in bytes.
pub nbytes: u64
}
impl IndexFileReader {
/// Open an index file to read it from beginning to end.
///
/// This deletes the file, which may not work properly on Windows. Patches
/// welcome! On Unix, it works like this: the file immediately disappears
/// from its directory, but it'll still take up space on disk until the
/// file is closed, which normally happens when the `IndexFileReader` is
/// dropped.
pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
let filename = filename.as_ref();
let mut main_raw = File::open(filename)?;
@ -53,6 +90,9 @@ impl IndexFileReader {
})
}
/// Read the next entry from the table of contents.
///
/// Returns `Ok(None)` if we have reached the end of the file.
fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> {
// If the first read here fails with `UnexpectedEof`,
// that's considered a success, with no entry read.
@ -85,8 +125,13 @@ impl IndexFileReader {
}))
}
/// Borrow a reference to the next entry in the table of contents.
/// (Since we always read ahead one entry, this method can't fail.)
///
/// Returns `None` if we've reached the end of the file.
pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }
/// True if the next entry is for the given term.
pub fn is_at(&self, term: &str) -> bool {
match self.next {
Some(ref e) => e.term == term,
@ -94,8 +139,8 @@ impl IndexFileReader {
}
}
/// Copy the current entry to the specified output stream,
/// then read the header for the next entry.
/// Copy the current entry to the specified output stream, then read the
/// header for the next entry.
pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> {
// This block limits the scope of borrowing `self.next` (for `e`),
// because after this block is over we'll want to assign to `self.next`.

@ -6,9 +6,23 @@ use index::InMemoryIndex;
use tmp::TmpDir;
use byteorder::{LittleEndian, WriteBytesExt};
/// Writer for saving an index to a binary file.
///
/// The first 8 bytes of the index file contain the offset of the table of
/// contents, in bytes. Then come the main entries, all stored back-to-back
/// with no particular metadata.
///
/// An index file has two parts. The main part of the file is a sequence of
/// entries, stored back-to-back; the second part is a table of contents,
/// stored separately at the end of the file.
pub struct IndexFileWriter {
/// The number of bytes written so far.
offset: u64,
/// The open file we're writing to.
writer: BufWriter<File>,
/// The table of contents for this file.
contents_buf: Vec<u8>
}
@ -38,6 +52,7 @@ impl IndexFileWriter {
self.contents_buf.extend(bytes);
}
/// Finish writing the index file and close it.
pub fn finish(mut self) -> io::Result<()> {
let contents_start = self.offset;
self.writer.write_all(&self.contents_buf)?;

Loading…
Cancel
Save