Initial commit.

commit dcbebecb72 by Jason Orendorff (branch: master)

.gitignore vendored

@@ -0,0 +1 @@
target

Cargo.lock generated

@@ -0,0 +1,21 @@
[root]
name = "fingertips"
version = "0.1.0"
dependencies = [
 "argparse 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "argparse"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "byteorder"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"

[metadata]
"checksum argparse 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "37bb99f5e39ee8b23b6e227f5b8f024207e8616f44aa4b8c76ecd828011667ef"
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"

Cargo.toml

@@ -0,0 +1,8 @@
[package]
name = "fingertips"
version = "0.1.0"
authors = ["Jason Orendorff <jason.orendorff@gmail.com>"]

[dependencies]
argparse = "0.2.1"
byteorder = "0.5.3"

src/index.rs

@@ -0,0 +1,75 @@
use std::collections::HashMap;
use byteorder::{LittleEndian, WriteBytesExt};

/// Break text into words: maximal runs of alphanumeric characters.
fn tokenize(text: &str) -> Vec<&str> {
    text.split(|ch: char| !ch.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .collect()
}

/// An in-memory index: for each term, the list of places it appears.
pub struct InMemoryIndex {
    pub word_count: usize,
    pub map: HashMap<String, Vec<Hit>>
}

/// A record indicating that a particular document contains some term,
/// how many times it appears, and at what offsets.
///
/// The buffer contains all the hit data in binary form, little-endian.
/// The first u32 of the data is the document id.
/// The remaining [u32] are offsets.
pub type Hit = Vec<u8>;

impl InMemoryIndex {
    pub fn new() -> InMemoryIndex {
        InMemoryIndex {
            word_count: 0,
            map: HashMap::new()
        }
    }

    pub fn from_single_document(document_id: usize, text: String) -> InMemoryIndex {
        let document_id = document_id as u32;
        let mut index = InMemoryIndex::new();
        let text = text.to_lowercase();
        let tokens = tokenize(&text);
        for (i, token) in tokens.iter().enumerate() {
            let hits =
                index.map
                .entry(token.to_string())
                .or_insert_with(|| {
                    // Room for the document id plus one offset, 4 bytes each.
                    let mut hits = Vec::with_capacity(4 + 4);
                    hits.write_u32::<LittleEndian>(document_id).unwrap();
                    vec![hits]
                });
            hits[0].write_u32::<LittleEndian>(i as u32).unwrap();
            index.word_count += 1;
        }
        if document_id % 100 == 0 {
            println!("indexed document {}, {} bytes, {} words",
                     document_id, text.len(), index.word_count);
        }
        index
    }

    /// Absorb `other` into `self`, concatenating hit lists for shared terms.
    pub fn merge(&mut self, other: InMemoryIndex) {
        for (term, hits) in other.map {
            self.map.entry(term)
                .or_insert_with(|| vec![])
                .extend(hits)
        }
        self.word_count += other.word_count;
    }

    pub fn is_empty(&self) -> bool {
        self.word_count == 0
    }

    pub fn is_large(&self) -> bool {
        // This depends on how much memory your computer has, of course.
        const REASONABLE_SIZE: usize = 100_000_000;
        self.word_count > REASONABLE_SIZE
    }
}
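
To make the Hit layout above concrete, here is a small decoding sketch that is not part of the commit; dump_first_hit is a hypothetical helper, but it reads back exactly what from_single_document writes:

use byteorder::{LittleEndian, ReadBytesExt};
use index::InMemoryIndex;

fn dump_first_hit(index: &InMemoryIndex, term: &str) {
    let hit = &index.map[term][0];    // a Hit is just a Vec<u8>
    let mut reader = &hit[..];
    let doc = reader.read_u32::<LittleEndian>().unwrap();  // first u32: document id
    while let Ok(offset) = reader.read_u32::<LittleEndian>() {
        println!("{:?}: doc {}, word offset {}", term, doc, offset);
    }
}

// dump_first_hit(&InMemoryIndex::from_single_document(0, "the quick brown the".to_string()), "the")
// prints offsets 0 and 3 for doc 0.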

src/main.rs

@@ -0,0 +1,218 @@
extern crate argparse;
extern crate byteorder;

mod index;
mod read;
mod write;
mod merge;
mod tmp;

use std::error::Error;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::sync::mpsc::{channel, Receiver};
use std::thread::{spawn, JoinHandle};
use argparse::{ArgumentParser, StoreTrue, Collect};
use index::InMemoryIndex;
use write::write_index_to_tmp_file;
use merge::FileMerge;
use tmp::TmpDir;

/// Stage 1: read documents from disk, sending each file's text downstream.
fn start_file_reader_thread(documents: Vec<PathBuf>)
    -> (Receiver<String>, JoinHandle<io::Result<()>>)
{
    let (sender, receiver) = channel();

    let handle = spawn(move || {
        for filename in documents {
            let mut f = File::open(filename)?;
            let mut text = String::new();
            f.read_to_string(&mut text)?;

            if sender.send(text).is_err() {
                // The downstream thread hung up; no point reading more.
                break;
            }
        }
        Ok(())
    });

    (receiver, handle)
}

/// Stage 2: build an in-memory index for each document.
fn start_file_indexing_thread(texts: Receiver<String>)
    -> (Receiver<InMemoryIndex>, JoinHandle<()>)
{
    let (sender, receiver) = channel();

    let handle = spawn(move || {
        for (doc_id, text) in texts.into_iter().enumerate() {
            let index = InMemoryIndex::from_single_document(doc_id, text);
            if sender.send(index).is_err() {
                break;
            }
        }
    });

    (receiver, handle)
}

/// Stage 3: merge single-document indexes in memory until they get large.
fn start_in_memory_merge_thread(file_indexes: Receiver<InMemoryIndex>)
    -> (Receiver<InMemoryIndex>, JoinHandle<()>)
{
    let (sender, receiver) = channel();

    let handle = spawn(move || {
        let mut accumulated_index = InMemoryIndex::new();
        for fi in file_indexes {
            accumulated_index.merge(fi);
            if accumulated_index.is_large() {
                if sender.send(accumulated_index).is_err() {
                    return;
                }
                accumulated_index = InMemoryIndex::new();
            }
        }
        if !accumulated_index.is_empty() {
            let _ = sender.send(accumulated_index);
        }
    });

    (receiver, handle)
}

/// Stage 4: write each large index to a temporary file on disk.
fn start_index_writer_thread(big_indexes: Receiver<InMemoryIndex>,
                             output_dir: &Path)
    -> (Receiver<PathBuf>, JoinHandle<io::Result<()>>)
{
    let (sender, receiver) = channel();

    let mut tmp_dir = TmpDir::new(output_dir);
    let handle = spawn(move || {
        for index in big_indexes {
            let file = write_index_to_tmp_file(index, &mut tmp_dir)?;
            if sender.send(file).is_err() {
                break;
            }
        }
        Ok(())
    });

    (receiver, handle)
}

/// Stage 5 (on the caller's thread): merge all the temporary files into one.
fn merge_index_files(files: Receiver<PathBuf>, output_dir: &Path)
    -> io::Result<()>
{
    let mut merge = FileMerge::new(output_dir);
    for file in files {
        merge.add_file(file)?;
    }
    merge.finish()
}

fn run_pipeline(documents: Vec<PathBuf>, output_dir: PathBuf)
    -> io::Result<()>
{
    // Launch all five stages of the pipeline.
    let (texts,   h1) = start_file_reader_thread(documents);
    let (pints,   h2) = start_file_indexing_thread(texts);
    let (gallons, h3) = start_in_memory_merge_thread(pints);
    let (files,   h4) = start_index_writer_thread(gallons, &output_dir);
    let result = merge_index_files(files, &output_dir);

    // Wait for threads to finish, holding on to any errors that they encounter.
    let r1 = h1.join().unwrap();
    h2.join().unwrap();
    h3.join().unwrap();
    let r4 = h4.join().unwrap();

    // Return the first error encountered, if any.
    // (As it happens, h2 and h3 can't fail: those threads
    // are pure in-memory data processing.)
    r1?;
    r4?;
    result
}

fn run_single_threaded(documents: Vec<PathBuf>, output_dir: PathBuf)
    -> io::Result<()>
{
    // The same five steps as run_pipeline, but in one loop on one thread.
    let mut accumulated_index = InMemoryIndex::new();
    let mut merge = FileMerge::new(&output_dir);
    let mut tmp_dir = TmpDir::new(&output_dir);

    for (doc_id, filename) in documents.into_iter().enumerate() {
        let mut f = File::open(filename)?;
        let mut text = String::new();
        f.read_to_string(&mut text)?;

        let index = InMemoryIndex::from_single_document(doc_id, text);
        accumulated_index.merge(index);
        if accumulated_index.is_large() {
            let file = write_index_to_tmp_file(accumulated_index, &mut tmp_dir)?;
            merge.add_file(file)?;
            accumulated_index = InMemoryIndex::new();
        }
    }

    if !accumulated_index.is_empty() {
        let file = write_index_to_tmp_file(accumulated_index, &mut tmp_dir)?;
        merge.add_file(file)?;
    }
    merge.finish()
}

fn expand_filename_arguments(args: Vec<String>) -> io::Result<Vec<PathBuf>> {
    let mut filenames = vec![];
    for arg in args {
        let path = PathBuf::from(arg);
        if path.metadata()?.is_dir() {
            for entry in path.read_dir()? {
                let entry = entry?;
                if entry.file_type()?.is_file() {
                    filenames.push(entry.path());
                }
            }
        } else {
            filenames.push(path);
        }
    }
    Ok(filenames)
}

fn run(filenames: Vec<String>, single_threaded: bool) -> io::Result<()> {
    let output_dir = PathBuf::from(".");
    let documents = expand_filename_arguments(filenames)?;

    if single_threaded {
        run_single_threaded(documents, output_dir)
    } else {
        run_pipeline(documents, output_dir)
    }
}

fn main() {
    let mut single_threaded = false;
    let mut filenames = vec![];

    {
        let mut ap = ArgumentParser::new();
        ap.set_description("Make an inverted index for searching documents.");
        ap.refer(&mut single_threaded)
            .add_option(&["-1", "--single-threaded"], StoreTrue,
                        "Do all the work on a single thread.");
        ap.refer(&mut filenames)
            .add_argument("filenames", Collect,
                          "Names of files/directories to index. \
                           For directories, all files immediately \
                           under the directory are indexed.");
        ap.parse_args_or_exit();
    }

    match run(filenames, single_threaded) {
        Ok(()) => {}
        Err(err) => println!("error: {}", err.description())
    }
}
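
Typical usage, assuming a directory of plain-text documents (the path here is hypothetical): running cargo run --release -- docs/ indexes every regular file directly under docs/ and leaves the merged index.dat in the current directory, since run() hard-codes output_dir as "." and merge.rs names the final file index.dat. Pass -1 (or --single-threaded) to run the same work without the pipeline.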

src/merge.rs

@@ -0,0 +1,132 @@
use std::fs::{self, File};
use std::io::{self, BufWriter};
use std::mem;
use std::path::{Path, PathBuf};
use tmp::TmpDir;
use read::IndexFileReader;
use write::IndexFileWriter;

pub struct FileMerge {
    output_dir: PathBuf,
    tmp_dir: TmpDir,
    stacks: Vec<Vec<PathBuf>>
}

// How many files to merge at a time, at most.
const NSTREAMS: usize = 8;

const MERGED_FILENAME: &'static str = "index.dat";

impl FileMerge {
    pub fn new(output_dir: &Path) -> FileMerge {
        FileMerge {
            output_dir: output_dir.to_owned(),
            tmp_dir: TmpDir::new(output_dir.to_owned()),
            stacks: vec![]
        }
    }

    pub fn add_file(&mut self, mut file: PathBuf) -> io::Result<()> {
        // Files are kept in stacks by level. When a level fills up, its
        // NSTREAMS files are merged into one file on the next level, which
        // may fill that level in turn, and so on up.
        let mut level = 0;
        loop {
            if level == self.stacks.len() {
                self.stacks.push(vec![]);
            }
            self.stacks[level].push(file);
            if self.stacks[level].len() < NSTREAMS {
                break;
            }
            let (filename, out) = self.tmp_dir.create()?;
            let mut to_merge = vec![];
            mem::swap(&mut self.stacks[level], &mut to_merge);
            merge_streams(to_merge, out)?;
            file = filename;
            level += 1;
        }
        Ok(())
    }

    pub fn finish(mut self) -> io::Result<()> {
        let mut tmp = Vec::with_capacity(NSTREAMS);
        for stack in self.stacks {
            for file in stack.into_iter().rev() {
                tmp.push(file);
                if tmp.len() == NSTREAMS {
                    merge_reversed(&mut tmp, &mut self.tmp_dir)?;
                }
            }
        }

        if tmp.len() > 1 {
            merge_reversed(&mut tmp, &mut self.tmp_dir)?;
        }
        assert!(tmp.len() <= 1);
        match tmp.pop() {
            Some(last_file) =>
                fs::rename(last_file, self.output_dir.join(MERGED_FILENAME)),
            None =>
                Err(io::Error::new(io::ErrorKind::Other,
                                   "no documents were parsed or none contained any words"))
        }
    }
}

fn merge_streams(files: Vec<PathBuf>, out: BufWriter<File>)
    -> io::Result<()>
{
    let mut streams: Vec<IndexFileReader> =
        files.into_iter()
            .map(IndexFileReader::open_and_delete)
            .collect::<io::Result<_>>()?;

    let mut output = IndexFileWriter::new(out)?;

    let mut point: u64 = 0;
    let mut count = streams.iter().filter(|s| s.peek().is_some()).count();
    while count > 0 {
        // Find the alphabetically first term any stream is currently at,
        // totaling its document frequency and data size across streams.
        let mut term = None;
        let mut nbytes = 0;
        let mut df = 0;
        for s in &streams {
            match s.peek() {
                None => {}
                Some(entry) => {
                    if term.is_none() || entry.term < *term.as_ref().unwrap() {
                        term = Some(entry.term.clone()); // XXX LAME clone
                        nbytes = entry.nbytes;
                        df = entry.df;
                    } else if entry.term == *term.as_ref().unwrap() {
                        nbytes += entry.nbytes;
                        df += entry.df;
                    }
                }
            }
        }
        let term = term.expect("bug in algorithm!");

        // Copy that term's hit data from every stream that has it.
        for s in &mut streams {
            if s.is_at(&term) {
                s.move_entry_to(&mut output)?;
                if s.peek().is_none() {
                    count -= 1;
                }
            }
        }

        output.write_contents_entry(term, df, point, nbytes as u64);
        point += nbytes as u64;
    }

    assert!(streams.iter().all(|s| s.peek().is_none()));
    output.finish()
}

fn merge_reversed(filenames: &mut Vec<PathBuf>, tmp_dir: &mut TmpDir) -> io::Result<()> {
    filenames.reverse();
    let (merged_filename, out) = tmp_dir.create()?;
    let mut to_merge = Vec::with_capacity(NSTREAMS);
    mem::swap(filenames, &mut to_merge);
    merge_streams(to_merge, out)?;
    filenames.push(merged_filename);
    Ok(())
}
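
The level/stack bookkeeping in add_file is easiest to see as arithmetic. A toy model, not part of the commit, that tracks only the number of files per level:

// Each stack level holds at most NSTREAMS - 1 files; a full level merges
// into one file on the next level, like a carry in base-NSTREAMS addition.
const NSTREAMS: usize = 8;

fn add_file(stacks: &mut Vec<usize>) {
    let mut level = 0;
    loop {
        if level == stacks.len() {
            stacks.push(0);
        }
        stacks[level] += 1;
        if stacks[level] < NSTREAMS {
            break;
        }
        stacks[level] = 0; // merged into one file on the next level
        level += 1;
    }
}

fn main() {
    let mut stacks = vec![];
    for _ in 0..64 {
        add_file(&mut stacks);
    }
    assert_eq!(stacks, vec![0, 0, 1]); // 64 = 8 * 8: one file, two levels up
    println!("{:?}", stacks);
}

Each stack plays the role of a digit in base NSTREAMS, so the number of times any given file's data gets re-merged stays logarithmic in the total number of files.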

src/read.rs

@@ -0,0 +1,115 @@
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::{self, BufReader, SeekFrom};
use std::path::Path;
use byteorder::{LittleEndian, ReadBytesExt};
use write::IndexFileWriter;

/// An `IndexFileReader` does a single linear pass over an index file from
/// beginning to end. Needless to say, this is not how an index is normally
/// used! This is only used when merging multiple index files.
pub struct IndexFileReader {
    /// Reads the main section (the hit data).
    main: BufReader<File>,
    /// A second read head, positioned in the table of contents.
    contents: BufReader<File>,
    /// The next table-of-contents entry, read one ahead; `None` at end.
    next: Option<Entry>
}

/// One table-of-contents entry: where a term's hit data lives.
pub struct Entry {
    pub term: String,
    pub df: u32,
    pub offset: u64,
    pub nbytes: u64
}

impl IndexFileReader {
    pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
        let filename = filename.as_ref();
        let mut main_raw = File::open(filename)?;

        // Read the file header.
        let contents_offset = main_raw.read_u64::<LittleEndian>()?;
        println!("opened {}, table of contents starts at {}",
                 filename.display(), contents_offset);

        // Open again so we have two read heads;
        // move the contents read head to its starting position.
        // Set up buffering.
        let mut contents_raw = File::open(filename)?;
        contents_raw.seek(SeekFrom::Start(contents_offset))?;
        let main = BufReader::new(main_raw);
        let mut contents = BufReader::new(contents_raw);

        // We always read ahead one entry, so load the first entry right away.
        let first = IndexFileReader::read_entry(&mut contents)?;

        fs::remove_file(filename)?; // YOLO: the open handles keep the data readable (on Unix)

        Ok(IndexFileReader {
            main: main,
            contents: contents,
            next: first
        })
    }

    fn read_entry(f: &mut BufReader<File>) -> io::Result<Option<Entry>> {
        // If the first read here fails with `UnexpectedEof`,
        // that's considered a success, with no entry read.
        let offset = match f.read_u64::<LittleEndian>() {
            Ok(value) => value,
            Err(err) =>
                if err.kind() == io::ErrorKind::UnexpectedEof {
                    return Ok(None)
                } else {
                    return Err(err)
                }
        };

        let nbytes = f.read_u64::<LittleEndian>()?;
        let df = f.read_u32::<LittleEndian>()?;
        let term_len = f.read_u32::<LittleEndian>()? as usize;
        let mut bytes = Vec::with_capacity(term_len);
        bytes.resize(term_len, 0);
        f.read_exact(&mut bytes)?;
        let term = match String::from_utf8(bytes) {
            Ok(s) => s,
            Err(_) => return Err(io::Error::new(io::ErrorKind::Other, "unicode fail"))
        };

        Ok(Some(Entry {
            term: term,
            df: df,
            offset: offset,
            nbytes: nbytes
        }))
    }

    /// Borrow a reference to the next entry, if any.
    pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }

    /// True if the next entry is for the given term.
    pub fn is_at(&self, term: &str) -> bool {
        match self.next {
            Some(ref e) => e.term == term,
            None => false
        }
    }

    /// Copy the current entry to the specified output stream,
    /// then read the header for the next entry.
    pub fn move_entry_to(&mut self, out: &mut IndexFileWriter) -> io::Result<()> {
        // This block limits the scope of borrowing `self.next` (for `e`),
        // because after this block is over we'll want to assign to `self.next`.
        {
            let e = self.next.as_ref().expect("no entry to move");
            if e.nbytes > usize::max_value() as u64 {
                // This can only happen on 32-bit platforms.
                return Err(io::Error::new(io::ErrorKind::Other,
                                          "computer not big enough to hold index entry"));
            }
            let mut buf = Vec::with_capacity(e.nbytes as usize);
            buf.resize(e.nbytes as usize, 0);
            self.main.read_exact(&mut buf)?;
            out.write_main(&buf)?;
        }

        self.next = Self::read_entry(&mut self.contents)?;
        Ok(())
    }
}
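
Taken together with IndexFileWriter in write.rs, the reads above imply this on-disk layout for every index file (a reconstruction; the commit never spells it out in one place). All integers are little-endian:

    offset 0    u64   where the table of contents starts
    offset 8    ...   main section: each term's hit buffers, concatenated
    contents    per term, in sorted order:
                  u64   offset of the term's hit data
                  u64   nbytes of hit data
                  u32   df (number of hit buffers, one per document)
                  u32   term length in bytes
                  ...   the term itself, UTF-8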

src/tmp.rs

@@ -0,0 +1,42 @@
use std::io::{self, BufWriter};
use std::fs::{self, File};
use std::path::{Path, PathBuf};

/// A directory for temporary index files, handing out fresh numbered
/// filenames like tmp00000001.dat, tmp00000002.dat, and so on.
#[derive(Clone)]
pub struct TmpDir {
    dir: PathBuf,
    n: usize
}

impl TmpDir {
    pub fn new<P: AsRef<Path>>(dir: P) -> TmpDir {
        TmpDir {
            dir: dir.as_ref().to_owned(),
            n: 1
        }
    }

    pub fn create(&mut self) -> io::Result<(PathBuf, BufWriter<File>)> {
        let mut tries = 1;
        loop {
            let filename = self.dir.join(PathBuf::from(format!("tmp{:08x}.dat", self.n)));
            self.n += 1;
            match fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&filename)
            {
                Ok(f) =>
                    return Ok((filename, BufWriter::new(f))),
                Err(exc) =>
                    if tries < 999 && exc.kind() == io::ErrorKind::AlreadyExists {
                        // Name taken; keep going and try the next number.
                    } else {
                        return Err(exc);
                    }
            }
            tries += 1;
        }
    }
}
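
A minimal usage sketch for TmpDir, not part of the commit (the path and data are illustrative):

use std::io::Write;
use tmp::TmpDir;

fn demo() -> std::io::Result<()> {
    let mut tmp_dir = TmpDir::new(".");
    // create() never clobbers an existing file: create_new(true) fails with
    // AlreadyExists, and the loop above just moves on to the next number.
    let (path, mut writer) = tmp_dir.create()?;   // e.g. ./tmp00000001.dat
    writer.write_all(b"scratch data")?;
    println!("wrote {}", path.display());
    Ok(())
}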

src/write.rs

@@ -0,0 +1,73 @@
use std::fs::File;
use std::io::{self, BufWriter, SeekFrom};
use std::io::prelude::*;
use std::path::PathBuf;
use index::InMemoryIndex;
use tmp::TmpDir;
use byteorder::{LittleEndian, WriteBytesExt};

/// Writes an index file: a main section of hit data, followed by a table
/// of contents, with a header pointing at the table of contents.
pub struct IndexFileWriter {
    /// Bytes written so far, including the 8-byte header.
    offset: u64,
    writer: BufWriter<File>,
    /// The table of contents, accumulated in memory and written last.
    contents_buf: Vec<u8>
}

impl IndexFileWriter {
    pub fn new(mut f: BufWriter<File>) -> io::Result<IndexFileWriter> {
        const HEADER_SIZE: u64 = 8;
        // Reserve space for the header; finish() comes back to fill it in.
        f.write_u64::<LittleEndian>(0)?;
        Ok(IndexFileWriter {
            offset: HEADER_SIZE,
            writer: f,
            contents_buf: vec![]
        })
    }

    pub fn write_main(&mut self, buf: &[u8]) -> io::Result<()> {
        self.writer.write_all(buf)?;
        self.offset += buf.len() as u64;
        Ok(())
    }

    pub fn write_contents_entry(&mut self, term: String, df: u32, offset: u64, nbytes: u64) {
        self.contents_buf.write_u64::<LittleEndian>(offset).unwrap();
        self.contents_buf.write_u64::<LittleEndian>(nbytes).unwrap();
        self.contents_buf.write_u32::<LittleEndian>(df).unwrap();
        let bytes = term.bytes();
        self.contents_buf.write_u32::<LittleEndian>(bytes.len() as u32).unwrap();
        self.contents_buf.extend(bytes);
    }

    pub fn finish(mut self) -> io::Result<()> {
        let contents_start = self.offset;
        self.writer.write_all(&self.contents_buf)?;
        println!("{} bytes main, {} bytes total",
                 contents_start, contents_start + self.contents_buf.len() as u64);
        // Go back and record where the table of contents starts.
        self.writer.seek(SeekFrom::Start(0))?;
        self.writer.write_u64::<LittleEndian>(contents_start)?;
        Ok(())
    }
}

pub fn write_index_to_tmp_file(index: InMemoryIndex, tmp_dir: &mut TmpDir) -> io::Result<PathBuf> {
    let (filename, f) = tmp_dir.create()?;
    let mut writer = IndexFileWriter::new(f)?;

    // The merge algorithm requires the entries within each file to be sorted by term.
    // Sort before writing anything.
    let mut index_as_vec: Vec<_> = index.map.into_iter().collect();
    index_as_vec.sort_by(|&(ref a, _), &(ref b, _)| a.cmp(b));

    for (term, hits) in index_as_vec {
        let df = hits.len() as u32;
        let start = writer.offset;
        for buffer in hits {
            writer.write_main(&buffer)?;
        }
        let stop = writer.offset;
        writer.write_contents_entry(term, df, start, stop - start);
    }

    writer.finish()?;
    println!("wrote file {:?}", filename);
    Ok(filename)
}
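
To tie write.rs, tmp.rs, and read.rs together, here is a small round-trip sketch, again not part of the commit: index one document, write it to a temporary file, and peek at the first table-of-contents entry.

use index::InMemoryIndex;
use read::IndexFileReader;
use tmp::TmpDir;
use write::write_index_to_tmp_file;

fn round_trip() -> std::io::Result<()> {
    let index = InMemoryIndex::from_single_document(0, "hello hello world".to_string());
    let mut tmp_dir = TmpDir::new(".");
    let path = write_index_to_tmp_file(index, &mut tmp_dir)?;

    // open_and_delete removes the file, just as the merge stage does.
    let reader = IndexFileReader::open_and_delete(path)?;
    let entry = reader.peek().expect("file should have entries");
    assert_eq!(entry.term, "hello");  // entries are sorted by term
    assert_eq!(entry.df, 1);          // one hit buffer: doc 0, offsets 0 and 1
    Ok(())
}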