|
|
|
@ -4,8 +4,8 @@
|
|
|
|
|
//! `InMemoryIndex` can be used to do that, up to the size of the machine's
|
|
|
|
|
//! memory.
|
|
|
|
|
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
use byteorder::{LittleEndian, WriteBytesExt};
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
|
|
|
|
|
/// Break a string into words.
|
|
|
|
|
fn tokenize(text: &str) -> Vec<&str> {
|
|
|
|
@ -21,6 +21,7 @@ fn tokenize(text: &str) -> Vec<&str> {
|
|
|
|
|
/// answer simple search queries. And you can use the `read`, `write`, and
|
|
|
|
|
/// `merge` modules to save an in-memory index to disk and merge it with other
|
|
|
|
|
/// indices, producing a large index.
|
|
|
|
|
#[derive(Default)]
|
|
|
|
|
pub struct InMemoryIndex {
|
|
|
|
|
/// The total number of words in the indexed documents.
|
|
|
|
|
pub word_count: usize,
|
|
|
|
@ -34,7 +35,7 @@ pub struct InMemoryIndex {
|
|
|
|
|
/// document id in increasing order. This is handy for some algorithms you
|
|
|
|
|
/// might want to run on the index, so we preserve this property wherever
|
|
|
|
|
/// possible.
|
|
|
|
|
pub map: HashMap<String, Vec<Hit>>
|
|
|
|
|
pub map: HashMap<String, Vec<Hit>>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// A `Hit` indicates that a particular document contains some term, how many
|
|
|
|
@ -48,10 +49,7 @@ pub type Hit = Vec<u8>;
|
|
|
|
|
impl InMemoryIndex {
|
|
|
|
|
/// Create a new, empty index.
|
|
|
|
|
pub fn new() -> InMemoryIndex {
|
|
|
|
|
InMemoryIndex {
|
|
|
|
|
word_count: 0,
|
|
|
|
|
map: HashMap::new()
|
|
|
|
|
}
|
|
|
|
|
InMemoryIndex::default()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Index a single document.
|
|
|
|
@ -64,20 +62,22 @@ impl InMemoryIndex {
|
|
|
|
|
let text = text.to_lowercase();
|
|
|
|
|
let tokens = tokenize(&text);
|
|
|
|
|
for (i, token) in tokens.iter().enumerate() {
|
|
|
|
|
let hits =
|
|
|
|
|
index.map
|
|
|
|
|
.entry(token.to_string())
|
|
|
|
|
.or_insert_with(|| {
|
|
|
|
|
let mut hits = Vec::with_capacity(4 + 4);
|
|
|
|
|
hits.write_u32::<LittleEndian>(document_id).unwrap();
|
|
|
|
|
vec![hits]
|
|
|
|
|
});
|
|
|
|
|
let hits = index.map.entry(token.to_string()).or_insert_with(|| {
|
|
|
|
|
let mut hits = Vec::with_capacity(4 + 4);
|
|
|
|
|
hits.write_u32::<LittleEndian>(document_id).unwrap();
|
|
|
|
|
vec![hits]
|
|
|
|
|
});
|
|
|
|
|
hits[0].write_u32::<LittleEndian>(i as u32).unwrap();
|
|
|
|
|
index.word_count += 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if document_id % 100 == 0 {
|
|
|
|
|
println!("indexed document {}, {} bytes, {} words", document_id, text.len(), index.word_count);
|
|
|
|
|
println!(
|
|
|
|
|
"indexed document {}, {} bytes, {} words",
|
|
|
|
|
document_id,
|
|
|
|
|
text.len(),
|
|
|
|
|
index.word_count
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
index
|
|
|
|
@ -90,9 +90,7 @@ impl InMemoryIndex {
|
|
|
|
|
/// `*self` remains sorted by document id after merging.
|
|
|
|
|
pub fn merge(&mut self, other: InMemoryIndex) {
|
|
|
|
|
for (term, hits) in other.map {
|
|
|
|
|
self.map.entry(term)
|
|
|
|
|
.or_insert_with(|| vec![])
|
|
|
|
|
.extend(hits)
|
|
|
|
|
self.map.entry(term).or_insert_with(|| vec![]).extend(hits)
|
|
|
|
|
}
|
|
|
|
|
self.word_count += other.word_count;
|
|
|
|
|
}
|
|
|
|
|