fix: clippy lints

Signed-off-by: simonsan <14062932+simonsan@users.noreply.github.com>
pull/5/head
simonsan 3 months ago
parent 604ac59393
commit e538dfe98b

@@ -4,8 +4,8 @@
 //! `InMemoryIndex` can be used to do that, up to the size of the machine's
 //! memory.
 
-use std::collections::HashMap;
 use byteorder::{LittleEndian, WriteBytesExt};
+use std::collections::HashMap;
 
 /// Break a string into words.
 fn tokenize(text: &str) -> Vec<&str> {
@@ -34,7 +34,7 @@ pub struct InMemoryIndex {
     /// document id in increasing order. This is handy for some algorithms you
     /// might want to run on the index, so we preserve this property wherever
     /// possible.
-    pub map: HashMap<String, Vec<Hit>>
+    pub map: HashMap<String, Vec<Hit>>,
 }
 
 /// A `Hit` indicates that a particular document contains some term, how many
@@ -47,37 +47,39 @@ pub type Hit = Vec<u8>;
 impl InMemoryIndex {
     /// Create a new, empty index.
-    pub fn new() -> InMemoryIndex {
-        InMemoryIndex {
+    pub fn new() -> Self {
+        Self {
             word_count: 0,
-            map: HashMap::new()
+            map: HashMap::new(),
         }
     }
 
     /// Index a single document.
     ///
    /// The resulting index contains exactly one `Hit` per term.
-    pub fn from_single_document(document_id: usize, text: String) -> InMemoryIndex {
+    pub fn from_single_document(document_id: usize, text: String) -> Self {
         let document_id = document_id as u32;
-        let mut index = InMemoryIndex::new();
+        let mut index = Self::new();
         let text = text.to_lowercase();
         let tokens = tokenize(&text);
         for (i, token) in tokens.iter().enumerate() {
-            let hits =
-                index.map
-                .entry(token.to_string())
-                .or_insert_with(|| {
-                    let mut hits = Vec::with_capacity(4 + 4);
-                    hits.write_u32::<LittleEndian>(document_id).unwrap();
-                    vec![hits]
-                });
+            let hits = index.map.entry((*token).to_string()).or_insert_with(|| {
+                let mut hits = Vec::with_capacity(4 + 4);
+                hits.write_u32::<LittleEndian>(document_id).unwrap();
+                vec![hits]
+            });
             hits[0].write_u32::<LittleEndian>(i as u32).unwrap();
             index.word_count += 1;
         }
         if document_id % 100 == 0 {
-            println!("indexed document {}, {} bytes, {} words", document_id, text.len(), index.word_count);
+            println!(
+                "indexed document {}, {} bytes, {} words",
+                document_id,
+                text.len(),
+                index.word_count
+            );
         }
 
         index
@@ -88,11 +90,9 @@ impl InMemoryIndex {
     /// If both `*self` and `other` are sorted by document id, and all document
     /// ids in `other` are greater than every document id in `*self`, then
     /// `*self` remains sorted by document id after merging.
-    pub fn merge(&mut self, other: InMemoryIndex) {
+    pub fn merge(&mut self, other: Self) {
         for (term, hits) in other.map {
-            self.map.entry(term)
-                .or_insert_with(|| vec![])
-                .extend(hits)
+            self.map.entry(term).or_default().extend(hits);
         }
         self.word_count += other.word_count;
     }

@@ -3,27 +3,27 @@ use std::io::{self, BufWriter};
 use std::mem;
 use std::path::{Path, PathBuf};
 
-use crate::tmp::TmpDir;
 use crate::read::IndexFileReader;
+use crate::tmp::TmpDir;
 use crate::write::IndexFileWriter;
 
 pub struct FileMerge {
     output_dir: PathBuf,
     tmp_dir: TmpDir,
-    stacks: Vec<Vec<PathBuf>>
+    stacks: Vec<Vec<PathBuf>>,
 }
 
 // How many files to merge at a time, at most.
 const NSTREAMS: usize = 8;
 
-const MERGED_FILENAME: &'static str = "index.dat";
+const MERGED_FILENAME: &str = "index.dat";
 
 impl FileMerge {
-    pub fn new(output_dir: &Path) -> FileMerge {
-        FileMerge {
+    pub fn new(output_dir: &Path) -> Self {
+        Self {
             output_dir: output_dir.to_owned(),
-            tmp_dir: TmpDir::new(output_dir.to_owned()),
-            stacks: vec![]
+            tmp_dir: TmpDir::new(output_dir),
+            stacks: vec![],
         }
     }
@@ -63,22 +63,20 @@ impl FileMerge {
         }
         assert!(tmp.len() <= 1);
         match tmp.pop() {
-            Some(last_file) =>
-                fs::rename(last_file, self.output_dir.join(MERGED_FILENAME)),
-            None =>
-                Err(io::Error::new(io::ErrorKind::Other,
-                                   "no documents were parsed or none contained any words"))
+            Some(last_file) => fs::rename(last_file, self.output_dir.join(MERGED_FILENAME)),
+            None => Err(io::Error::new(
+                io::ErrorKind::Other,
+                "no documents were parsed or none contained any words",
+            )),
         }
     }
 }
 
-fn merge_streams(files: Vec<PathBuf>, out: BufWriter<File>)
-    -> io::Result<()>
-{
-    let mut streams: Vec<IndexFileReader> =
-        files.into_iter()
-            .map(IndexFileReader::open_and_delete)
-            .collect::<io::Result<_>>()?;
+fn merge_streams(files: Vec<PathBuf>, out: BufWriter<File>) -> io::Result<()> {
+    let mut streams: Vec<IndexFileReader> = files
+        .into_iter()
+        .map(IndexFileReader::open_and_delete)
+        .collect::<io::Result<_>>()?;
 
     let mut output = IndexFileWriter::new(out)?;
@@ -113,8 +111,8 @@ fn merge_streams(files: Vec<PathBuf>, out: BufWriter<File>)
                 }
             }
         }
-        output.write_contents_entry(term, df, point, nbytes as u64);
-        point += nbytes as u64;
+        output.write_contents_entry(term, df, point, nbytes);
+        point += nbytes;
     }
 
     assert!(streams.iter().all(|s| s.peek().is_none()));

@@ -1,12 +1,12 @@
 //! Reading index files linearly from disk, a capability needed for merging
 //! index files.
 
+use crate::write::IndexFileWriter;
+use byteorder::{LittleEndian, ReadBytesExt};
 use std::fs::{self, File};
 use std::io::prelude::*;
 use std::io::{self, BufReader, SeekFrom};
 use std::path::Path;
-use byteorder::{LittleEndian, ReadBytesExt};
-use crate::write::IndexFileWriter;
 
 /// A `IndexFileReader` does a single linear pass over an index file from
 /// beginning to end. Needless to say, this is not how an index is normally
@@ -30,7 +30,7 @@ pub struct IndexFileReader {
     /// The next entry in the table of contents, if any; or `None` if we've
     /// reached the end of the table. `IndexFileReader` always reads ahead one
     /// entry in the contents and stores it here.
-    next: Option<Entry>
+    next: Option<Entry>,
 }
 
 /// An entry in the table of contents of an index file.
@@ -51,7 +51,7 @@ pub struct Entry {
     pub offset: u64,
 
     /// Length of the index data for this term, in bytes.
-    pub nbytes: u64
+    pub nbytes: u64,
 }
 
 impl IndexFileReader {
@@ -62,31 +62,35 @@ impl IndexFileReader {
     /// from its directory, but it'll still take up space on disk until the
     /// file is closed, which normally happens when the `IndexFileReader` is
     /// dropped.
-    pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<IndexFileReader> {
+    pub fn open_and_delete<P: AsRef<Path>>(filename: P) -> io::Result<Self> {
         let filename = filename.as_ref();
         let mut main_raw = File::open(filename)?;
 
         // Read the file header.
         let contents_offset = main_raw.read_u64::<LittleEndian>()?;
-        println!("opened {}, table of contents starts at {}", filename.display(), contents_offset);
+        println!(
+            "opened {}, table of contents starts at {}",
+            filename.display(),
+            contents_offset
+        );
 
         // Open again so we have two read heads;
         // move the contents read head to its starting position.
         // Set up buffering.
         let mut contents_raw = File::open(filename)?;
-        contents_raw.seek(SeekFrom::Start(contents_offset))?;
+        let _start = contents_raw.seek(SeekFrom::Start(contents_offset))?;
         let main = BufReader::new(main_raw);
         let mut contents = BufReader::new(contents_raw);
 
         // We always read ahead one entry, so load the first entry right away.
-        let first = IndexFileReader::read_entry(&mut contents)?;
+        let first = Self::read_entry(&mut contents)?;
 
-        fs::remove_file(filename)?; // YOLO
+        fs::remove_file(filename)?; // YOLO
 
-        Ok(IndexFileReader {
-            main: main,
-            contents: contents,
-            next: first
+        Ok(Self {
+            main,
+            contents,
+            next: first,
         })
     }
@@ -98,30 +102,30 @@ impl IndexFileReader {
         // that's considered a success, with no entry read.
         let offset = match f.read_u64::<LittleEndian>() {
             Ok(value) => value,
-            Err(err) =>
+            Err(err) => {
                 if err.kind() == io::ErrorKind::UnexpectedEof {
-                    return Ok(None)
+                    return Ok(None);
                 } else {
-                    return Err(err)
+                    return Err(err);
                 }
+            }
         };
 
         let nbytes = f.read_u64::<LittleEndian>()?;
         let df = f.read_u32::<LittleEndian>()?;
         let term_len = f.read_u32::<LittleEndian>()? as usize;
-        let mut bytes = Vec::with_capacity(term_len);
-        bytes.resize(term_len, 0);
+        let mut bytes = vec![0; term_len];
         f.read_exact(&mut bytes)?;
 
         let term = match String::from_utf8(bytes) {
             Ok(s) => s,
-            Err(_) => return Err(io::Error::new(io::ErrorKind::Other, "unicode fail"))
+            Err(_) => return Err(io::Error::new(io::ErrorKind::Other, "unicode fail")),
         };
 
         Ok(Some(Entry {
-            term: term,
-            df: df,
-            offset: offset,
-            nbytes: nbytes
+            term,
+            df,
+            offset,
+            nbytes,
         }))
     }
@@ -129,13 +133,15 @@ impl IndexFileReader {
     /// (Since we always read ahead one entry, this method can't fail.)
     ///
     /// Returns `None` if we've reached the end of the file.
-    pub fn peek(&self) -> Option<&Entry> { self.next.as_ref() }
+    pub fn peek(&self) -> Option<&Entry> {
+        self.next.as_ref()
+    }
 
     /// True if the next entry is for the given term.
     pub fn is_at(&self, term: &str) -> bool {
         match self.next {
             Some(ref e) => e.term == term,
-            None => false
+            None => false,
         }
     }
@@ -148,11 +154,12 @@ impl IndexFileReader {
         let e = self.next.as_ref().expect("no entry to move");
         if e.nbytes > usize::max_value() as u64 {
             // This can only happen on 32-bit platforms.
-            return Err(io::Error::new(io::ErrorKind::Other,
-                "computer not big enough to hold index entry"));
+            return Err(io::Error::new(
+                io::ErrorKind::Other,
+                "computer not big enough to hold index entry",
+            ));
         }
-        let mut buf = Vec::with_capacity(e.nbytes as usize);
-        buf.resize(e.nbytes as usize, 0);
+        let mut buf = vec![0; e.nbytes as usize];
         self.main.read_exact(&mut buf)?;
         out.write_main(&buf)?;
     }

@@ -9,8 +9,8 @@ pub struct TmpDir {
 }
 
 impl TmpDir {
-    pub fn new<P: AsRef<Path>>(dir: P) -> TmpDir {
-        TmpDir {
+    pub fn new<P: AsRef<Path>>(dir: P) -> Self {
+        Self {
             dir: dir.as_ref().to_owned(),
             n: 1
         }

@@ -1,10 +1,10 @@
-use std::fs::File;
-use std::io::{self, BufWriter, SeekFrom};
-use std::io::prelude::*;
-use std::path::PathBuf;
 use crate::index::InMemoryIndex;
 use crate::tmp::TmpDir;
 use byteorder::{LittleEndian, WriteBytesExt};
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::{self, BufWriter, SeekFrom};
+use std::path::PathBuf;
 
 /// Writer for saving an index to a binary file.
 ///
@@ -23,17 +23,17 @@ pub struct IndexFileWriter {
     writer: BufWriter<File>,
 
     /// The table of contents for this file.
-    contents_buf: Vec<u8>
+    contents_buf: Vec<u8>,
 }
 
 impl IndexFileWriter {
-    pub fn new(mut f: BufWriter<File>) -> io::Result<IndexFileWriter> {
+    pub fn new(mut f: BufWriter<File>) -> io::Result<Self> {
         const HEADER_SIZE: u64 = 8;
         f.write_u64::<LittleEndian>(0)?;
-        Ok(IndexFileWriter {
+        Ok(Self {
             offset: HEADER_SIZE,
             writer: f,
-            contents_buf: vec![]
+            contents_buf: vec![],
         })
     }
@@ -48,7 +48,9 @@ impl IndexFileWriter {
         self.contents_buf.write_u64::<LittleEndian>(nbytes).unwrap();
         self.contents_buf.write_u32::<LittleEndian>(df).unwrap();
         let bytes = term.bytes();
-        self.contents_buf.write_u32::<LittleEndian>(bytes.len() as u32).unwrap();
+        self.contents_buf
+            .write_u32::<LittleEndian>(bytes.len() as u32)
+            .unwrap();
         self.contents_buf.extend(bytes);
     }
@@ -56,8 +58,12 @@ impl IndexFileWriter {
     pub fn finish(mut self) -> io::Result<()> {
         let contents_start = self.offset;
         self.writer.write_all(&self.contents_buf)?;
-        println!("{} bytes main, {} bytes total", contents_start, contents_start + self.contents_buf.len() as u64);
-        self.writer.seek(SeekFrom::Start(0))?;
+        println!(
+            "{} bytes main, {} bytes total",
+            contents_start,
+            contents_start + self.contents_buf.len() as u64
+        );
+        let _start = self.writer.seek(SeekFrom::Start(0))?;
         self.writer.write_u64::<LittleEndian>(contents_start)?;
         Ok(())
     }
@@ -70,7 +76,7 @@ pub fn write_index_to_tmp_file(index: InMemoryIndex, tmp_dir: &mut TmpDir) -> io
     // The merge algorithm requires the entries within each file to be sorted by term.
     // Sort before writing anything.
     let mut index_as_vec: Vec<_> = index.map.into_iter().collect();
-    index_as_vec.sort_by(|&(ref a, _), &(ref b, _)| a.cmp(b));
+    index_as_vec.sort_by(|(a, _), (b, _)| a.cmp(b));
 
     for (term, hits) in index_as_vec {
         let df = hits.len() as u32;
@@ -83,6 +89,6 @@ pub fn write_index_to_tmp_file(index: InMemoryIndex, tmp_dir: &mut TmpDir) -> io
     }
 
     writer.finish()?;
-    println!("wrote file {:?}", filename);
+    println!("wrote file {filename:?}");
     Ok(filename)
 }
