Simplified a lot of cache concept

pull/1072/head
Rafał Mikrut 8 months ago
parent 97f874bffd
commit 8197bf677e

@@ -85,26 +85,86 @@ where
pub fn load_cache_from_file_generalized_by_path<T>(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap<String, T>) -> (Messages, Option<BTreeMap<String, T>>)
where
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync,
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, used_files);
let Some(vec_loaded_entries) = vec_loaded_cache else {
return (text_messages, None);
};
debug!("Converting cache vec into map");
debug!("Converting cache Vec<T> into BTreeMap<String, T>");
let map_loaded_entries: BTreeMap<String, T> = vec_loaded_entries
.into_iter()
.map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry))
.collect();
debug!("Converted cache vec into map");
debug!("Converted cache Vec<T> into BTreeMap<String, T>");
(text_messages, Some(map_loaded_entries))
}
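The by-path loader above boils down to one conversion: the generic loader hands back a flat Vec<T>, and each entry is re-keyed by its lossy path string. A minimal sketch of that step, with a hypothetical Entry type standing in for czkawka's ResultEntry implementors:

use std::collections::BTreeMap;
use std::path::PathBuf;

// Hypothetical stand-in for a cache entry type (czkawka uses FileEntry and friends).
#[derive(Clone, Debug)]
struct Entry {
    path: PathBuf,
    size: u64,
}

// Re-key a flat list of loaded entries by their lossy path string.
fn key_by_path(entries: Vec<Entry>) -> BTreeMap<String, Entry> {
    entries
        .into_iter()
        .map(|e| (e.path.to_string_lossy().into_owned(), e))
        .collect()
}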
pub fn load_cache_from_file_generalized_by_size<T>(
cache_file_name: &str,
delete_outdated_cache: bool,
cache_not_converted: &BTreeMap<u64, Vec<T>>,
) -> (Messages, Option<BTreeMap<u64, Vec<T>>>)
where
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
debug!("Converting cache BtreeMap<u64, Vec<T>> into BTreeMap<String, T>");
let mut used_files: BTreeMap<String, T> = Default::default();
for file_entry in cache_not_converted.values().flatten() {
used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone());
}
debug!("Converted cache BtreeMap<u64, Vec<T>> into BTreeMap<String, T>");
let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files);
let Some(vec_loaded_entries) = vec_loaded_cache else {
return (text_messages, None);
};
debug!("Converting cache Vec<T> into BTreeMap<u64, Vec<T>>");
let mut map_loaded_entries: BTreeMap<u64, Vec<T>> = Default::default();
for file_entry in vec_loaded_entries {
map_loaded_entries.entry(file_entry.get_size()).or_default().push(file_entry);
}
debug!("Converted cache Vec<T> into BTreeMap<u64, Vec<T>>");
(text_messages, Some(map_loaded_entries))
}
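The by-size variant converts twice: the size-keyed input map is first flattened into a path-keyed map so the generic loader can match entries by path, and whatever it loads is then regrouped by file size. A sketch of the regrouping step, reusing the hypothetical Entry type from the previous sketch:

// Group loaded entries back into buckets keyed by file size.
fn group_by_size(entries: Vec<Entry>) -> BTreeMap<u64, Vec<Entry>> {
    let mut map: BTreeMap<u64, Vec<Entry>> = BTreeMap::new();
    for entry in entries {
        map.entry(entry.size).or_default().push(entry);
    }
    map
}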
pub fn load_cache_from_file_generalized_by_path_from_size<T>(
cache_file_name: &str,
delete_outdated_cache: bool,
cache_not_converted: &BTreeMap<u64, Vec<T>>,
) -> (Messages, Option<BTreeMap<String, T>>)
where
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
debug!("Converting cache BtreeMap<u64, Vec<T>> into BTreeMap<String, T>");
let mut used_files: BTreeMap<String, T> = Default::default();
for file_entry in cache_not_converted.values().flatten() {
used_files.insert(file_entry.get_path().to_string_lossy().into_owned(), file_entry.clone());
}
debug!("Converted cache BtreeMap<u64, Vec<T>> into BTreeMap<String, T>");
let (text_messages, vec_loaded_cache) = load_cache_from_file_generalized(cache_file_name, delete_outdated_cache, &used_files);
let Some(vec_loaded_entries) = vec_loaded_cache else {
return (text_messages, None);
};
debug!("Converting cache Vec<T> into BTreeMap<String, T>");
let map_loaded_entries: BTreeMap<String, T> = vec_loaded_entries
.into_iter()
.map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry))
.collect();
debug!("Converted cache Vec<T> into BTreeMap<String, T>");
(text_messages, Some(map_loaded_entries))
}
fn load_cache_from_file_generalized<T>(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap<String, T>) -> (Messages, Option<Vec<T>>)
where
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync,
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
debug!("Loading cache from file {} (or json alternative)", cache_file_name);
let mut text_messages = Messages::new();
@@ -175,44 +235,3 @@ where
debug!("Failed to load cache from file {cache_file_name} because not exists");
(text_messages, None)
}
// pub fn save_hashes_to_file<T>(cache_file_name: &str, hashmap: &BTreeMap<String, Vec<T>>, save_also_as_json: bool) -> Messages
// where
// T: Serialize + ResultEntry + Sized + Send + Sync,
// {
// debug!("Saving cache to file {} (or also json alternative) - {} results", cache_file_name, hashmap.len());
// let mut text_messages = Messages::new();
// if let Some(((file_handler, cache_file), (file_handler_json, cache_file_json))) =
// common::open_cache_folder(cache_file_name, true, save_also_as_json, &mut text_messages.warnings)
// {
// {
// let writer = BufWriter::new(file_handler.unwrap()); // Unwrap because cannot fail here
// if let Err(e) = bincode::serialize_into(writer, &hashmap.values().collect::<Vec<_>>()) {
// text_messages
// .warnings
// .push(format!("Cannot write data to cache file {}, reason {}", cache_file.display(), e));
// debug!("Failed to save cache to file {:?}", cache_file);
// return text_messages;
// }
// debug!("Saved binary to file {:?}", cache_file);
// }
// if save_also_as_json {
// if let Some(file_handler_json) = file_handler_json {
// let writer = BufWriter::new(file_handler_json);
// if let Err(e) = serde_json::to_writer(writer, &hashmap.values().collect::<Vec<_>>()) {
// text_messages
// .warnings
// .push(format!("Cannot write data to cache file {}, reason {}", cache_file_json.display(), e));
// debug!("Failed to save cache to file {:?}", cache_file_json);
// return text_messages;
// }
// debug!("Saved json to file {:?}", cache_file_json);
// }
// }
//
// text_messages.messages.push(format!("Properly saved to file {} cache entries.", hashmap.len()));
// } else {
// debug!("Failed to save cache to file {cache_file_name} because not exists");
// }
// text_messages
// }
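The commented-out save_hashes_to_file above was folded into save_cache_to_file_generalized, but its core pattern is still visible in the old body: serialize the map's values once with bincode, and optionally a second time as JSON. A standalone sketch of that dual-format write; the file paths, the Entry type, and the error mapping here are illustrative, not czkawka's actual API:

use std::collections::BTreeMap;
use std::fs::File;
use std::io::{self, BufWriter};
use serde::Serialize;

#[derive(Serialize)]
struct Entry {
    path: String,
    size: u64,
    hash: String,
}

// Write the map values as bincode, and optionally again as JSON for inspection.
fn save_entries(bin_path: &str, json_path: Option<&str>, entries: &BTreeMap<String, Entry>) -> io::Result<()> {
    let values: Vec<&Entry> = entries.values().collect();
    let writer = BufWriter::new(File::create(bin_path)?);
    bincode::serialize_into(writer, &values).map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
    if let Some(json_path) = json_path {
        let writer = BufWriter::new(File::create(json_path)?);
        serde_json::to_writer(writer, &values).map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
    }
    Ok(())
}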

@@ -4,10 +4,10 @@ use std::collections::HashSet;
use std::fs::File;
use std::hash::Hasher;
use std::io::prelude::*;
use std::io::{self, BufReader, BufWriter, Error, ErrorKind};
use std::io::{self, BufWriter, Error, ErrorKind};
#[cfg(target_family = "unix")]
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::sync::atomic::Ordering;
use std::{fs, mem};
@@ -18,14 +18,12 @@ use log::{debug, info};
use rayon::prelude::*;
use xxhash_rust::xxh3::Xxh3;
use crate::common::{open_cache_folder, prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads};
use crate::common_cache::{get_duplicate_cache_file, save_cache_to_file_generalized};
use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending_all_threads};
use crate::common_cache::{get_duplicate_cache_file, load_cache_from_file_generalized_by_size, save_cache_to_file_generalized};
use crate::common_dir_traversal::{CheckingMethod, DirTraversalBuilder, DirTraversalResult, FileEntry, ProgressData, ToolType};
use crate::common_messages::Messages;
use crate::common_tool::{CommonData, CommonToolData};
use crate::common_traits::*;
use crate::flc;
use crate::localizer_core::generate_translation_hashmap;
const TEMP_HARDLINK_FILE: &str = "rzeczek.rxrxrxl";
@@ -536,32 +534,28 @@ impl DuplicateFinder {
if self.use_prehash_cache {
debug!("prehash_load_cache_at_start - using prehash cache start");
loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, true) {
Some(t) => t,
None => Default::default(),
};
let mut loaded_hash_map2: BTreeMap<String, FileEntry> = Default::default();
for vec_file_entry in loaded_hash_map.values() {
for file_entry in vec_file_entry {
loaded_hash_map2.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
}
}
let (messages, loaded_items) = load_cache_from_file_generalized_by_size::<FileEntry>(
&get_duplicate_cache_file(&self.hash_type, true),
self.get_delete_outdated_cache(),
&self.files_with_identical_size,
);
self.get_text_messages_mut().extend_with_another_messages(messages);
loaded_hash_map = loaded_items.unwrap_or_default();
#[allow(clippy::if_same_then_else)]
for vec_file_entry in self.files_with_identical_size.values() {
for file_entry in vec_file_entry {
let name = file_entry.path.to_string_lossy().to_string();
if !loaded_hash_map2.contains_key(&name) {
// If loaded data doesn't contain current image info
non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone());
} else if file_entry.size != loaded_hash_map2.get(&name).unwrap().size || file_entry.modified_date != loaded_hash_map2.get(&name).unwrap().modified_date {
// When the size or modification date of the image changed, it is clearly a different image
non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry.clone());
} else {
// Checking may be omitted when there is already an entry with the same size and modification date
records_already_cached.entry(file_entry.size).or_default().push(file_entry.clone());
for (size, vec_file_entry) in mem::take(&mut self.files_with_identical_size) {
if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
// TODO maybe a HashSet is not needed when there are < 4 elements
let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
for file_entry in vec_file_entry {
if cached_path_entries.contains(&file_entry.path) {
records_already_cached.entry(size).or_default().push(file_entry.clone());
} else {
non_cached_files_to_check.entry(size).or_default().push(file_entry.clone());
}
}
} else {
non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry.clone());
}
}
} else {
@@ -693,35 +687,28 @@ impl DuplicateFinder {
if self.common_data.use_cache {
debug!("full_hashing_load_cache_at_start - using cache");
loaded_hash_map = match load_hashes_from_file(&mut self.common_data.text_messages, self.common_data.delete_outdated_cache, &self.hash_type, false) {
Some(t) => t,
None => Default::default(),
};
let (messages, loaded_items) =
load_cache_from_file_generalized_by_size::<FileEntry>(&get_duplicate_cache_file(&self.hash_type, false), self.get_delete_outdated_cache(), &pre_checked_map);
self.get_text_messages_mut().extend_with_another_messages(messages);
loaded_hash_map = loaded_items.unwrap_or_default();
debug!("full_hashing_load_cache_at_start - started diff between loaded and prechecked files");
for (size, vec_file_entry) in pre_checked_map {
#[allow(clippy::collapsible_if)]
if !loaded_hash_map.contains_key(&size) {
// If loaded data doesn't contain current info
non_cached_files_to_check.insert(size, vec_file_entry);
} else {
let loaded_vec_file_entry = loaded_hash_map.get(&size).unwrap();
if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
// TODO maybe a HashSet is not needed when there are < 4 elements
let cached_path_entries = cached_vec_file_entry.iter().map(|e| &e.path).collect::<HashSet<_>>();
for file_entry in vec_file_entry {
let mut found: bool = false;
for loaded_file_entry in loaded_vec_file_entry {
if file_entry.path == loaded_file_entry.path && file_entry.modified_date == loaded_file_entry.modified_date {
records_already_cached.entry(file_entry.size).or_default().push(loaded_file_entry.clone());
found = true;
break;
}
}
if !found {
non_cached_files_to_check.entry(file_entry.size).or_default().push(file_entry);
if cached_path_entries.contains(&file_entry.path) {
records_already_cached.entry(size).or_default().push(file_entry.clone());
} else {
non_cached_files_to_check.entry(size).or_default().push(file_entry.clone());
}
}
} else {
non_cached_files_to_check.entry(size).or_default().append(&mut vec_file_entry.clone());
}
}
debug!("full_hashing_load_cache_at_start - completed diff between loaded and prechecked files");
} else {
debug!("full_hashing_load_cache_at_start - not using cache");
loaded_hash_map = Default::default();
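Both cache-load sites now share the same partition idea: per size bucket, collect the cached paths into a HashSet, then split the candidate files into already-cached entries and entries that still need hashing. A compact sketch of that partition for one bucket, reusing the PathBuf-based Entry type from the first sketch:

use std::collections::HashSet;
use std::path::PathBuf;

// Split candidates into (already cached, needs hashing) by path membership.
fn partition_by_cache(candidates: Vec<Entry>, cached: &[Entry]) -> (Vec<Entry>, Vec<Entry>) {
    let cached_paths: HashSet<&PathBuf> = cached.iter().map(|e| &e.path).collect();
    candidates.into_iter().partition(|e| cached_paths.contains(&e.path))
}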
@@ -1328,86 +1315,6 @@ pub fn make_hard_link(src: &Path, dst: &Path) -> io::Result<()> {
result
}
pub fn load_hashes_from_file(text_messages: &mut Messages, delete_outdated_cache: bool, type_of_hash: &HashType, is_prehash: bool) -> Option<BTreeMap<u64, Vec<FileEntry>>> {
if let Some(((file_handler, cache_file), (_json_file, _json_name))) =
open_cache_folder(&get_file_hash_name(type_of_hash, is_prehash), false, false, &mut text_messages.warnings)
{
// Unwrap could fail when the cache file failed to open, but the json file would still exist
let Some(file_handler) = file_handler else {
return Default::default();
};
let reader = BufReader::new(file_handler);
let mut hashmap_loaded_entries: BTreeMap<u64, Vec<FileEntry>> = Default::default();
// Read the file line by line using the lines() iterator from std::io::BufRead.
for (index, line) in reader.lines().enumerate() {
let line = match line {
Ok(t) => t,
Err(e) => {
text_messages
.warnings
.push(format!("Failed to load line number {} from cache file {}, reason {}", index + 1, cache_file.display(), e));
return None;
}
};
let uuu = line.split("//").collect::<Vec<&str>>();
if uuu.len() != 4 {
text_messages.warnings.push(format!(
"Found invalid data(too much or too low amount of data) in line {} - ({}) in cache file {}",
index + 1,
line,
cache_file.display()
));
continue;
}
// Don't load cache data if the destination file does not exist
if !delete_outdated_cache || Path::new(uuu[0]).exists() {
let file_entry = FileEntry {
path: PathBuf::from(uuu[0]),
size: match uuu[1].parse::<u64>() {
Ok(t) => t,
Err(e) => {
text_messages.warnings.push(format!(
"Found invalid size value in line {} - ({}) in cache file {}, reason {}",
index + 1,
line,
cache_file.display(),
e
));
continue;
}
},
modified_date: match uuu[2].parse::<u64>() {
Ok(t) => t,
Err(e) => {
text_messages.warnings.push(format!(
"Found invalid modified date value in line {} - ({}) in cache file {}, reason {}",
index + 1,
line,
cache_file.display(),
e
));
continue;
}
},
hash: uuu[3].to_string(),
symlink_info: None,
};
hashmap_loaded_entries.entry(file_entry.size).or_default().push(file_entry);
}
}
text_messages.messages.push(flc!(
"core_loading_from_cache",
generate_translation_hashmap(vec![("number", hashmap_loaded_entries.values().map(std::vec::Vec::len).sum::<usize>().to_string())])
));
return Some(hashmap_loaded_entries);
}
None
}
pub trait MyHasher {
fn update(&mut self, bytes: &[u8]);
fn finalize(&self) -> String;
@@ -1437,11 +1344,6 @@ fn hash_calculation(buffer: &mut [u8], file_entry: &FileEntry, hash_type: &HashT
Ok(hasher.finalize())
}
fn get_file_hash_name(type_of_hash: &HashType, is_prehash: bool) -> String {
let prehash_str = if is_prehash { "_prehash" } else { "" };
format!("cache_duplicates_{type_of_hash:?}{prehash_str}.txt")
}
impl MyHasher for blake3::Hasher {
fn update(&mut self, bytes: &[u8]) {
self.update(bytes);
@@ -1486,6 +1388,7 @@ mod tests {
use std::os::fs::MetadataExt;
#[cfg(target_family = "unix")]
use std::os::unix::fs::MetadataExt;
use std::path::PathBuf;
use super::*;

@@ -2,7 +2,8 @@ use std::collections::BTreeMap;
use std::default::Default;
use czkawka_core::common_cache::{
get_duplicate_cache_file, get_similar_images_cache_file, get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, save_cache_to_file_generalized,
get_duplicate_cache_file, get_similar_images_cache_file, get_similar_videos_cache_file, load_cache_from_file_generalized_by_path, load_cache_from_file_generalized_by_size,
save_cache_to_file_generalized,
};
use directories_next::ProjectDirs;
use gtk4::prelude::*;
@@ -124,7 +125,13 @@ pub fn connect_settings(gui_data: &GuiData) {
let mut messages: Messages = Messages::new();
for use_prehash in [true, false] {
for type_of_hash in &[HashType::Xxh3, HashType::Blake3, HashType::Crc32] {
if let Some(cache_entries) = czkawka_core::duplicate::load_hashes_from_file(&mut messages, true, type_of_hash, use_prehash) {
let (mut messages, loaded_items) = load_cache_from_file_generalized_by_size::<czkawka_core::common_dir_traversal::FileEntry>(
&get_duplicate_cache_file(type_of_hash, use_prehash),
true,
&Default::default(),
);
if let Some(cache_entries) = loaded_items {
let mut hashmap_to_save: BTreeMap<String, czkawka_core::common_dir_traversal::FileEntry> = Default::default();
for (_, vec_file_entry) in cache_entries {
for file_entry in vec_file_entry {
@@ -134,7 +141,9 @@ pub fn connect_settings(gui_data: &GuiData) {
let minimal_cache_size = entry_settings_cache_file_minimal_size.text().as_str().parse::<u64>().unwrap_or(2 * 1024 * 1024);
save_cache_to_file_generalized(&get_duplicate_cache_file(type_of_hash, use_prehash), &hashmap_to_save, false, minimal_cache_size);
let save_messages =
save_cache_to_file_generalized(&get_duplicate_cache_file(type_of_hash, use_prehash), &hashmap_to_save, false, minimal_cache_size);
messages.extend_with_another_messages(save_messages);
}
}
