@ -1,9 +1,8 @@
use std ::collections ::BTreeMap ;
use std ::collections ::HashSet ;
use std ::collections ::{ BTreeMap , HashSet } ;
use std ::fs ::File ;
use std ::hash ::Hasher ;
use std ::io ::prelude ::* ;
use std ::io ::{ self , BufWriter, Error, ErrorKind } ;
use std ::io ::{ self , Error, ErrorKind } ;
#[ cfg(target_family = " unix " ) ]
use std ::os ::unix ::fs ::MetadataExt ;
use std ::path ::Path ;
@ -11,9 +10,10 @@ use std::sync::atomic::Ordering;
use std ::{ fs , mem } ;
use crossbeam_channel ::Receiver ;
use fun_time ::fun_time ;
use futures ::channel ::mpsc ::UnboundedSender ;
use humansize ::{ format_size , BINARY } ;
use log ::{ debug , info } ;
use log ::debug ;
use rayon ::prelude ::* ;
use xxhash_rust ::xxh3 ::Xxh3 ;
@ -21,7 +21,7 @@ use crate::common::{prepare_thread_handler_common, send_info_and_wait_for_ending
use crate ::common_cache ::{ get_duplicate_cache_file , load_cache_from_file_generalized_by_size , save_cache_to_file_generalized } ;
use crate ::common_dir_traversal ::{ CheckingMethod , DirTraversalBuilder , DirTraversalResult , FileEntry , ProgressData , ToolType } ;
use crate ::common_messages ::Messages ;
use crate ::common_tool ::{ CommonData , CommonToolData };
use crate ::common_tool ::{ CommonData , CommonToolData , DeleteMethod };
use crate ::common_traits ::* ;
const TEMP_HARDLINK_FILE : & str = "rzeczek.rxrxrxl" ;
@ -44,17 +44,6 @@ impl HashType {
}
}
#[ derive(Eq, PartialEq, Clone, Debug, Copy, Default) ]
pub enum DeleteMethod {
#[ default ]
None ,
AllExceptNewest ,
AllExceptOldest ,
OneOldest ,
OneNewest ,
HardLink ,
}
#[ derive(Default) ]
pub struct Info {
pub number_of_groups_by_size : usize ,
@ -89,7 +78,6 @@ pub struct DuplicateFinder {
// File Size, next grouped by file size, next grouped by hash
files_with_identical_hashes_referenced : BTreeMap < u64 , Vec < ( FileEntry , Vec < FileEntry > ) > > ,
check_method : CheckingMethod ,
delete_method : DeleteMethod ,
hash_type : HashType ,
ignore_hard_links : bool ,
dryrun : bool ,
@ -113,25 +101,18 @@ impl DuplicateFinder {
files_with_identical_size_referenced : Default ::default ( ) ,
files_with_identical_hashes_referenced : Default ::default ( ) ,
check_method : CheckingMethod ::None ,
delete_method : DeleteMethod ::None ,
ignore_hard_links : true ,
hash_type : HashType ::Blake3 ,
dryrun : false ,
use_prehash_cache : true ,
minimal_cache_file_size : 1024 * 1024 / 4 , // By default cache only >= 256 KB files
minimal_cache_file_size : 1024 * 256 , // By default cache only >= 256 KB files
minimal_prehash_cache_file_size : 0 ,
case_sensitive_name_comparison : false ,
}
}
#[ fun_time(message = " find_duplicates " ) ]
pub fn find_duplicates ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) {
info ! ( "Starting finding duplicates" ) ;
let start_time = std ::time ::Instant ::now ( ) ;
self . find_duplicates_internal ( stop_receiver , progress_sender ) ;
info ! ( "Ended finding duplicates which took {:?}" , start_time . elapsed ( ) ) ;
}
fn find_duplicates_internal ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) {
self . optimize_dirs_before_start ( ) ;
self . common_data . use_reference_folders = ! self . common_data . directories . reference_directories . is_empty ( ) ;
@ -170,88 +151,8 @@ impl DuplicateFinder {
self . debug_print ( ) ;
}
pub fn set_case_sensitive_name_comparison ( & mut self , case_sensitive_name_comparison : bool ) {
self . case_sensitive_name_comparison = case_sensitive_name_comparison ;
}
pub const fn get_check_method ( & self ) -> & CheckingMethod {
& self . check_method
}
pub fn set_minimal_cache_file_size ( & mut self , minimal_cache_file_size : u64 ) {
self . minimal_cache_file_size = minimal_cache_file_size ;
}
pub fn set_minimal_prehash_cache_file_size ( & mut self , minimal_prehash_cache_file_size : u64 ) {
self . minimal_prehash_cache_file_size = minimal_prehash_cache_file_size ;
}
pub const fn get_files_sorted_by_names ( & self ) -> & BTreeMap < String , Vec < FileEntry > > {
& self . files_with_identical_names
}
pub fn set_use_prehash_cache ( & mut self , use_prehash_cache : bool ) {
self . use_prehash_cache = use_prehash_cache ;
}
pub const fn get_files_sorted_by_size ( & self ) -> & BTreeMap < u64 , Vec < FileEntry > > {
& self . files_with_identical_size
}
pub const fn get_files_sorted_by_size_name ( & self ) -> & BTreeMap < ( u64 , String ) , Vec < FileEntry > > {
& self . files_with_identical_size_names
}
pub const fn get_files_sorted_by_hash ( & self ) -> & BTreeMap < u64 , Vec < Vec < FileEntry > > > {
& self . files_with_identical_hashes
}
pub const fn get_information ( & self ) -> & Info {
& self . information
}
pub fn set_hash_type ( & mut self , hash_type : HashType ) {
self . hash_type = hash_type ;
}
pub fn set_ignore_hard_links ( & mut self , ignore_hard_links : bool ) {
self . ignore_hard_links = ignore_hard_links ;
}
pub fn set_dryrun ( & mut self , dryrun : bool ) {
self . dryrun = dryrun ;
}
pub fn set_check_method ( & mut self , check_method : CheckingMethod ) {
self . check_method = check_method ;
}
pub fn set_delete_method ( & mut self , delete_method : DeleteMethod ) {
self . delete_method = delete_method ;
}
pub fn get_use_reference ( & self ) -> bool {
self . common_data . use_reference_folders
}
pub fn get_files_with_identical_hashes_referenced ( & self ) -> & BTreeMap < u64 , Vec < ( FileEntry , Vec < FileEntry > ) > > {
& self . files_with_identical_hashes_referenced
}
pub fn get_files_with_identical_name_referenced ( & self ) -> & BTreeMap < String , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_names_referenced
}
pub fn get_files_with_identical_size_referenced ( & self ) -> & BTreeMap < u64 , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_size_referenced
}
pub fn get_files_with_identical_size_names_referenced ( & self ) -> & BTreeMap < ( u64 , String ) , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_size_names_referenced
}
#[ fun_time(message = " check_files_name " ) ]
fn check_files_name ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) -> bool {
debug ! ( "check_files_name - starting checking for same names" ) ;
let group_by_func = if self . case_sensitive_name_comparison {
| fe : & FileEntry | fe . path . file_name ( ) . unwrap ( ) . to_string_lossy ( ) . to_string ( )
} else {
@ -272,21 +173,13 @@ impl DuplicateFinder {
. maximal_file_size ( self . common_data . maximal_file_size )
. build ( )
. run ( ) ;
debug ! ( "check_files_name - after finding file sizes" ) ;
let res = match result {
match result {
DirTraversalResult ::SuccessFiles { grouped_file_entries , warnings } = > {
self . files_with_identical_names = grouped_file_entries ;
self . common_data . text_messages . warnings . extend ( warnings ) ;
// Create new BTreeMap without single size entries(files have not duplicates)
let mut new_map : BTreeMap < String , Vec < FileEntry > > = Default ::default ( ) ;
for ( name , vector ) in & self . files_with_identical_names {
if vector . len ( ) > 1 {
new_map . insert ( name . clone ( ) , vector . clone ( ) ) ;
}
}
self . files_with_identical_names = new_map ;
self . files_with_identical_names = grouped_file_entries . into_iter ( ) . filter ( | ( _name , vector ) | vector . len ( ) > 1 ) . collect ( ) ;
// Reference - only use in size, because later hash will be counted differently
if self . common_data . use_reference_folders {
@ -316,9 +209,7 @@ impl DuplicateFinder {
unreachable! ( )
}
DirTraversalResult ::Stopped = > false ,
} ;
debug ! ( "check_files_name - finished checking for same names" ) ;
res
}
}
fn calculate_name_stats ( & mut self ) {
@ -335,8 +226,8 @@ impl DuplicateFinder {
}
}
#[ fun_time(message = " check_files_size_name " ) ]
fn check_files_size_name ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) -> bool {
debug ! ( "check_files_size_name - starting checking for same size and name" ) ;
let group_by_func = if self . case_sensitive_name_comparison {
| fe : & FileEntry | ( fe . size , fe . path . file_name ( ) . unwrap ( ) . to_string_lossy ( ) . to_string ( ) )
} else {
@ -357,21 +248,12 @@ impl DuplicateFinder {
. maximal_file_size ( self . common_data . maximal_file_size )
. build ( )
. run ( ) ;
debug ! ( "check_files_size_name - after finding file sizes" ) ;
let res = match result {
match result {
DirTraversalResult ::SuccessFiles { grouped_file_entries , warnings } = > {
self . files_with_identical_size_names = grouped_file_entries ;
self . common_data . text_messages . warnings . extend ( warnings ) ;
// Create new BTreeMap without single size entries(files have not duplicates)
let mut new_map : BTreeMap < ( u64 , String ) , Vec < FileEntry > > = Default ::default ( ) ;
for ( name_size , vector ) in & self . files_with_identical_size_names {
if vector . len ( ) > 1 {
new_map . insert ( name_size . clone ( ) , vector . clone ( ) ) ;
}
}
self . files_with_identical_size_names = new_map ;
self . files_with_identical_size_names = grouped_file_entries . into_iter ( ) . filter ( | ( _name , vector ) | vector . len ( ) > 1 ) . collect ( ) ;
// Reference - only use in size, because later hash will be counted differently
if self . common_data . use_reference_folders {
@ -402,9 +284,7 @@ impl DuplicateFinder {
unreachable! ( )
}
DirTraversalResult ::Stopped = > false ,
} ;
debug ! ( "check_files_size_name - finished checking for same size and name" ) ;
res
}
}
fn calculate_size_name_stats ( & mut self ) {
@ -423,10 +303,8 @@ impl DuplicateFinder {
}
}
/// Read file length and puts it to different boxes(each for different lengths)
/// If in box is only 1 result, then it is removed
#[ fun_time(message = " check_files_size " ) ]
fn check_files_size ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) -> bool {
debug ! ( "check_file_size - start" ) ;
let max_stage = match self . check_method {
CheckingMethod ::Size = > 0 ,
CheckingMethod ::Hash = > 2 ,
@ -447,16 +325,12 @@ impl DuplicateFinder {
. maximal_file_size ( self . common_data . maximal_file_size )
. build ( )
. run ( ) ;
debug ! ( "check_file_size - after finding file sizes" ) ;
let res = match result {
match result {
DirTraversalResult ::SuccessFiles { grouped_file_entries , warnings } = > {
self . files_with_identical_size = grouped_file_entries ;
self . common_data . text_messages . warnings . extend ( warnings ) ;
// Create new BTreeMap without single size entries(files have not duplicates)
let old_map : BTreeMap < u64 , Vec < FileEntry > > = mem ::take ( & mut self . files_with_identical_size ) ;
for ( size , vec ) in old_map {
for ( size , vec ) in grouped_file_entries {
if vec . len ( ) < = 1 {
continue ;
}
@ -471,21 +345,21 @@ impl DuplicateFinder {
self . filter_reference_folders_by_size ( ) ;
self . calculate_size_stats ( ) ;
debug ! (
"check_file_size - after calculating size stats/duplicates, found in {} groups, {} files with same size | referenced {} groups, {} files" ,
self . files_with_identical_size . len ( ) ,
self . files_with_identical_size . values ( ) . map ( Vec ::len ) . sum ::< usize > ( ) ,
self . files_with_identical_size_referenced . len ( ) ,
self . files_with_identical_size_referenced . values ( ) . map ( | ( _fe , vec ) | vec . len ( ) ) . sum ::< usize > ( )
) ;
true
}
DirTraversalResult ::SuccessFolders { .. } = > {
unreachable! ( )
}
DirTraversalResult ::Stopped = > false ,
} ;
debug ! (
"check_file_size - after calculating size stats/duplicates, found in {} groups, {} files with same size | referenced {} groups, {} files" ,
self . files_with_identical_size . len ( ) ,
self . files_with_identical_size . values ( ) . map ( Vec ::len ) . sum ::< usize > ( ) ,
self . files_with_identical_size_referenced . len ( ) ,
self . files_with_identical_size_referenced . values ( ) . map ( | ( _fe , vec ) | vec . len ( ) ) . sum ::< usize > ( )
) ;
res
}
}
fn calculate_size_stats ( & mut self ) {
@ -504,8 +378,7 @@ impl DuplicateFinder {
}
}
/// This step check for references, only when checking for size.
/// This is needed, because later reference folders looks for hashes, not size
#[ fun_time(message = " filter_reference_folders_by_size " ) ]
fn filter_reference_folders_by_size ( & mut self ) {
if self . common_data . use_reference_folders & & self . check_method = = CheckingMethod ::Size {
let vec = mem ::take ( & mut self . files_with_identical_size )
@ -528,6 +401,7 @@ impl DuplicateFinder {
}
}
#[ fun_time(message = " prehash_load_cache_at_start " ) ]
fn prehash_load_cache_at_start ( & mut self ) -> ( BTreeMap < u64 , Vec < FileEntry > > , BTreeMap < u64 , Vec < FileEntry > > , BTreeMap < u64 , Vec < FileEntry > > ) {
// Cache algorithm
// - Load data from cache
@ -538,8 +412,6 @@ impl DuplicateFinder {
let mut non_cached_files_to_check : BTreeMap < u64 , Vec < FileEntry > > = Default ::default ( ) ;
if self . use_prehash_cache {
debug ! ( "prehash_load_cache_at_start - using prehash cache start" ) ;
let ( messages , loaded_items ) = load_cache_from_file_generalized_by_size ::< FileEntry > (
& get_duplicate_cache_file ( & self . hash_type , true ) ,
self . get_delete_outdated_cache ( ) ,
@ -573,16 +445,14 @@ impl DuplicateFinder {
format_size ( records_already_cached . values ( ) . map ( | v | v . iter ( ) . map ( | e | e . size ) . sum ::< u64 > ( ) ) . sum ::< u64 > ( ) , BINARY ) ,
) ;
} else {
debug ! ( "prehash_load_cache_at_start - not using prehash cache start" ) ;
loaded_hash_map = Default ::default ( ) ;
mem ::swap ( & mut self . files_with_identical_size , & mut non_cached_files_to_check ) ;
}
debug ! ( "prehash_load_cache_at_start - end" ) ;
( loaded_hash_map , records_already_cached , non_cached_files_to_check )
}
#[ fun_time(message = " prehash_save_cache_at_exit " ) ]
fn prehash_save_cache_at_exit ( & mut self , loaded_hash_map : BTreeMap < u64 , Vec < FileEntry > > , pre_hash_results : & Vec < ( u64 , BTreeMap < String , Vec < FileEntry > > , Vec < String > ) > ) {
debug ! ( "prehash_save_cache_at_exit - start - using prehash cache {}" , self . use_prehash_cache ) ;
if self . use_prehash_cache {
// All results = records already cached + computed results
let mut save_cache_to_hashmap : BTreeMap < String , FileEntry > = Default ::default ( ) ;
@ -612,18 +482,16 @@ impl DuplicateFinder {
self . minimal_prehash_cache_file_size ,
) ;
self . get_text_messages_mut ( ) . extend_with_another_messages ( messages ) ;
debug ! ( "prehash_save_cache_at_exit - saving prehash cache end" ) ;
}
}
#[ fun_time(message = " prehashing " ) ]
fn prehashing (
& mut self ,
stop_receiver : Option < & Receiver < ( ) > > ,
progress_sender : Option < & UnboundedSender < ProgressData > > ,
pre_checked_map : & mut BTreeMap < u64 , Vec < FileEntry > > ,
) -> Option < ( ) > {
debug ! ( "prehashing - start" ) ;
let check_type = self . hash_type ;
let ( progress_thread_handle , progress_thread_run , atomic_counter , check_was_stopped ) = prepare_thread_handler_common (
progress_sender ,
@ -636,6 +504,7 @@ impl DuplicateFinder {
let ( loaded_hash_map , records_already_cached , non_cached_files_to_check ) = self . prehash_load_cache_at_start ( ) ;
debug ! ( "Starting calculating prehash" ) ;
#[ allow(clippy::type_complexity) ]
let pre_hash_results : Vec < ( u64 , BTreeMap < String , Vec < FileEntry > > , Vec < String > ) > = non_cached_files_to_check
. par_iter ( )
@ -661,6 +530,7 @@ impl DuplicateFinder {
} )
. while_some ( )
. collect ( ) ;
debug ! ( "Completed calculating prehash" ) ;
send_info_and_wait_for_ending_all_threads ( & progress_thread_run , progress_thread_handle ) ;
@ -676,7 +546,9 @@ impl DuplicateFinder {
// Check results
for ( size , hash_map , errors ) in & pre_hash_results {
self . common_data . text_messages . warnings . append ( & mut errors . clone ( ) ) ;
if ! errors . is_empty ( ) {
self . common_data . text_messages . warnings . append ( & mut errors . clone ( ) ) ;
}
for vec_file_entry in hash_map . values ( ) {
if vec_file_entry . len ( ) > 1 {
pre_checked_map . entry ( * size ) . or_default ( ) . append ( & mut vec_file_entry . clone ( ) ) ;
@ -686,15 +558,14 @@ impl DuplicateFinder {
self . prehash_save_cache_at_exit ( loaded_hash_map , & pre_hash_results ) ;
debug ! ( "prehashing - end" ) ;
Some ( ( ) )
}
#[ fun_time(message = " full_hashing_load_cache_at_start " ) ]
fn full_hashing_load_cache_at_start (
& mut self ,
mut pre_checked_map : BTreeMap < u64 , Vec < FileEntry > > ,
) -> ( BTreeMap < u64 , Vec < FileEntry > > , BTreeMap < u64 , Vec < FileEntry > > , BTreeMap < u64 , Vec < FileEntry > > ) {
debug ! ( "full_hashing_load_cache_at_start - start" ) ;
let loaded_hash_map ;
let mut records_already_cached : BTreeMap < u64 , Vec < FileEntry > > = Default ::default ( ) ;
let mut non_cached_files_to_check : BTreeMap < u64 , Vec < FileEntry > > = Default ::default ( ) ;
@ -735,17 +606,16 @@ impl DuplicateFinder {
loaded_hash_map = Default ::default ( ) ;
mem ::swap ( & mut pre_checked_map , & mut non_cached_files_to_check ) ;
}
debug ! ( "full_hashing_load_cache_at_start - end" ) ;
( loaded_hash_map , records_already_cached , non_cached_files_to_check )
}
#[ fun_time(message = " full_hashing_save_cache_at_exit " ) ]
fn full_hashing_save_cache_at_exit (
& mut self ,
records_already_cached : BTreeMap < u64 , Vec < FileEntry > > ,
full_hash_results : & mut Vec < ( u64 , BTreeMap < String , Vec < FileEntry > > , Vec < String > ) > ,
loaded_hash_map : BTreeMap < u64 , Vec < FileEntry > > ,
) {
debug ! ( "full_hashing_save_cache_at_exit - start" ) ;
if ! self . common_data . use_cache {
return ;
}
@ -789,17 +659,15 @@ impl DuplicateFinder {
self . minimal_cache_file_size ,
) ;
self . get_text_messages_mut ( ) . extend_with_another_messages ( messages ) ;
debug ! ( "full_hashing_save_cache_at_exit - end" ) ;
}
#[ fun_time(message = " full_hashing " ) ]
fn full_hashing (
& mut self ,
stop_receiver : Option < & Receiver < ( ) > > ,
progress_sender : Option < & UnboundedSender < ProgressData > > ,
pre_checked_map : BTreeMap < u64 , Vec < FileEntry > > ,
) -> Option < ( ) > {
debug ! ( "full_hashing - start" ) ;
let check_type = self . hash_type ;
let ( progress_thread_handle , progress_thread_run , atomic_counter , check_was_stopped ) = prepare_thread_handler_common (
@ -812,59 +680,60 @@ impl DuplicateFinder {
) ;
///////////////////////////////////////////////////////////////////////////// HASHING START
{
let ( loaded_hash_map , records_already_cached , non_cached_files_to_check ) = self . full_hashing_load_cache_at_start ( pre_checked_map ) ;
let mut full_hash_results : Vec < ( u64 , BTreeMap < String , Vec < FileEntry > > , Vec < String > ) > = non_cached_files_to_check
. into_par_iter ( )
. map ( | ( size , vec_file_entry ) | {
let mut hashmap_with_hash : BTreeMap < String , Vec < FileEntry > > = Default ::default ( ) ;
let mut errors : Vec < String > = Vec ::new ( ) ;
let mut buffer = [ 0 u8 ; 1024 * 16 ] ;
atomic_counter . fetch_add ( vec_file_entry . len ( ) , Ordering ::Relaxed ) ;
for mut file_entry in vec_file_entry {
if stop_receiver . is_some ( ) & & stop_receiver . unwrap ( ) . try_recv ( ) . is_ok ( ) {
check_was_stopped . store ( true , Ordering ::Relaxed ) ;
return None ;
}
match hash_calculation ( & mut buffer , & file_entry , & check_type , u64 ::MAX ) {
Ok ( hash_string ) = > {
file_entry . hash = hash_string . clone ( ) ;
hashmap_with_hash . entry ( hash_string . clone ( ) ) . or_default ( ) . push ( file_entry ) ;
}
Err ( s ) = > errors . push ( s ) ,
let ( loaded_hash_map , records_already_cached , non_cached_files_to_check ) = self . full_hashing_load_cache_at_start ( pre_checked_map ) ;
debug ! ( "Starting full hashing of {} files" , non_cached_files_to_check . values ( ) . map ( Vec ::len ) . sum ::< usize > ( ) ) ;
let mut full_hash_results : Vec < ( u64 , BTreeMap < String , Vec < FileEntry > > , Vec < String > ) > = non_cached_files_to_check
. into_par_iter ( )
. map ( | ( size , vec_file_entry ) | {
let mut hashmap_with_hash : BTreeMap < String , Vec < FileEntry > > = Default ::default ( ) ;
let mut errors : Vec < String > = Vec ::new ( ) ;
let mut buffer = [ 0 u8 ; 1024 * 16 ] ;
atomic_counter . fetch_add ( vec_file_entry . len ( ) , Ordering ::Relaxed ) ;
for mut file_entry in vec_file_entry {
if stop_receiver . is_some ( ) & & stop_receiver . unwrap ( ) . try_recv ( ) . is_ok ( ) {
check_was_stopped . store ( true , Ordering ::Relaxed ) ;
return None ;
}
match hash_calculation ( & mut buffer , & file_entry , & check_type , u64 ::MAX ) {
Ok ( hash_string ) = > {
file_entry . hash = hash_string . clone ( ) ;
hashmap_with_hash . entry ( hash_string . clone ( ) ) . or_default ( ) . push ( file_entry ) ;
}
Err ( s ) = > errors . push ( s ) ,
}
Some ( ( size , hashmap_with_hash , errors ) )
} )
. while_some ( )
. collect ( ) ;
}
Some ( ( size , hashmap_with_hash , errors ) )
} )
. while_some ( )
. collect ( ) ;
debug ! ( "Finished full hashing" ) ;
self . full_hashing_save_cache_at_exit ( records_already_cached , & mut full_hash_results , loaded_hash_map ) ;
self . full_hashing_save_cache_at_exit ( records_already_cached , & mut full_hash_results , loaded_hash_map ) ;
send_info_and_wait_for_ending_all_threads ( & progress_thread_run , progress_thread_handle ) ;
send_info_and_wait_for_ending_all_threads ( & progress_thread_run , progress_thread_handle ) ;
// Break if stop was clicked after saving to cache
if check_was_stopped . load ( Ordering ::Relaxed ) {
return None ;
}
// Break if stop was clicked after saving to cache
if check_was_stopped . load ( Ordering ::Relaxed ) {
return None ;
}
for ( size , hash_map , mut errors ) in full_hash_results {
self . common_data . text_messages . warnings . append ( & mut errors ) ;
for ( _hash , vec_file_entry ) in hash_map {
if vec_file_entry . len ( ) > 1 {
self . files_with_identical_hashes . entry ( size ) . or_default ( ) . push ( vec_file_entry ) ;
}
for ( size , hash_map , mut errors ) in full_hash_results {
self . common_data . text_messages . warnings . append ( & mut errors ) ;
for ( _hash , vec_file_entry ) in hash_map {
if vec_file_entry . len ( ) > 1 {
self . files_with_identical_hashes . entry ( size ) . or_default ( ) . push ( vec_file_entry ) ;
}
}
}
debug ! ( "full_hashing - end" ) ;
Some ( ( ) )
}
#[ fun_time(message = " hash_reference_folders " ) ]
fn hash_reference_folders ( & mut self ) {
// Reference - only use in size, because later hash will be counted differently
if self . common_data . use_reference_folders {
@ -913,7 +782,7 @@ impl DuplicateFinder {
}
}
/// The slowest checking type, which must be applied after checking for size
#[ fun_time(message = " check_files_hash " ) ]
fn check_files_hash ( & mut self , stop_receiver : Option < & Receiver < ( ) > > , progress_sender : Option < & UnboundedSender < ProgressData > > ) -> bool {
assert_eq! ( self . check_method , CheckingMethod ::Hash ) ;
@ -936,34 +805,33 @@ impl DuplicateFinder {
true
}
/// Function to delete files, from filed before `BTreeMap`
/// Using another function to delete files to avoid duplicates data
#[ fun_time(message = " delete_files " ) ]
fn delete_files ( & mut self ) {
if self . delete_method = = DeleteMethod ::None {
if self . common_data. delete_method = = DeleteMethod ::None {
return ;
}
match self . check_method {
CheckingMethod ::Name = > {
for vector in self . files_with_identical_names . values ( ) {
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . common_data. delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
}
}
CheckingMethod ::SizeName = > {
for vector in self . files_with_identical_size_names . values ( ) {
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . common_data. delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
}
}
CheckingMethod ::Hash = > {
for vector_vectors in self . files_with_identical_hashes . values ( ) {
for vector in vector_vectors {
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . common_data. delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
}
}
}
CheckingMethod ::Size = > {
for vector in self . files_with_identical_size . values ( ) {
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
let _tuple : ( u64 , usize , usize ) = delete_files ( vector , & self . common_data. delete_method, & mut self . common_data . text_messages , self . dryrun ) ;
}
}
_ = > panic! ( ) ,
@ -971,6 +839,84 @@ impl DuplicateFinder {
}
}
impl DuplicateFinder {
pub fn set_case_sensitive_name_comparison ( & mut self , case_sensitive_name_comparison : bool ) {
self . case_sensitive_name_comparison = case_sensitive_name_comparison ;
}
pub const fn get_check_method ( & self ) -> & CheckingMethod {
& self . check_method
}
pub fn set_minimal_cache_file_size ( & mut self , minimal_cache_file_size : u64 ) {
self . minimal_cache_file_size = minimal_cache_file_size ;
}
pub fn set_minimal_prehash_cache_file_size ( & mut self , minimal_prehash_cache_file_size : u64 ) {
self . minimal_prehash_cache_file_size = minimal_prehash_cache_file_size ;
}
pub const fn get_files_sorted_by_names ( & self ) -> & BTreeMap < String , Vec < FileEntry > > {
& self . files_with_identical_names
}
pub fn set_use_prehash_cache ( & mut self , use_prehash_cache : bool ) {
self . use_prehash_cache = use_prehash_cache ;
}
pub const fn get_files_sorted_by_size ( & self ) -> & BTreeMap < u64 , Vec < FileEntry > > {
& self . files_with_identical_size
}
pub const fn get_files_sorted_by_size_name ( & self ) -> & BTreeMap < ( u64 , String ) , Vec < FileEntry > > {
& self . files_with_identical_size_names
}
pub const fn get_files_sorted_by_hash ( & self ) -> & BTreeMap < u64 , Vec < Vec < FileEntry > > > {
& self . files_with_identical_hashes
}
pub const fn get_information ( & self ) -> & Info {
& self . information
}
pub fn set_hash_type ( & mut self , hash_type : HashType ) {
self . hash_type = hash_type ;
}
pub fn set_ignore_hard_links ( & mut self , ignore_hard_links : bool ) {
self . ignore_hard_links = ignore_hard_links ;
}
pub fn set_dryrun ( & mut self , dryrun : bool ) {
self . dryrun = dryrun ;
}
pub fn set_check_method ( & mut self , check_method : CheckingMethod ) {
self . check_method = check_method ;
}
pub fn get_use_reference ( & self ) -> bool {
self . common_data . use_reference_folders
}
pub fn get_files_with_identical_hashes_referenced ( & self ) -> & BTreeMap < u64 , Vec < ( FileEntry , Vec < FileEntry > ) > > {
& self . files_with_identical_hashes_referenced
}
pub fn get_files_with_identical_name_referenced ( & self ) -> & BTreeMap < String , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_names_referenced
}
pub fn get_files_with_identical_size_referenced ( & self ) -> & BTreeMap < u64 , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_size_referenced
}
pub fn get_files_with_identical_size_names_referenced ( & self ) -> & BTreeMap < ( u64 , String ) , ( FileEntry , Vec < FileEntry > ) > {
& self . files_with_identical_size_names_referenced
}
}
impl Default for DuplicateFinder {
fn default ( ) -> Self {
Self ::new ( )
@ -978,12 +924,8 @@ impl Default for DuplicateFinder {
}
impl DebugPrint for DuplicateFinder {
#[ allow(dead_code) ]
#[ allow(unreachable_code) ]
/// Debugging printing - only available on debug build
fn debug_print ( & self ) {
#[ cfg(not(debug_assertions)) ]
{
if ! cfg! ( debug_assertions ) {
return ;
}
println! ( "---------------DEBUG PRINT---------------" ) ;
@ -1015,62 +957,40 @@ impl DebugPrint for DuplicateFinder {
println! ( "Files list size - {}" , self . files_with_identical_size . len ( ) ) ;
println! ( "Hashed Files list size - {}" , self . files_with_identical_hashes . len ( ) ) ;
println! ( "Checking Method - {:?}" , self . check_method ) ;
println! ( "Delete Method - {:?}" , self . delete_method ) ;
self . debug_print_common ( ) ;
println! ( "-----------------------------------------" ) ;
}
}
impl SaveResults for DuplicateFinder {
fn save_results_to_file ( & mut self , file_name : & str ) -> bool {
let file_name : String = match file_name {
"" = > "results.txt" . to_string ( ) ,
k = > k . to_string ( ) ,
} ;
let file_handler = match File ::create ( & file_name ) {
Ok ( t ) = > t ,
Err ( e ) = > {
self . common_data . text_messages . errors . push ( format! ( "Failed to create file {file_name}, reason {e}" ) ) ;
return false ;
}
} ;
let mut writer = BufWriter ::new ( file_handler ) ;
if let Err ( e ) = writeln! (
impl PrintResults for DuplicateFinder {
fn write_results < T : Write > ( & self , writer : & mut T ) -> io ::Result < ( ) > {
writeln! (
writer ,
"Results of searching {:?} with excluded directories {:?} and excluded items {:?}" ,
self . common_data . directories . included_directories , self . common_data . directories . excluded_directories , self . common_data . excluded_items . items
) {
self . common_data
. text_messages
. errors
. push ( format! ( "Failed to save results to file {file_name}, reason {e}" ) ) ;
return false ;
}
) ? ;
match self . check_method {
CheckingMethod ::Name = > {
if ! self . files_with_identical_names . is_empty ( ) {
writeln! (
writer ,
"-------------------------------------------------Files with same names-------------------------------------------------"
)
. unwrap ( ) ;
) ? ;
writeln! (
writer ,
"Found {} files in {} groups with same name(may have different content)" ,
self . information . number_of_duplicated_files_by_name , self . information . number_of_groups_by_name ,
)
. unwrap ( ) ;
) ? ;
for ( name , vector ) in self . files_with_identical_names . iter ( ) . rev ( ) {
writeln! ( writer , "Name - {} - {} files " , name , vector . len ( ) ) . unwrap ( ) ;
writeln! ( writer , "Name - {} - {} files " , name , vector . len ( ) ) ? ;
for j in vector {
writeln! ( writer , "{}" , j . path . display ( ) ) . unwrap ( ) ;
writeln! ( writer , "{}" , j . path . display ( ) ) ? ;
}
writeln! ( writer ) . unwrap ( ) ;
writeln! ( writer ) ? ;
}
} else {
write! ( writer , "Not found any files with same names." ) . unwrap ( ) ;
write! ( writer , "Not found any files with same names." ) ? ;
}
}
CheckingMethod ::SizeName = > {
@ -1078,23 +998,21 @@ impl SaveResults for DuplicateFinder {
writeln! (
writer ,
"-------------------------------------------------Files with same size and names-------------------------------------------------"
)
. unwrap ( ) ;
) ? ;
writeln! (
writer ,
"Found {} files in {} groups with same size and name(may have different content)" ,
self . information . number_of_duplicated_files_by_size_name , self . information . number_of_groups_by_size_name ,
)
. unwrap ( ) ;
) ? ;
for ( ( size , name ) , vector ) in self . files_with_identical_size_names . iter ( ) . rev ( ) {
writeln! ( writer , "Name - {}, {} - {} files " , name , format_size ( * size , BINARY ) , vector . len ( ) ) . unwrap ( ) ;
writeln! ( writer , "Name - {}, {} - {} files " , name , format_size ( * size , BINARY ) , vector . len ( ) ) ? ;
for j in vector {
writeln! ( writer , "{}" , j . path . display ( ) ) . unwrap ( ) ;
writeln! ( writer , "{}" , j . path . display ( ) ) ? ;
}
writeln! ( writer ) . unwrap ( ) ;
writeln! ( writer ) ? ;
}
} else {
write! ( writer , "Not found any files with same size and names." ) . unwrap ( ) ;
write! ( writer , "Not found any files with same size and names." ) ? ;
}
}
CheckingMethod ::Size = > {
writeln! (
writer ,
"-------------------------------------------------Files with same size-------------------------------------------------"
) ? ;
writeln! (
writer ,
"Found {} duplicated files which in {} groups which takes {}." ,
self . information . number_of_duplicated_files_by_size ,
self . information . number_of_groups_by_size ,
format_size ( self . information . lost_space_by_size , BINARY )
) ? ;
for ( size , vector ) in self . files_with_identical_size . iter ( ) . rev ( ) {
write! ( writer , "\n---- Size {} ({}) - {} files \n" , format_size ( * size , BINARY ) , size , vector . len ( ) ) ? ;
for file_entry in vector {
writeln! ( writer , "{}" , file_entry . path . display ( ) ) ? ;
}
}
} else {
write! ( writer , "Not found any duplicates." ) ? ;
}
}
CheckingMethod ::Hash = > {
writeln! (
writer ,
"-------------------------------------------------Files with same hashes-------------------------------------------------"
) ? ;
writeln! (
writer ,
"Found {} duplicated files which in {} groups which takes {}." ,
self . information . number_of_duplicated_files_by_hash ,
self . information . number_of_groups_by_hash ,
format_size ( self . information . lost_space_by_hash , BINARY )
) ? ;
for ( size , vectors_vector ) in self . files_with_identical_hashes . iter ( ) . rev ( ) {
for vector in vectors_vector {
writeln! ( writer , "\n---- Size {} ({}) - {} files" , format_size ( * size , BINARY ) , size , vector . len ( ) ) ? ;
for file_entry in vector {
writeln! ( writer , "{}" , file_entry . path . display ( ) ) ? ;
}
}
}
} else {
write! ( writer , "Not found any duplicates." ) ? ;
}
}
_ = > panic! ( ) ,
}
true
}
}
impl PrintResults for DuplicateFinder {
/// Print information's about duplicated entries
/// Only needed for CLI
fn print_results ( & self ) {
let mut number_of_files : u64 = 0 ;
let mut number_of_groups : u64 = 0 ;
match self . check_method {
CheckingMethod ::Name = > {
for i in & self . files_with_identical_names {
number_of_files + = i . 1. len ( ) as u64 ;
number_of_groups + = 1 ;
}
println! ( "Found {number_of_files} files in {number_of_groups} groups with same name(may have different content)" , ) ;
for ( name , vector ) in & self . files_with_identical_names {
println! ( "Name - {} - {} files " , name , vector . len ( ) ) ;
for j in vector {
println! ( "{}" , j . path . display ( ) ) ;
}
println! ( ) ;
}
}
CheckingMethod ::SizeName = > {
for i in & self . files_with_identical_size_names {
number_of_files + = i . 1. len ( ) as u64 ;
number_of_groups + = 1 ;
}
println! ( "Found {number_of_files} files in {number_of_groups} groups with same size and name(may have different content)" , ) ;
for ( ( size , name ) , vector ) in & self . files_with_identical_size_names {
println! ( "Name - {}, {} - {} files " , name , format_size ( * size , BINARY ) , vector . len ( ) ) ;
for j in vector {
println! ( "{}" , j . path . display ( ) ) ;
}
println! ( ) ;
}
}
CheckingMethod ::Hash = > {
for vector in self . files_with_identical_hashes . values ( ) {
for j in vector {
number_of_files + = j . len ( ) as u64 ;
number_of_groups + = 1 ;
}
}
println! (
"Found {} duplicated files in {} groups with same content which took {}:" ,
number_of_files ,
number_of_groups ,
format_size ( self . information . lost_space_by_size , BINARY )
) ;
for ( size , vector ) in self . files_with_identical_hashes . iter ( ) . rev ( ) {
for j in vector {
println! ( "Size - {} ({}) - {} files " , format_size ( * size , BINARY ) , size , j . len ( ) ) ;
for k in j {
println! ( "{}" , k . path . display ( ) ) ;
}
println! ( "----" ) ;
}
println! ( ) ;
}
}
CheckingMethod ::Size = > {
for i in & self . files_with_identical_size {
number_of_files + = i . 1. len ( ) as u64 ;
number_of_groups + = 1 ;
}
println! (
"Found {} files in {} groups with same size(may have different content) which took {}:" ,
number_of_files ,
number_of_groups ,
format_size ( self . information . lost_space_by_size , BINARY )
) ;
for ( size , vector ) in & self . files_with_identical_size {
println! ( "Size - {} ({}) - {} files " , format_size ( * size , BINARY ) , size , vector . len ( ) ) ;
for j in vector {
println! ( "{}" , j . path . display ( ) ) ;
}
println! ( ) ;
}
}
_ = > panic! ( ) ,
}
Ok ( ( ) )
}
}
/// Functions to remove slice(vector) of files with provided method
/// Returns size of removed elements, number of deleted and failed to delete files and modified warning list
fn delete_files ( vector : & [ FileEntry ] , delete_method : & DeleteMethod , text_messages : & mut Messages , dryrun : bool ) -> ( u64 , usize , usize ) {
assert! ( vector . len ( ) > 1 , "Vector length must be bigger than 1(This should be done in previous steps)." ) ;
let mut gained_space : u64 = 0 ;
DeleteMethod ::OneOldest | DeleteMethod ::AllExceptNewest = > values . max_by ( | ( _ , l ) , ( _ , r ) | l . modified_date . cmp ( & r . modified_date ) ) ,
DeleteMethod ::OneNewest | DeleteMethod ::AllExceptOldest | DeleteMethod ::HardLink = > values . min_by ( | ( _ , l ) , ( _ , r ) | l . modified_date . cmp ( & r . modified_date ) ) ,
DeleteMethod ::None = > values . next ( ) ,
_ = > unreachable! ( ) ,
} ;
let q_index = q_index . map_or ( 0 , | t | t . 0 ) ;
let n = match delete_method {
DeleteMethod ::OneNewest | DeleteMethod ::OneOldest = > 1 ,
DeleteMethod ::AllExceptNewest | DeleteMethod ::AllExceptOldest | DeleteMethod ::None | DeleteMethod ::HardLink = > usize ::MAX ,
_ = > unreachable! ( ) ,
} ;
for ( index , file ) in vector . iter ( ) . enumerate ( ) {
if q_index = = index {
}
}
DeleteMethod ::None = > Ok ( None ) ,
_ = > unreachable! ( ) ,
} ;
match r {