@ -1,47 +1,200 @@
// TODO: make sure the program has sufficient permissions to read and delete files
use std ::collections ::HashMap ;
use std ::collections ::{ BTreeMap , HashMap } ;
use std ::fs ::{ File , Metadata } ;
use std ::hash ::Hash ;
use std ::io ::prelude ::* ;
use std ::path ::Path ;
use std ::time ::SystemTime ;
use std ::{ fs , process } ;
// Hard lower bound in bytes: files smaller than this are counted as ignored
// instead of checked during scanning.
// NOTE(review): seems to overlap with the configurable `min_file_size` field
// set via `set_min_file_size` — confirm which one is supposed to win.
const MIN_FILE_SIZE : u64 = 1000 ;
/// Strategy used to decide whether two files are duplicates.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum CheckingMethod {
    /// Files are considered duplicates when their sizes match.
    SIZE,
    /// Files are considered duplicates when their content hashes match
    /// (stronger, slower check run on top of the size grouping).
    HASH,
}
/// Finds duplicated files inside the included directories, grouping them
/// first by size and optionally by content hash.
/// NOTE(review): both `files_size` and `files_with_identical_size` are
/// present; `files_size` appears superseded by the latter — confirm and
/// remove the old field once callers are migrated.
pub struct DuplicateFinder {
// Number of files whose metadata was successfully read and recorded.
number_of_checked_files : usize ,
// Number of files skipped (e.g. smaller than the minimum size).
number_of_ignored_files : usize ,
// Number of folders visited during traversal.
number_of_checked_folders : usize ,
// Entries that were neither regular files nor folders (e.g. symlinks).
number_of_ignored_things : usize ,
// Count of redundant files (group size minus one, summed over groups).
number_of_duplicated_files : usize ,
// files : Vec<HashMap<FileEntry, Vec<FileEntry>>>,
// Old size->entries map — see NOTE(review) above.
files_size : HashMap < u64 , Vec < FileEntry > > ,
// files_hashes: HashMap<[u8],Vec<FileEntry>>,
// duplicated_entries // Same as files, but only with 2+ entries
// files : Vec<Vec<FileEntry>>,
// Size -> all files of that size; later pruned to sizes with 2+ entries.
files_with_identical_size : HashMap < u64 , Vec < FileEntry > > ,
// Size -> groups of files that additionally share a content hash.
files_with_identical_hashes : BTreeMap < u64 , Vec < Vec < FileEntry > > > ,
allowed_extensions : Vec < String > , // jpg, jpeg, mp4
// excluded_items: Vec<String>,
// Absolute directory paths (normalized to end with '/') excluded from scanning.
excluded_directories : Vec < String > ,
// Absolute directory paths (normalized to end with '/') to scan.
included_directories : Vec < String > ,
// ignored_extensions: Vec<String>,
// allowed_extensions: Vec<String>,
// ignored_file_names: Vec<String>, // TODO Regex Support
// allowed_file_names: Vec<String>, // TODO Regex Support
// Configurable minimum file size — see NOTE on MIN_FILE_SIZE const.
min_file_size : u64 ,
}
impl DuplicateFinder {
/// Returns a finder with zeroed statistics, empty file maps and no
/// directory, size or extension restrictions configured yet.
pub fn new() -> DuplicateFinder {
    DuplicateFinder {
        number_of_checked_files: 0,
        number_of_ignored_files: 0,
        number_of_checked_folders: 0,
        number_of_ignored_things: 0,
        number_of_duplicated_files: 0,
        min_file_size: 0,
        files_size: Default::default(),
        // files_hashes: Default::default(),
        files_with_identical_size: Default::default(),
        files_with_identical_hashes: Default::default(),
        allowed_extensions: vec![],
        // excluded_items: vec![],
        excluded_directories: vec![],
        included_directories: vec![],
        // ignored_extensions: vec![],
        // allowed_extensions: vec![],
        // ignored_file_names: vec![],
        // allowed_file_names: vec![]
    }
}
pub fn find_duplicates ( mut self , check_method : CheckingMethod ) {
self . optimize_directories ( ) ;
self . debug_print ( ) ;
self . check_files_size ( ) ;
self . remove_files_with_unique_size ( ) ;
if check_method = = CheckingMethod ::HASH {
self . check_files_hash ( ) ;
}
// self.print_duplicated_entries(check_method);
}
/// Sets the minimum size, in bytes, a file must have to be considered
/// during scanning.
pub fn set_min_file_size(&mut self, min_size: u64) {
    self.min_file_size = min_size;
}
/// Placeholder for excluded-item pattern support (e.g. hiding `/.git/`
/// via a `*.git*` pattern). Currently a deliberate no-op.
pub fn set_excluded_items(&mut self, _excluded_items: String) {
    // TODO Still don't know how to exactly parse this
    // Things like /.git/ should be by default hidden with help of this *.git*
}
pub fn set_allowed_extensions ( & mut self , mut allowed_extensions : String ) {
if allowed_extensions . is_empty ( ) {
println! ( "No allowed extension was provided, so all are allowed" ) ;
}
allowed_extensions = allowed_extensions . replace ( "IMAGE" , "jpg,kra,gif,png,bmp,tiff,webp,hdr,svg" ) ;
allowed_extensions = allowed_extensions . replace ( "VIDEO" , "mp4,flv,mkv,webm,vob,ogv,gifv,avi,mov,wmv,mpg,m4v,m4p,mpeg,3gp" ) ;
allowed_extensions = allowed_extensions . replace ( "MUSIC" , "mp3,flac,ogg,tta,wma,webm" ) ;
let extensions : Vec < String > = allowed_extensions . split ( ',' ) . map ( String ::from ) . collect ( ) ;
for mut extension in extensions {
if extension . contains ( '.' ) {
if ! extension . starts_with ( '.' ) {
println! ( "{} is not valid extension(valid extension doesn't have dot inside)" , extension ) ;
}
extension = extension . replace ( '.' , "" ) ;
}
self . allowed_extensions . push ( extension . trim ( ) . to_string ( ) ) ;
}
if self . allowed_extensions . len ( ) = = 0 {
println! ( "No valid extensions were provided, so allowing all extensions by default." ) ;
}
}
pub fn set_include_directory ( & mut self , mut include_directory : String ) {
// let start_time: SystemTime = SystemTime::now();
if include_directory . is_empty ( ) {
println! ( "At least one directory must be provided" )
}
include_directory = include_directory . replace ( "\"" , "" ) ;
let directories : Vec < String > = include_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = "/" {
println! ( "Using / is probably not good idea, you may go out of ram." ) ;
}
if directory . contains ( '*' ) {
println! ( "Include Directory ERROR: Wildcards are not supported, please don't use it." ) ;
process ::exit ( 1 ) ;
}
if directory . starts_with ( '~' ) {
println! ( "Include Directory ERROR: ~ in path isn't supported." ) ;
process ::exit ( 1 ) ;
}
if ! directory . starts_with ( '/' ) {
println! ( "Include Directory ERROR: Relative path are not supported." ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Include Directory ERROR: Path {} doesn't exists." , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Include Directory ERROR: {} isn't folder." , directory ) ;
process ::exit ( 1 ) ;
}
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
if ! directory . ends_with ( '/' ) {
checked_directories . push ( directory . trim ( ) . to_string ( ) + "/" ) ;
} else {
checked_directories . push ( directory . trim ( ) . to_string ( ) ) ;
}
}
if checked_directories . is_empty ( ) {
println! ( "Not found even one correct path to include." ) ;
process ::exit ( 1 ) ;
}
self . included_directories = checked_directories ;
//DuplicateFinder::print_time(start_time, SystemTime::now(), "set_include_directory".to_string());
}
pub fn set_exclude_directory ( & mut self , mut exclude_directory : String ) {
//let start_time: SystemTime = SystemTime::now();
if exclude_directory . is_empty ( ) {
return ;
}
exclude_directory = exclude_directory . replace ( "\"" , "" ) ;
let directories : Vec < String > = exclude_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = "/" {
println! ( "Exclude Directory ERROR: Excluding / is pointless, because it means that no files will be scanned." ) ;
}
if directory . contains ( '*' ) {
println! ( "Exclude Directory ERROR: Wildcards are not supported, please don't use it." ) ;
process ::exit ( 1 ) ;
}
if directory . starts_with ( '~' ) {
println! ( "Exclude Directory ERROR: ~ in path isn't supported." ) ;
process ::exit ( 1 ) ;
}
if ! directory . starts_with ( '/' ) {
println! ( "Exclude Directory ERROR: Relative path are not supported." ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Exclude Directory ERROR: Path {} doesn't exists." , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Exclude Directory ERROR: {} isn't folder." , directory ) ;
process ::exit ( 1 ) ;
}
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
if ! directory . ends_with ( '/' ) {
checked_directories . push ( directory . trim ( ) . to_string ( ) + "/" ) ;
} else {
checked_directories . push ( directory . trim ( ) . to_string ( ) ) ;
}
}
println! ( "{:?}" , checked_directories ) ;
self . excluded_directories = checked_directories ;
//DuplicateFinder::print_time(start_time, SystemTime::now(), "set_exclude_directory".to_string());
}
// TODO - Still isn't used but it will be probably required with GUI
// pub fn clear(&mut self) {
//
@ -54,11 +207,11 @@ impl DuplicateFinder {
// self.excluded_directories.clear();
// self.included_directories.clear();
// }
pub fn find_duplicates_by_size ( & mut self ) {
// TODO add multithread checking for file hash
//let mut path;
fn check_files_size ( & mut self ) {
// TODO maybe add multithread checking for file hash
let start_time : SystemTime = SystemTime ::now ( ) ;
let mut folders_to_check : Vec < String > = Vec ::with_capacity ( 1024 * 16 ) ; // This should be small enough too not see to big difference and big enough to store most of paths without needing to resize vector
let mut folders_to_check : Vec < String > = Vec ::with_capacity ( 1024 * 2 ) ; // This should be small enough too not see to big difference and
// big enough to store most of paths without needing to resize vector
// Add root folders for finding
for id in & self . included_directories {
@ -82,7 +235,7 @@ impl DuplicateFinder {
let mut is_excluded_dir = false ;
next_folder = "" . to_owned ( ) + & current_folder + & entry_data . file_name ( ) . into_string ( ) . unwrap ( ) + "/" ;
for ed in & self . excluded_directories {
if next_folder = = ed . to_string ( ) {
if next_folder = = * ed {
is_excluded_dir = true ;
break ;
}
@ -94,21 +247,28 @@ impl DuplicateFinder {
//println!("Directory\t - {:?}", next_folder); // DEBUG
} else if metadata . is_file ( ) {
let current_file_name = "" . to_owned ( ) + & current_folder + & entry_data . file_name ( ) . into_string ( ) . unwrap ( ) ;
// println!("File\t\t - {:?}", current_file_name); // DEBUG
//file_to_check
let fe : FileEntry = FileEntry {
path : current_file_name ,
size : metadata . len ( ) ,
created_date : metadata . created ( ) . unwrap ( ) ,
modified_date : metadata . modified ( ) . unwrap ( ) ,
} ;
if ! self . files_size . contains_key ( & metadata . len ( ) ) {
self . files_size . insert ( metadata . len ( ) , Vec ::new ( ) ) ;
if metadata . len ( ) > = MIN_FILE_SIZE {
let current_file_name = "" . to_owned ( ) + & current_folder + & entry_data . file_name ( ) . into_string ( ) . unwrap ( ) ;
// println!("File\t\t - {:?}", current_file_name); // DEBUG
//file_to_check
let fe : FileEntry = FileEntry {
path : current_file_name ,
size : metadata . len ( ) ,
created_date : metadata . created ( ) . unwrap ( ) ,
modified_date : metadata . modified ( ) . unwrap ( ) ,
} ;
// // self.files_with_identical_size.entry from below should be faster according to clippy
// if !self.files_with_identical_size.contains_key(&metadata.len()) {
// self.files_with_identical_size.insert(metadata.len(), Vec::new());
// }
self . files_with_identical_size . entry ( metadata . len ( ) ) . or_insert_with ( Vec ::new ) ;
self . files_with_identical_size . get_mut ( & metadata . len ( ) ) . unwrap ( ) . push ( fe ) ;
self . number_of_checked_files + = 1 ;
} else {
self . number_of_ignored_files + = 1 ;
}
self . files_size . get_mut ( & metadata . len ( ) ) . unwrap ( ) . push ( fe ) ;
self . number_of_checked_files + = 1 ;
} else {
// Probably this is symbolic links so we are free to ignore this
// println!("Found another type of file {} {:?}","".to_owned() + ¤t_folder + &entry_data.file_name().into_string().unwrap(), metadata) //DEBUG
@ -117,49 +277,45 @@ impl DuplicateFinder {
}
}
self . debug_print ( ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " find_duplicates ". to_string ( ) ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " check_files_size ". to_string ( ) ) ;
//println!("Duration of finding duplicates {:?}", end_time.duration_since(start_time).expect("a"));
}
// pub fn save_ to_file(&self) {}
// pub fn save_ results_ to_file(&self) {}
/// Remove files which have unique size
pub fn remove_files_with_unique_size ( & mut self ) {
fn remove_files_with_unique_size ( & mut self ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
self . debug_print ( ) ;
let mut new_hashmap : HashMap < u64 , Vec < FileEntry > > = Default ::default ( ) ;
self . number_of_duplicated_files = 0 ;
for entry in & self . files_ size {
for entry in & self . files_ with_identical_ size {
if entry . 1. len ( ) > 1 {
self . number_of_duplicated_files + = entry . 1. len ( ) - 1 ;
new_hashmap . insert ( * entry . 0 , entry . 1. clone ( ) ) ;
}
}
self . files_ size = new_hashmap ;
self . files_ with_identical_ size = new_hashmap ;
self . debug_print ( ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " optimize_files ". to_string ( ) ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " remove_files_with_unique_size ". to_string ( ) ) ;
}
/// Should be slower than checking in different ways, but still needs to be checked
pub fn find_dup licat es_by_ hashing ( mut self ) {
fn check_ files_hash( & mut self ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
let mut file_handler : File ;
let mut hashmap_with_hash : HashMap < String , Vec < FileEntry > > ;
for entry in self . files_size {
let mut hashes : Vec < String > = Vec ::new ( ) ;
if entry . 1. len ( ) > 5 {
println! ( "{}" , entry . 1. len ( ) ) ;
}
for entry in & self . files_with_identical_size {
hashmap_with_hash = Default ::default ( ) ;
for file_entry in entry . 1. iter ( ) . enumerate ( ) {
file_handler = match File ::open ( & file_entry . 1. path ) {
Ok ( T) = > T ,
Ok ( t) = > t ,
Err ( _ ) = > {
// Removing File may happens,so we should handle this
hashes . push ( "" . to_owned ( ) ) ;
continue ;
}
} ;
@ -173,11 +329,20 @@ impl DuplicateFinder {
}
hasher . update ( & buffer [ .. n ] ) ;
}
//println!("{}", hasher.finalize().to_hex().to_string());
let hash_string : String = hasher . finalize ( ) . to_hex ( ) . to_string ( ) ;
hashmap_with_hash . entry ( hash_string . to_string ( ) ) . or_insert_with ( Vec ::new ) ;
hashmap_with_hash . get_mut ( & * hash_string ) . unwrap ( ) . push ( file_entry . 1. to_owned ( ) ) ;
}
for hash_entry in hashmap_with_hash {
if hash_entry . 1. len ( ) > 1 {
self . files_with_identical_hashes . entry ( * entry . 0 ) . or_insert_with ( Vec ::new ) ;
self . files_with_identical_hashes . get_mut ( entry . 0 ) . unwrap ( ) . push ( hash_entry . 1 ) ;
// self.files_with_identical_hashes.insert(*entry.0,hash_entry.1);
}
}
}
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , "find_duplicates_by_hashing" . to_string ( ) ) ;
self . debug_print ( ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , " check_ files_hash". to_string ( ) ) ;
}
// /// I'mm not sure about performance, so maybe I
// pub fn find_small_duplicates_by_hashing(mut self){
@ -188,7 +353,7 @@ impl DuplicateFinder {
// DuplicateFinder::print_time(start_time, SystemTime::now(), "find_duplicates_by_comparting_begin_bytes_of_file".to_string());
// }
pub fn print_time ( start_time : SystemTime , end_time : SystemTime , function_name : String ) {
fn print_time ( start_time : SystemTime , end_time : SystemTime , function_name : String ) {
println! (
"Execution of function \"{}\" took {:?}" ,
function_name ,
@ -197,124 +362,80 @@ impl DuplicateFinder {
}
/// Setting include directories, panics when there is not directories available
pub fn set_include_directory ( & mut self , mut include_directory : String ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
if include_directory . is_empty ( ) {
println! ( "At least one directory must be provided" )
}
include_directory = include_directory . replace ( "\"" , "" ) ;
let directories : Vec < String > = include_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = "/" {
println! ( "Using / is probably not good idea, you may go out of ram." ) ;
}
if directory . contains ( '*' ) {
println! ( "Include Directory ERROR: Wildcards are not supported, please don't use it." ) ;
process ::exit ( 1 ) ;
}
if directory . starts_with ( '~' ) {
println! ( "Include Directory ERROR: ~ in path isn't supported." ) ;
process ::exit ( 1 ) ;
}
if ! directory . starts_with ( '/' ) {
println! ( "Include Directory ERROR: Relative path are not supported." ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Include Directory ERROR: Path {} doens't exists." , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Include Directory ERROR: {} isn't folder." , directory ) ;
process ::exit ( 1 ) ;
}
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
if ! directory . ends_with ( '/' ) {
checked_directories . push ( directory + "/" ) ;
} else {
checked_directories . push ( directory ) ;
}
}
if checked_directories . is_empty ( ) {
println! ( "Not found even one correct path to include." ) ;
process ::exit ( 1 ) ;
}
self . included_directories = checked_directories ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , "set_include_directory" . to_string ( ) ) ;
/// Debug dump of the finder's internal counters and maps.
/// The entire body is currently commented out, so calling this is a
/// no-op — the call sites are kept so it can be re-enabled easily.
fn debug_print ( & self ) {
// println!("---------------DEBUG PRINT---------------");
// println!("Number of all checked files - {}", self.number_of_checked_files);
// println!("Number of all ignored files - {}", self.number_of_ignored_files);
// println!("Number of all checked folders - {}", self.number_of_checked_folders);
// println!("Number of all ignored things - {}", self.number_of_ignored_things);
// println!("Number of duplicated files - {}", self.number_of_duplicated_files);
// let mut file_size : u64 = 0;
// for i in &self.files_with_identical_size{
// file_size += i.1.len() as u64;
// }
// println!("Files list size - {} ({})", self.files_with_identical_size.len(), file_size);
// let mut hashed_file_size : u64 = 0;
// for i in &self.files_with_identical_hashes{
// for j in i.1{
// hashed_file_size += j.len() as u64;
// }
// }
// println!("Hashed Files list size - {} ({})", self.files_with_identical_hashes.len(), hashed_file_size);
// println!("Excluded directories - {:?}", self.excluded_directories);
// println!("Included directories - {:?}", self.included_directories);
// println!("-----------------------------------------");
}
pub fn set_exclude_directory ( & mut self , mut exclude_directory : String ) {
#[ allow(dead_code) ]
fn print_duplicated_entries ( & self , check_method : CheckingMethod ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
if exclude_directory . is_empty ( ) {
return ;
}
exclude_directory = exclude_directory . replace ( "\"" , "" ) ;
let directories : Vec < String > = exclude_directory . split ( ',' ) . map ( String ::from ) . collect ( ) ;
let mut checked_directories : Vec < String > = Vec ::new ( ) ;
for directory in directories {
if directory = = "/" {
println! ( "Exclude Directory ERROR: Excluding / is pointless, because it means that no files will be scanned." ) ;
}
if directory . contains ( '*' ) {
println! ( "Exclude Directory ERROR: Wildcards are not supported, please don't use it." ) ;
process ::exit ( 1 ) ;
}
if directory . starts_with ( '~' ) {
println! ( "Exclude Directory ERROR: ~ in path isn't supported." ) ;
process ::exit ( 1 ) ;
}
if ! directory . starts_with ( '/' ) {
println! ( "Exclude Directory ERROR: Relative path are not supported." ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Exclude Directory ERROR: Path {} doens't exists." , directory ) ;
process ::exit ( 1 ) ;
}
if ! Path ::new ( & directory ) . exists ( ) {
println! ( "Exclude Directory ERROR: {} isn't folder." , directory ) ;
process ::exit ( 1 ) ;
let mut number_of_files : u64 = 0 ;
let mut number_of_groups : u64 = 0 ;
match check_method {
CheckingMethod ::HASH = > {
for i in & self . files_with_identical_hashes {
for j in i . 1 {
number_of_files + = j . len ( ) as u64 ;
number_of_groups + = 1 ;
}
}
println! ( "Found {} files in {} groups with same content:" , number_of_files , number_of_groups ) ;
for i in & self . files_with_identical_hashes {
println! ( "Size - {}" , i . 0 ) ;
for j in i . 1 {
for k in j {
println! ( "{}" , k . path ) ;
}
println! ( "----" ) ;
}
println! ( ) ;
}
}
// directory must end with /, due to possiblity of incorrect assumption, that e.g. /home/rafal is top folder to /home/rafalinho
if ! directory . ends_with ( '/' ) {
checked_directories . push ( directory + "/" ) ;
} else {
checked_directories . push ( directory ) ;
CheckingMethod ::SIZE = > {
for i in & self . files_with_identical_size {
number_of_files + = i . 1. len ( ) as u64 ;
number_of_groups + = 1 ;
}
println! ( "Found {} files in {} groups with same size(may have different content):" , number_of_files , number_of_groups ) ;
for i in & self . files_with_identical_size {
println! ( "Size - {}" , i . 0 ) ;
for j in i . 1 {
println! ( "{}" , j . path ) ;
}
println! ( ) ;
}
}
}
self . excluded_directories = checked_directories ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , "set_exclude_directory" . to_string ( ) ) ;
}
pub fn debug_print ( & self ) {
println! ( "---------------DEBUG PRINT---------------" ) ;
println! ( "Number of all checked files - {}" , self . number_of_checked_files ) ;
println! ( "Number of all checked folders - {}" , self . number_of_checked_folders ) ;
println! ( "Number of all ignored things - {}" , self . number_of_ignored_things ) ;
println! ( "Number of duplicated files - {}" , self . number_of_duplicated_files ) ;
println! ( "Files list - {}" , self . files_size . len ( ) ) ;
println! ( "Excluded directories - {:?}" , self . excluded_directories ) ;
println! ( "Included directories - {:?}" , self . included_directories ) ;
println! ( "-----------------------------------------" ) ;
DuplicateFinder ::print_time ( start_time , SystemTime ::now ( ) , "print_duplicated_entries" . to_string ( ) ) ;
}
/// Remove unused entries when included or excluded overlaps with each other or are duplicated
/// ```
/// let df : DuplicateFinder = saf
/// ```
pub fn optimize_directories ( & mut self ) {
fn optimize_directories ( & mut self ) {
let start_time : SystemTime = SystemTime ::now ( ) ;
let mut optimized_included : Vec < String > = Vec ::< String > ::new ( ) ;