@ -729,19 +729,28 @@ impl SimilarImages {
} ;
//// PROGRESS THREAD END
for hash in & all_hashes {
self . bktree . add ( hash . to_vec ( ) ) ;
// Don't use hashes with multiple images in bktree, because they will always be master of group and cannot be find by other hashes
let mut additional_chunk_to_check : Vec < _ > = Default ::default ( ) ;
let mut hashes_with_multiple_images : HashSet < _ > = Default ::default ( ) ; // Fast way to check if hash have multiple imaages
for ( hash , vec_files ) in & all_hashed_images {
if vec_files . len ( ) > = 2 {
additional_chunk_to_check . push ( hash ) ;
hashes_with_multiple_images . insert ( hash ) ;
} else {
self . bktree . add ( hash . to_vec ( ) ) ;
}
}
let number_of_processors = num_cpus ::get ( ) ;
let chunk_size = all_hashes . len ( ) / number_of_processors ;
let chunks : Vec < _ > = if chunk_size > 0 { all_hashes . chunks ( chunk_size ) . collect ( ) } else { vec! [ & all_hashes ] } ;
let mut chunks : Vec < _ > = if chunk_size > 0 { all_hashes . chunks ( chunk_size ) . collect ( ) } else { vec! [ & all_hashes ] } ;
chunks . push ( & additional_chunk_to_check ) ;
let parts : Vec < _ > = chunks
. into_par_iter ( )
. map ( | hashes_to_check | {
let mut hashes_parents : HashMap < & Vec < u8 > , u32 > = Default ::default ( ) ; // Hash used as parent, childrens
let mut hashes_similarity : HashMap < & Vec < u8 > , ( & Vec < u8 > , u32 ) > = Default ::default ( ) ; // Hash used as child, (parent_hash,similarity)
let mut hashes_parents : HashMap < & Vec < u8 > , u32 > = Default ::default ( ) ; // Hash es used as parent (hash , children_number_of_ha sh)
let mut hashes_similarity : HashMap < & Vec < u8 > , ( & Vec < u8 > , u32 ) > = Default ::default ( ) ; // Hash es used as child, (parent_hash, similarity)
// Sprawdź czy hash nie jest użyty jako master gdzie indziej
// Jeśli tak to przejdź do sprawdzania kolejnego elementu
@ -752,15 +761,16 @@ impl SimilarImages {
for ( index , hash_to_check ) in hashes_to_check . iter ( ) . enumerate ( ) {
// Don't check for user stop too often
// Also don't add too ofte r data to variables
const CYCLES_COUNTER : usize = 50 ;
if index % CYCLES_COUNTER = = 0 & & index ! = 0 {
// Also don't add too ofte n data to atomic variable
const CYCLES_COUNTER : usize = 0b111111 ;
if ( ( index & CYCLES_COUNTER ) = = CYCLES_COUNTER ) & & index ! = 0 {
atomic_mode_counter . fetch_add ( CYCLES_COUNTER , Ordering ::Relaxed ) ;
if stop_receiver . is_some ( ) & & stop_receiver . unwrap ( ) . try_recv ( ) . is_ok ( ) {
check_was_stopped . store ( true , Ordering ::Relaxed ) ;
return None ;
}
}
hashes_parents . insert ( hash_to_check , 0 ) ;
let mut found_items = self
. bktree
@ -770,55 +780,15 @@ impl SimilarImages {
found_items . sort_unstable_by_key ( | f | f . 0 ) ;
for ( similarity , other_hash ) in found_items {
// SSSTART
// Cannot use hash if already is used as master record(have more than 0 children)
if let Some ( children_number ) = hashes_parents . get ( other_hash ) {
if * children_number > 0 {
continue ;
}
}
// If there is already record, with smaller sensitivity, then replace it
let mut need_to_add = false ;
let mut need_to_check = false ;
// TODO replace variables from above with closures
// If current checked hash, have parent, first we must check if similarity between them is lower than checked item
if let Some ( ( current_parent_hash , current_similarity_with_parent ) ) = hashes_similarity . get ( hash_to_check ) {
if * current_similarity_with_parent > similarity {
need_to_check = true ;
* hashes_parents . get_mut ( current_parent_hash ) . unwrap ( ) - = 1 ;
hashes_similarity . remove ( hash_to_check ) . unwrap ( ) ;
}
} else {
need_to_check = true ;
}
if need_to_check {
if let Some ( ( other_parent_hash , other_similarity ) ) = hashes_similarity . get ( other_hash ) {
if * other_similarity > similarity {
need_to_add = true ;
* hashes_parents . get_mut ( other_parent_hash ) . unwrap ( ) - = 1 ;
}
}
// But when there is no record, just add it
else {
need_to_add = true
}
}
if need_to_add {
hashes_similarity . insert ( other_hash , ( hash_to_check , similarity ) ) ;
if let Some ( number_of_children ) = hashes_parents . get_mut ( hash_to_check ) {
* number_of_children + = 1 ;
} else {
hashes_parents . insert ( hash_to_check , 1 ) ;
}
}
// ENND
for ( similarity , compared_hash ) in found_items {
image_to_check (
& mut hashes_parents ,
& mut hashes_similarity ,
& hashes_with_multiple_images ,
hash_to_check ,
compared_hash ,
similarity ,
) ;
}
}
@ -849,66 +819,39 @@ impl SimilarImages {
hashes_similarity = first_hashes_similarity ;
}
for ( _partial_hashes_with_parents , partial_hashes_with_similarity ) in iter {
for ( hash_to_check , ( other_hash , similarity ) ) in partial_hashes_with_similarity {
// SSSTART
// Cannot use hash if already is used as master record(have more than 0 children)
if let Some ( children_number ) = hashes_parents . get ( other_hash ) {
if * children_number > 0 {
continue ;
}
for ( partial_hashes_with_parents , partial_hashes_with_similarity ) in iter {
for ( parent_hash , _child_number ) in partial_hashes_with_parents {
if ! hashes_parents . contains_key ( parent_hash ) & & ! hashes_similarity . contains_key ( parent_hash ) {
hashes_parents . insert ( parent_hash , 0 ) ;
}
}
// If there is already record, with smaller sensitivity, then replace it
let mut need_to_add = false ;
let mut need_to_check = false ;
// TODO replace variables from above with closures
// If current checked hash, have parent, first we must check if similarity between them is lower than checked item
if let Some ( ( current_parent_hash , current_similarity_with_parent ) ) = hashes_similarity . get ( hash_to_check ) {
if * current_similarity_with_parent > similarity {
need_to_check = true ;
* hashes_parents . get_mut ( current_parent_hash ) . unwrap ( ) - = 1 ;
hashes_similarity . remove ( hash_to_check ) . unwrap ( ) ;
}
} else {
need_to_check = true ;
}
if need_to_check {
if let Some ( ( other_parent_hash , other_similarity ) ) = hashes_similarity . get ( other_hash ) {
if * other_similarity > similarity {
need_to_add = true ;
* hashes_parents . get_mut ( other_parent_hash ) . unwrap ( ) - = 1 ;
}
}
// But when there is no record, just add it
else {
need_to_add = true
}
}
if need_to_add {
hashes_similarity . insert ( other_hash , ( hash_to_check , similarity ) ) ;
if let Some ( number_of_children ) = hashes_parents . get_mut ( hash_to_check ) {
* number_of_children + = 1 ;
} else {
hashes_parents . insert ( hash_to_check , 1 ) ;
}
}
// ENND
for ( hash_to_check , ( compared_hash , similarity ) ) in partial_hashes_with_similarity {
image_to_check (
& mut hashes_parents ,
& mut hashes_similarity ,
& hashes_with_multiple_images ,
hash_to_check ,
compared_hash ,
similarity ,
) ;
}
}
#[ cfg(debug_assertions) ]
debug_check_for_duplicated_things ( hashes_parents . clone ( ) , hashes_similarity . clone ( ) , all_hashed_images . clone ( ) , "LATTER" ) ;
// Collecting results
// Just simple check if all original hashes with multiple entries are available in end results
let original_hashes_at_start = hashes_with_multiple_images . len ( ) ;
let original_hashes_in_end_results = hashes_parents
. iter ( )
. filter ( | ( parent_hash , _child_number ) | hashes_with_multiple_images . contains ( * parent_hash ) )
. count ( ) ;
assert_eq! ( original_hashes_at_start , original_hashes_in_end_results ) ;
// Collecting results to vector
for ( parent_hash , child_number ) in hashes_parents {
if child_number > 0 {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 | | hashes_with_multiple_images . contains ( parent_hash ) {
let vec_fe = all_hashed_images . get ( parent_hash ) . unwrap ( ) . clone ( ) ;
collected_similar_images . insert ( parent_hash . clone ( ) , vec_fe ) ;
}
@ -1040,6 +983,61 @@ impl SimilarImages {
}
}
fn image_to_check < ' a > (
hashes_parents : & mut HashMap < & ' a Vec < u8 > , u32 > ,
hashes_similarity : & mut HashMap < & ' a Vec < u8 > , ( & ' a Vec < u8 > , u32 ) > ,
hashes_with_multiple_images : & HashSet < & ' a Vec < u8 > > ,
hash_to_check : & ' a Vec < u8 > ,
compared_hash : & ' a Vec < u8 > ,
similarity : u32 ,
) {
if let Some ( children_number ) = hashes_parents . get ( compared_hash ) {
if * children_number > 0 | | hashes_with_multiple_images . contains ( compared_hash ) {
return ;
}
}
// If there is already record, with smaller sensitivity, then replace it
let mut need_to_add = false ;
let mut need_to_check = false ;
// TODO consider to replace variables from above with closures
// If current checked hash, have parent, first we must check if similarity between them is lower than checked item
if let Some ( ( current_parent_hash , current_similarity_with_parent ) ) = hashes_similarity . get ( hash_to_check ) {
if * current_similarity_with_parent > similarity {
need_to_check = true ;
* hashes_parents . get_mut ( current_parent_hash ) . unwrap ( ) - = 1 ;
hashes_similarity . remove ( hash_to_check ) . unwrap ( ) ;
}
} else {
need_to_check = true ;
}
if need_to_check {
if let Some ( ( other_parent_hash , other_similarity ) ) = hashes_similarity . get ( compared_hash ) {
if * other_similarity > similarity {
need_to_add = true ;
* hashes_parents . get_mut ( other_parent_hash ) . unwrap ( ) - = 1 ;
}
}
// But when there is no record, just add it
else {
need_to_add = true
}
}
if need_to_add {
hashes_similarity . insert ( compared_hash , ( hash_to_check , similarity ) ) ;
if let Some ( number_of_children ) = hashes_parents . get_mut ( hash_to_check ) {
* number_of_children + = 1 ;
} else {
hashes_parents . insert ( hash_to_check , 1 ) ;
}
}
}
impl Default for SimilarImages {
fn default ( ) -> Self {
Self ::new ( )
@ -1368,12 +1366,14 @@ fn debug_check_for_duplicated_things(
all_hashed_images : HashMap < Vec < u8 > , Vec < FileEntry > > ,
numm : & str ,
) {
let mut found_broken_thing = false ;
let mut hashmap_hashes : HashSet < _ > = Default ::default ( ) ;
let mut hashmap_names : HashSet < _ > = Default ::default ( ) ;
for ( hash , number_of_children ) in & hashes_parents {
if * number_of_children > 0 {
if hashmap_hashes . contains ( * hash ) {
println! ( "------1--HASH--{} {:?}" , numm , all_hashed_images . get ( * hash ) . unwrap ( ) ) ;
found_broken_thing = true ;
}
hashmap_hashes . insert ( hash . to_vec ( ) ) ;
@ -1381,6 +1381,7 @@ fn debug_check_for_duplicated_things(
let name = i . path . to_string_lossy ( ) . to_string ( ) ;
if hashmap_names . contains ( & name ) {
println! ( "------1--NAME--{} {:?}" , numm , name ) ;
found_broken_thing = true ;
}
hashmap_names . insert ( name ) ;
}
@ -1389,6 +1390,7 @@ fn debug_check_for_duplicated_things(
for hash in hashes_similarity . keys ( ) {
if hashmap_hashes . contains ( * hash ) {
println! ( "------2--HASH--{} {:?}" , numm , all_hashed_images . get ( * hash ) . unwrap ( ) ) ;
found_broken_thing = true ;
}
hashmap_hashes . insert ( hash . to_vec ( ) ) ;
@ -1396,8 +1398,13 @@ fn debug_check_for_duplicated_things(
let name = i . path . to_string_lossy ( ) . to_string ( ) ;
if hashmap_names . contains ( & name ) {
println! ( "------2--NAME--{} {:?}" , numm , name ) ;
found_broken_thing = true ;
}
hashmap_names . insert ( name ) ;
}
}
if found_broken_thing {
panic! ( ) ;
}
}