From d29935d2082083d2f62f66c4406f9b16f691a978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mikrut?= <41945903+qarmin@users.noreply.github.com> Date: Mon, 29 Aug 2022 21:37:31 +0200 Subject: [PATCH] Fix similar images algorithm (#799) * Fixed missing images with similarity equal to 0 * Checking * Unify algorithm between two functions * Finally fix problem with missing images * Lock and debug * No comment --- Cargo.lock | 81 ++++++----- czkawka_core/src/similar_images.rs | 223 +++++++++++++++-------------- 2 files changed, 155 insertions(+), 149 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c144fae..d8964b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -352,9 +352,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.17" +version = "3.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29e724a68d9319343bb3328c9cc2dfde263f4b3142ee1059a9980580171c954b" +checksum = "b15f2ea93df33549dbe2e8eecd1ca55269d63ae0b3ba1f55db030817d1c2867f" dependencies = [ "atty", "bitflags", @@ -369,9 +369,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "3.2.17" +version = "3.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13547f7012c01ab4a0e8f8967730ada8f9fdf419e8b6c792788f39cf4e46eefa" +checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65" dependencies = [ "heck", "proc-macro-error", @@ -409,9 +409,9 @@ checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" [[package]] name = "cpufeatures" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1079fb8528d9f9c888b1e8aa651e6e079ade467323d58f75faf1d30b1808f540" +checksum = "dc948ebb96241bb40ab73effeb80d9f93afaad49359d159a5e61be51619fe813" dependencies = [ "libc", ] @@ -660,16 +660,15 @@ dependencies = [ [[package]] name = "exr" -version = "1.4.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14cc0e06fb5f67e5d6beadf3a382fec9baca1aa751c6d5368fdeee7e5932c215" +checksum = "78c26a90d9dd411a3d119d6f55752fb4c134ca243250c32fb9cab7b2561638d2" dependencies = [ "bit_field", - "deflate 1.0.0", "flume", "half", - "inflate", "lebe", + "miniz_oxide", "smallvec", "threadpool", ] @@ -827,9 +826,9 @@ checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" [[package]] name = "futures" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab30e97ab6aacfe635fad58f22c2bb06c8b685f7421eb1e064a729e2a5f481fa" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" dependencies = [ "futures-channel", "futures-core", @@ -842,9 +841,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bfc52cbddcfd745bf1740338492bb0bd83d76c67b445f91c5fb29fae29ecaa1" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" dependencies = [ "futures-core", "futures-sink", @@ -852,15 +851,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2acedae88d38235936c3922476b10fced7b2b68136f5e3c03c2d5be348a1115" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" [[package]] name = "futures-executor" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d11aa21b5b587a64682c0094c2bdd4df0076c5324961a40cc3abd7f37930528" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" dependencies = [ "futures-core", "futures-task", @@ -869,15 +868,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93a66fc6d035a26a3ae255a6d2bca35eda63ae4c5512bef54449113f7a1228e5" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" [[package]] name = "futures-macro" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0db9cce532b0eae2ccf2766ab246f114b56b9cf6d445e00c2549fbc100ca045d" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" dependencies = [ "proc-macro2", "quote", @@ -886,21 +885,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca0bae1fe9752cf7fd9b0064c674ae63f97b37bc714d745cbde0afb7ec4e6765" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" [[package]] name = "futures-task" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "842fc63b931f4056a24d59de13fb1272134ce261816e063e634ad0c15cdc5306" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" [[package]] name = "futures-util" -version = "0.3.23" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0828a5471e340229c11c77ca80017937ce3c58cb788a17e5f1c2d5c485a9577" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" dependencies = [ "futures-channel", "futures-core", @@ -1557,9 +1556,9 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" +checksum = "9f80bf5aacaf25cbfc8210d1cfb718f2bf3b11c4c54e5afe36c236853a8ec390" dependencies = [ "autocfg", "scopeguard", @@ -1940,9 +1939,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" -version = "2.2.1" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69486e2b8c2d2aeb9762db7b4e00b0331156393555cff467f4163ff06821eef8" +checksum = "4b0560d531d1febc25a3c9398a62a71256c0178f2e3443baedd9ad4bb8c9deb4" dependencies = [ "thiserror", "ucd-trie", @@ -2352,18 +2351,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.143" +version = "1.0.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53e8e5d5b70924f74ff5c6d64d9a5acd91422117c60f48c4e07855238a254553" +checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.143" +version = "1.0.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3d8e8de557aee63c26b85b947f5e59b690d0454c753f3adeb5cd7835ab88391" +checksum = "94ed3a816fb1d101812f83e789f888322c34e291f894f19590dc310963e87a00" dependencies = [ "proc-macro2", "quote", @@ -2372,9 +2371,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.83" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7" +checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" dependencies = [ "itoa", "ryu", @@ -2779,9 +2778,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db76ff9fa4b1458b3c7f077f3ff9887394058460d21e634355b273aaf11eea45" +checksum = "3c3f9a28b618c3a6b9251b6908e9c99e04b9e5c02e6581ccbb67d59c34ef7f9b" dependencies = [ "itoa", "libc", @@ -3205,5 +3204,5 @@ dependencies = [ "hmac", "pbkdf2", "sha1", - "time 0.3.13", + "time 0.3.14", ] diff --git a/czkawka_core/src/similar_images.rs b/czkawka_core/src/similar_images.rs index 771f914..43e8228 100644 --- a/czkawka_core/src/similar_images.rs +++ b/czkawka_core/src/similar_images.rs @@ -729,19 +729,28 @@ impl SimilarImages { }; //// PROGRESS THREAD END - for hash in &all_hashes { - self.bktree.add(hash.to_vec()); + // Don't use hashes with multiple images in bktree, because they will always be master of group and cannot be find by other hashes + let mut additional_chunk_to_check: Vec<_> = Default::default(); + let mut hashes_with_multiple_images: HashSet<_> = Default::default(); // Fast way to check if hash have multiple imaages + for (hash, vec_files) in &all_hashed_images { + if vec_files.len() >= 2 { + additional_chunk_to_check.push(hash); + hashes_with_multiple_images.insert(hash); + } else { + self.bktree.add(hash.to_vec()); + } } let number_of_processors = num_cpus::get(); let chunk_size = all_hashes.len() / number_of_processors; - let chunks: Vec<_> = if chunk_size > 0 { all_hashes.chunks(chunk_size).collect() } else { vec![&all_hashes] }; + let mut chunks: Vec<_> = if chunk_size > 0 { all_hashes.chunks(chunk_size).collect() } else { vec![&all_hashes] }; + chunks.push(&additional_chunk_to_check); let parts: Vec<_> = chunks .into_par_iter() .map(|hashes_to_check| { - let mut hashes_parents: HashMap<&Vec, u32> = Default::default(); // Hash used as parent, childrens - let mut hashes_similarity: HashMap<&Vec, (&Vec, u32)> = Default::default(); // Hash used as child, (parent_hash,similarity) + let mut hashes_parents: HashMap<&Vec, u32> = Default::default(); // Hashes used as parent (hash, children_number_of_hash) + let mut hashes_similarity: HashMap<&Vec, (&Vec, u32)> = Default::default(); // Hashes used as child, (parent_hash, similarity) // Sprawdź czy hash nie jest użyty jako master gdzie indziej // Jeśli tak to przejdź do sprawdzania kolejnego elementu @@ -752,15 +761,16 @@ impl SimilarImages { for (index, hash_to_check) in hashes_to_check.iter().enumerate() { // Don't check for user stop too often - // Also don't add too ofter data to variables - const CYCLES_COUNTER: usize = 50; - if index % CYCLES_COUNTER == 0 && index != 0 { + // Also don't add too often data to atomic variable + const CYCLES_COUNTER: usize = 0b111111; + if ((index & CYCLES_COUNTER) == CYCLES_COUNTER) && index != 0 { atomic_mode_counter.fetch_add(CYCLES_COUNTER, Ordering::Relaxed); if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() { check_was_stopped.store(true, Ordering::Relaxed); return None; } } + hashes_parents.insert(hash_to_check, 0); let mut found_items = self .bktree @@ -770,55 +780,15 @@ impl SimilarImages { found_items.sort_unstable_by_key(|f| f.0); - for (similarity, other_hash) in found_items { - // SSSTART - // Cannot use hash if already is used as master record(have more than 0 children) - if let Some(children_number) = hashes_parents.get(other_hash) { - if *children_number > 0 { - continue; - } - } - - // If there is already record, with smaller sensitivity, then replace it - let mut need_to_add = false; - let mut need_to_check = false; - - // TODO replace variables from above with closures - // If current checked hash, have parent, first we must check if similarity between them is lower than checked item - if let Some((current_parent_hash, current_similarity_with_parent)) = hashes_similarity.get(hash_to_check) { - if *current_similarity_with_parent > similarity { - need_to_check = true; - - *hashes_parents.get_mut(current_parent_hash).unwrap() -= 1; - hashes_similarity.remove(hash_to_check).unwrap(); - } - } else { - need_to_check = true; - } - - if need_to_check { - if let Some((other_parent_hash, other_similarity)) = hashes_similarity.get(other_hash) { - if *other_similarity > similarity { - need_to_add = true; - *hashes_parents.get_mut(other_parent_hash).unwrap() -= 1; - } - } - // But when there is no record, just add it - else { - need_to_add = true - } - } - - if need_to_add { - hashes_similarity.insert(other_hash, (hash_to_check, similarity)); - - if let Some(number_of_children) = hashes_parents.get_mut(hash_to_check) { - *number_of_children += 1; - } else { - hashes_parents.insert(hash_to_check, 1); - } - } - // ENND + for (similarity, compared_hash) in found_items { + image_to_check( + &mut hashes_parents, + &mut hashes_similarity, + &hashes_with_multiple_images, + hash_to_check, + compared_hash, + similarity, + ); } } @@ -849,66 +819,39 @@ impl SimilarImages { hashes_similarity = first_hashes_similarity; } - for (_partial_hashes_with_parents, partial_hashes_with_similarity) in iter { - for (hash_to_check, (other_hash, similarity)) in partial_hashes_with_similarity { - // SSSTART - // Cannot use hash if already is used as master record(have more than 0 children) - if let Some(children_number) = hashes_parents.get(other_hash) { - if *children_number > 0 { - continue; - } + for (partial_hashes_with_parents, partial_hashes_with_similarity) in iter { + for (parent_hash, _child_number) in partial_hashes_with_parents { + if !hashes_parents.contains_key(parent_hash) && !hashes_similarity.contains_key(parent_hash) { + hashes_parents.insert(parent_hash, 0); } + } - // If there is already record, with smaller sensitivity, then replace it - let mut need_to_add = false; - let mut need_to_check = false; - - // TODO replace variables from above with closures - // If current checked hash, have parent, first we must check if similarity between them is lower than checked item - if let Some((current_parent_hash, current_similarity_with_parent)) = hashes_similarity.get(hash_to_check) { - if *current_similarity_with_parent > similarity { - need_to_check = true; - - *hashes_parents.get_mut(current_parent_hash).unwrap() -= 1; - hashes_similarity.remove(hash_to_check).unwrap(); - } - } else { - need_to_check = true; - } - - if need_to_check { - if let Some((other_parent_hash, other_similarity)) = hashes_similarity.get(other_hash) { - if *other_similarity > similarity { - need_to_add = true; - *hashes_parents.get_mut(other_parent_hash).unwrap() -= 1; - } - } - // But when there is no record, just add it - else { - need_to_add = true - } - } - - if need_to_add { - hashes_similarity.insert(other_hash, (hash_to_check, similarity)); - - if let Some(number_of_children) = hashes_parents.get_mut(hash_to_check) { - *number_of_children += 1; - } else { - hashes_parents.insert(hash_to_check, 1); - } - } - // ENND + for (hash_to_check, (compared_hash, similarity)) in partial_hashes_with_similarity { + image_to_check( + &mut hashes_parents, + &mut hashes_similarity, + &hashes_with_multiple_images, + hash_to_check, + compared_hash, + similarity, + ); } } #[cfg(debug_assertions)] debug_check_for_duplicated_things(hashes_parents.clone(), hashes_similarity.clone(), all_hashed_images.clone(), "LATTER"); - // Collecting results - + // Just simple check if all original hashes with multiple entries are available in end results + let original_hashes_at_start = hashes_with_multiple_images.len(); + let original_hashes_in_end_results = hashes_parents + .iter() + .filter(|(parent_hash, _child_number)| hashes_with_multiple_images.contains(*parent_hash)) + .count(); + assert_eq!(original_hashes_at_start, original_hashes_in_end_results); + // Collecting results to vector for (parent_hash, child_number) in hashes_parents { - if child_number > 0 { + // If hash contains other hasher OR multiple images are available for checked hash + if child_number > 0 || hashes_with_multiple_images.contains(parent_hash) { let vec_fe = all_hashed_images.get(parent_hash).unwrap().clone(); collected_similar_images.insert(parent_hash.clone(), vec_fe); } @@ -1040,6 +983,61 @@ impl SimilarImages { } } +fn image_to_check<'a>( + hashes_parents: &mut HashMap<&'a Vec, u32>, + hashes_similarity: &mut HashMap<&'a Vec, (&'a Vec, u32)>, + hashes_with_multiple_images: &HashSet<&'a Vec>, + hash_to_check: &'a Vec, + compared_hash: &'a Vec, + similarity: u32, +) { + if let Some(children_number) = hashes_parents.get(compared_hash) { + if *children_number > 0 || hashes_with_multiple_images.contains(compared_hash) { + return; + } + } + + // If there is already record, with smaller sensitivity, then replace it + let mut need_to_add = false; + let mut need_to_check = false; + + // TODO consider to replace variables from above with closures + // If current checked hash, have parent, first we must check if similarity between them is lower than checked item + if let Some((current_parent_hash, current_similarity_with_parent)) = hashes_similarity.get(hash_to_check) { + if *current_similarity_with_parent > similarity { + need_to_check = true; + + *hashes_parents.get_mut(current_parent_hash).unwrap() -= 1; + hashes_similarity.remove(hash_to_check).unwrap(); + } + } else { + need_to_check = true; + } + + if need_to_check { + if let Some((other_parent_hash, other_similarity)) = hashes_similarity.get(compared_hash) { + if *other_similarity > similarity { + need_to_add = true; + *hashes_parents.get_mut(other_parent_hash).unwrap() -= 1; + } + } + // But when there is no record, just add it + else { + need_to_add = true + } + } + + if need_to_add { + hashes_similarity.insert(compared_hash, (hash_to_check, similarity)); + + if let Some(number_of_children) = hashes_parents.get_mut(hash_to_check) { + *number_of_children += 1; + } else { + hashes_parents.insert(hash_to_check, 1); + } + } +} + impl Default for SimilarImages { fn default() -> Self { Self::new() @@ -1368,12 +1366,14 @@ fn debug_check_for_duplicated_things( all_hashed_images: HashMap, Vec>, numm: &str, ) { + let mut found_broken_thing = false; let mut hashmap_hashes: HashSet<_> = Default::default(); let mut hashmap_names: HashSet<_> = Default::default(); for (hash, number_of_children) in &hashes_parents { if *number_of_children > 0 { if hashmap_hashes.contains(*hash) { println!("------1--HASH--{} {:?}", numm, all_hashed_images.get(*hash).unwrap()); + found_broken_thing = true; } hashmap_hashes.insert(hash.to_vec()); @@ -1381,6 +1381,7 @@ fn debug_check_for_duplicated_things( let name = i.path.to_string_lossy().to_string(); if hashmap_names.contains(&name) { println!("------1--NAME--{} {:?}", numm, name); + found_broken_thing = true; } hashmap_names.insert(name); } @@ -1389,6 +1390,7 @@ fn debug_check_for_duplicated_things( for hash in hashes_similarity.keys() { if hashmap_hashes.contains(*hash) { println!("------2--HASH--{} {:?}", numm, all_hashed_images.get(*hash).unwrap()); + found_broken_thing = true; } hashmap_hashes.insert(hash.to_vec()); @@ -1396,8 +1398,13 @@ fn debug_check_for_duplicated_things( let name = i.path.to_string_lossy().to_string(); if hashmap_names.contains(&name) { println!("------2--NAME--{} {:?}", numm, name); + found_broken_thing = true; } hashmap_names.insert(name); } } + + if found_broken_thing { + panic!(); + } }