#include "reachability_testing.hpp" #include #include #include #include using std::chrono::steady_clock; namespace llarp::consensus { using fseconds = std::chrono::duration; using fminutes = std::chrono::duration; static void check_incoming_tests_impl( std::string_view name, const time_point_t& now, const time_point_t& startup, detail::incoming_test_state& incoming) { const auto elapsed = now - std::max(startup, incoming.last_test); bool failing = elapsed > reachability_testing::MAX_TIME_WITHOUT_PING; bool whine = failing != incoming.was_failing || (failing && now - incoming.last_whine > reachability_testing::WHINING_INTERVAL); incoming.was_failing = failing; if (whine) { incoming.last_whine = now; if (!failing) { LogInfo(name, " ping received; port is likely reachable again"); } else { if (incoming.last_test.time_since_epoch() == 0s) { LogWarn("Have NEVER received ", name, " pings!"); } else { LogWarn( "Have not received ", name, " pings for a long time: ", fminutes{elapsed}.count(), " minutes"); } LogWarn( "Please check your ", name, " port. Not being reachable " "over ", name, " may result in a deregistration!"); } } } void reachability_testing::check_incoming_tests(const time_point_t& now) { check_incoming_tests_impl("lokinet", now, startup, last); } void reachability_testing::incoming_ping(const time_point_t& now) { last.last_test = now; } std::optional reachability_testing::next_random(AbstractRouter* router, const time_point_t& now, bool requeue) { if (next_general_test > now) return std::nullopt; CSRNG rng; next_general_test = now + std::chrono::duration_cast(fseconds(TESTING_INTERVAL(rng))); // Pull the next element off the queue, but skip ourself, any that are no longer registered, and // any that are currently known to be failing (those are queued for testing separately). RouterID my_pk{router->pubkey()}; while (!testing_queue.empty()) { auto& pk = testing_queue.back(); std::optional sn; if (pk != my_pk && !failing.count(pk)) sn = pk; testing_queue.pop_back(); if (sn) return sn; } if (!requeue) return std::nullopt; // FIXME: when a *new* node comes online we need to inject it into a random position in the SN // list with probability (L/N) [L = current list size, N = potential list size] // // (FIXME: put this FIXME in a better place ;-) ) // We exhausted the queue so repopulate it and try again testing_queue.clear(); const auto all = router->GetRouterWhitelist(); testing_queue.insert(testing_queue.begin(), all.begin(), all.end()); std::shuffle(testing_queue.begin(), testing_queue.end(), rng); // Recurse with the rebuilt list, but don't let it try rebuilding again return next_random(router, now, false); } std::vector> reachability_testing::get_failing(const time_point_t& now) { // Our failing_queue puts the oldest retest times at the top, so pop them off into our result // until the top node should be retested sometime in the future std::vector> result; while (result.size() < MAX_RETESTS_PER_TICK && !failing_queue.empty()) { auto& [pk, retest_time, failures] = failing_queue.top(); if (retest_time > now) break; if (failing.count(pk)) result.emplace_back(pk, failures); failing_queue.pop(); } return result; } void reachability_testing::add_failing_node(const RouterID& pk, int previous_failures) { using namespace std::chrono; if (previous_failures < 0) previous_failures = 0; CSRNG rng; auto next_test_in = duration_cast( previous_failures * TESTING_BACKOFF + fseconds{TESTING_INTERVAL(rng)}); if (next_test_in > TESTING_BACKOFF_MAX) next_test_in = TESTING_BACKOFF_MAX; failing.insert(pk); failing_queue.emplace(pk, steady_clock::now() + next_test_in, previous_failures + 1); } void reachability_testing::remove_node_from_failing(const RouterID& pk) { failing.erase(pk); } } // namespace llarp::consensus