lokinet/llarp/consensus/reachability_testing.cpp
Jason Rhinelander 1ddfb7420a
Merge pull request #1791 from majestrate/exclude-failing-from-queue-2021-10-29
exclude fully failed nodes from service node testing list
2021-10-29 14:44:16 -03:00

158 lines
4.6 KiB
C++

#include "reachability_testing.hpp"
#include <chrono>
#include <llarp/router/abstractrouter.hpp>
#include <llarp/util/logging/logger.hpp>
#include <llarp/crypto/crypto.hpp>
using std::chrono::steady_clock;
namespace llarp::consensus
{
using fseconds = std::chrono::duration<float, std::chrono::seconds::period>;
using fminutes = std::chrono::duration<float, std::chrono::minutes::period>;
static void
check_incoming_tests_impl(
std::string_view name,
const time_point_t& now,
const time_point_t& startup,
detail::incoming_test_state& incoming)
{
const auto elapsed = now - std::max(startup, incoming.last_test);
bool failing = elapsed > reachability_testing::MAX_TIME_WITHOUT_PING;
bool whine = failing != incoming.was_failing
|| (failing && now - incoming.last_whine > reachability_testing::WHINING_INTERVAL);
incoming.was_failing = failing;
if (whine)
{
incoming.last_whine = now;
if (!failing)
{
LogInfo(name, " ping received; port is likely reachable again");
}
else
{
if (incoming.last_test.time_since_epoch() == 0s)
{
LogWarn("Have NEVER received ", name, " pings!");
}
else
{
LogWarn(
"Have not received ",
name,
" pings for a long time: ",
fminutes{elapsed}.count(),
" minutes");
}
LogWarn(
"Please check your ",
name,
" port. Not being reachable "
"over ",
name,
" may result in a deregistration!");
}
}
}
void
reachability_testing::check_incoming_tests(const time_point_t& now)
{
check_incoming_tests_impl("lokinet", now, startup, last);
}
void
reachability_testing::incoming_ping(const time_point_t& now)
{
last.last_test = now;
}
std::optional<RouterID>
reachability_testing::next_random(AbstractRouter* router, const time_point_t& now, bool requeue)
{
if (next_general_test > now)
return std::nullopt;
CSRNG rng;
next_general_test =
now + std::chrono::duration_cast<time_point_t::duration>(fseconds(TESTING_INTERVAL(rng)));
// Pull the next element off the queue, but skip ourself, any that are no longer registered, and
// any that are currently known to be failing (those are queued for testing separately).
RouterID my_pk{router->pubkey()};
while (!testing_queue.empty())
{
auto& pk = testing_queue.back();
std::optional<RouterID> sn;
if (pk != my_pk && !failing.count(pk))
sn = pk;
testing_queue.pop_back();
if (sn)
return sn;
}
if (!requeue)
return std::nullopt;
// FIXME: when a *new* node comes online we need to inject it into a random position in the SN
// list with probability (L/N) [L = current list size, N = potential list size]
//
// (FIXME: put this FIXME in a better place ;-) )
// We exhausted the queue so repopulate it and try again
testing_queue.clear();
const auto all = router->GetRouterWhitelist();
testing_queue.insert(testing_queue.begin(), all.begin(), all.end());
std::shuffle(testing_queue.begin(), testing_queue.end(), rng);
// Recurse with the rebuilt list, but don't let it try rebuilding again
return next_random(router, now, false);
}
std::vector<std::pair<RouterID, int>>
reachability_testing::get_failing(const time_point_t& now)
{
// Our failing_queue puts the oldest retest times at the top, so pop them off into our result
// until the top node should be retested sometime in the future
std::vector<std::pair<RouterID, int>> result;
while (result.size() < MAX_RETESTS_PER_TICK && !failing_queue.empty())
{
auto& [pk, retest_time, failures] = failing_queue.top();
if (retest_time > now)
break;
if (failing.count(pk))
result.emplace_back(pk, failures);
failing_queue.pop();
}
return result;
}
void
reachability_testing::add_failing_node(const RouterID& pk, int previous_failures)
{
using namespace std::chrono;
if (previous_failures < 0)
previous_failures = 0;
CSRNG rng;
auto next_test_in = duration_cast<time_point_t::duration>(
previous_failures * TESTING_BACKOFF + fseconds{TESTING_INTERVAL(rng)});
if (next_test_in > TESTING_BACKOFF_MAX)
next_test_in = TESTING_BACKOFF_MAX;
failing.insert(pk);
failing_queue.emplace(pk, steady_clock::now() + next_test_in, previous_failures + 1);
}
void
reachability_testing::remove_node_from_failing(const RouterID& pk)
{
failing.erase(pk);
}
} // namespace llarp::consensus