mirror of
https://github.com/oxen-io/lokinet.git
synced 2024-11-15 12:13:24 +00:00
1ddfb7420a
exclude fully failed nodes from service node testing list
158 lines
4.6 KiB
C++
158 lines
4.6 KiB
C++
|
|
#include "reachability_testing.hpp"
|
|
#include <chrono>
|
|
#include <llarp/router/abstractrouter.hpp>
|
|
#include <llarp/util/logging/logger.hpp>
|
|
#include <llarp/crypto/crypto.hpp>
|
|
|
|
using std::chrono::steady_clock;
|
|
|
|
namespace llarp::consensus
|
|
{
|
|
// Floating-point durations used in this file for interval arithmetic and for printing elapsed
// time in log messages.
//
// fseconds: float count of seconds (std::chrono::duration's default period is one second).
using fseconds = std::chrono::duration<float>;
// fminutes: float count of minutes (a 60-second period).
using fminutes = std::chrono::duration<float, std::ratio<60>>;
|
|
|
|
static void
|
|
check_incoming_tests_impl(
|
|
std::string_view name,
|
|
const time_point_t& now,
|
|
const time_point_t& startup,
|
|
detail::incoming_test_state& incoming)
|
|
{
|
|
const auto elapsed = now - std::max(startup, incoming.last_test);
|
|
bool failing = elapsed > reachability_testing::MAX_TIME_WITHOUT_PING;
|
|
bool whine = failing != incoming.was_failing
|
|
|| (failing && now - incoming.last_whine > reachability_testing::WHINING_INTERVAL);
|
|
|
|
incoming.was_failing = failing;
|
|
|
|
if (whine)
|
|
{
|
|
incoming.last_whine = now;
|
|
if (!failing)
|
|
{
|
|
LogInfo(name, " ping received; port is likely reachable again");
|
|
}
|
|
else
|
|
{
|
|
if (incoming.last_test.time_since_epoch() == 0s)
|
|
{
|
|
LogWarn("Have NEVER received ", name, " pings!");
|
|
}
|
|
else
|
|
{
|
|
LogWarn(
|
|
"Have not received ",
|
|
name,
|
|
" pings for a long time: ",
|
|
fminutes{elapsed}.count(),
|
|
" minutes");
|
|
}
|
|
LogWarn(
|
|
"Please check your ",
|
|
name,
|
|
" port. Not being reachable "
|
|
"over ",
|
|
name,
|
|
" may result in a deregistration!");
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
reachability_testing::check_incoming_tests(const time_point_t& now)
|
|
{
|
|
check_incoming_tests_impl("lokinet", now, startup, last);
|
|
}
|
|
|
|
void
|
|
reachability_testing::incoming_ping(const time_point_t& now)
|
|
{
|
|
last.last_test = now;
|
|
}
|
|
|
|
std::optional<RouterID>
|
|
reachability_testing::next_random(AbstractRouter* router, const time_point_t& now, bool requeue)
|
|
{
|
|
if (next_general_test > now)
|
|
return std::nullopt;
|
|
CSRNG rng;
|
|
next_general_test =
|
|
now + std::chrono::duration_cast<time_point_t::duration>(fseconds(TESTING_INTERVAL(rng)));
|
|
|
|
// Pull the next element off the queue, but skip ourself, any that are no longer registered, and
|
|
// any that are currently known to be failing (those are queued for testing separately).
|
|
RouterID my_pk{router->pubkey()};
|
|
while (!testing_queue.empty())
|
|
{
|
|
auto& pk = testing_queue.back();
|
|
std::optional<RouterID> sn;
|
|
if (pk != my_pk && !failing.count(pk))
|
|
sn = pk;
|
|
testing_queue.pop_back();
|
|
if (sn)
|
|
return sn;
|
|
}
|
|
if (!requeue)
|
|
return std::nullopt;
|
|
|
|
// FIXME: when a *new* node comes online we need to inject it into a random position in the SN
|
|
// list with probability (L/N) [L = current list size, N = potential list size]
|
|
//
|
|
// (FIXME: put this FIXME in a better place ;-) )
|
|
|
|
// We exhausted the queue so repopulate it and try again
|
|
|
|
testing_queue.clear();
|
|
const auto all = router->GetRouterWhitelist();
|
|
testing_queue.insert(testing_queue.begin(), all.begin(), all.end());
|
|
|
|
std::shuffle(testing_queue.begin(), testing_queue.end(), rng);
|
|
|
|
// Recurse with the rebuilt list, but don't let it try rebuilding again
|
|
return next_random(router, now, false);
|
|
}
|
|
|
|
std::vector<std::pair<RouterID, int>>
|
|
reachability_testing::get_failing(const time_point_t& now)
|
|
{
|
|
// Our failing_queue puts the oldest retest times at the top, so pop them off into our result
|
|
// until the top node should be retested sometime in the future
|
|
std::vector<std::pair<RouterID, int>> result;
|
|
while (result.size() < MAX_RETESTS_PER_TICK && !failing_queue.empty())
|
|
{
|
|
auto& [pk, retest_time, failures] = failing_queue.top();
|
|
if (retest_time > now)
|
|
break;
|
|
if (failing.count(pk))
|
|
result.emplace_back(pk, failures);
|
|
failing_queue.pop();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void
|
|
reachability_testing::add_failing_node(const RouterID& pk, int previous_failures)
|
|
{
|
|
using namespace std::chrono;
|
|
|
|
if (previous_failures < 0)
|
|
previous_failures = 0;
|
|
CSRNG rng;
|
|
auto next_test_in = duration_cast<time_point_t::duration>(
|
|
previous_failures * TESTING_BACKOFF + fseconds{TESTING_INTERVAL(rng)});
|
|
if (next_test_in > TESTING_BACKOFF_MAX)
|
|
next_test_in = TESTING_BACKOFF_MAX;
|
|
|
|
failing.insert(pk);
|
|
failing_queue.emplace(pk, steady_clock::now() + next_test_in, previous_failures + 1);
|
|
}
|
|
|
|
void
|
|
reachability_testing::remove_node_from_failing(const RouterID& pk)
|
|
{
|
|
failing.erase(pk);
|
|
}
|
|
|
|
} // namespace llarp::consensus
|