You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
lokinet/llarp/consensus/reachability_testing.hpp

146 lines
6.4 KiB
C++

#pragma once
#include <llarp/router_id.hpp>
#include <llarp/util/time.hpp>
#include <chrono>
#include <cstddef>
#include <functional>
#include <optional>
#include <queue>
#include <random>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
// Forward declaration only: this header refers to Router solely through a pointer parameter
// (see reachability_testing::next_random), so the full definition is not needed here.
namespace llarp
{
struct Router;
}
namespace llarp::consensus
{
namespace detail
{
    // Clock used for all reachability-test scheduling.  steady_clock never jumps backwards, so
    // intervals measured with it are unaffected by wall-clock adjustments.
    using clock_t = std::chrono::steady_clock;
    // A point in time as measured by `clock_t`.
    using time_point_t = std::chrono::time_point<clock_t>;

    // Comparator that applies std::greater to the N-th (std::get<N>) element of two tuple-like
    // values of type T, ignoring every other element.  Supplying it as the Compare of a
    // std::priority_queue puts the entry with the *smallest* N-th element on top.
    //
    // T must support std::get<N> and std::tuple_element_t<N, T> (e.g. std::tuple, std::pair).
    template <typename T, std::size_t N>
    struct nth_greater
    {
        // Returns true iff the N-th element of `lhs` is strictly greater than that of `rhs`.
        constexpr bool operator()(const T& lhs, const T& rhs) const
        {
            return std::greater<std::tuple_element_t<N, T>>{}(
                std::get<N>(lhs), std::get<N>(rhs));
        }
    };

    // Bookkeeping about tests made *of this node*.  Both time points are value-initialised,
    // i.e. they start at the clock's epoch.
    struct incoming_test_state
    {
        // Time of the most recent incoming test (presumably recorded by
        // reachability_testing::incoming_ping -- confirm in the implementation).
        time_point_t last_test{};
        // Time we last complained about not being tested (presumably rate-limited by
        // reachability_testing::WHINING_INTERVAL -- confirm in the implementation).
        time_point_t last_whine{};
        // Whether the previous check considered us unreachable (assumption -- confirm in the
        // implementation).
        bool was_failing = false;
    };
}  // namespace detail
// Re-export the clock types so code in llarp::consensus can name them without `detail::`.
using time_point_t = detail::time_point_t;
using clock_t = detail::clock_t;
// How often we tick the timer to check whether we need to do any tests.  Declared `inline` so
// that every translation unit including this header shares one definition instead of getting
// its own internal-linkage copy.
inline constexpr auto REACHABILITY_TESTING_TIMER_INTERVAL = 50ms;
// Declares the state and operations used to schedule reachability tests of other routers (a
// shuffled general-test queue plus a backoff queue of failing nodes) and to track whether this
// node is itself being tested by others.
class reachability_testing
{
public:
// Distribution for the seconds between node tests (mean 10, standard deviation 3): we throw in
// some randomness to avoid potential clustering of tests. (Note that there is some granularity
// here as the test timer only runs every REACHABILITY_TESTING_TIMER_INTERVAL).
std::normal_distribution<float> TESTING_INTERVAL{10.0, 3.0};
// The linear backoff after each consecutive test failure before we re-test. Specifically
// we schedule the next re-test for (TESTING_BACKOFF*previous_failures) +
// TESTING_INTERVAL(rng).
inline static constexpr auto TESTING_BACKOFF = 10s;
// The upper bound for the re-test interval.
inline static constexpr auto TESTING_BACKOFF_MAX = 2min;
// The maximum number of nodes that we will re-test at once (i.e. per
// REACHABILITY_TESTING_TIMER_INTERVAL); mainly intended to throttle ourselves if, for
// instance, our own connectivity loss makes us accumulate tons of nodes to test all at once.
// (Despite the random intervals, this can happen if we also get decommissioned during which we
// can't test at all but still have lots of failing nodes we want to test right away when we
// get recommissioned).
inline static constexpr int MAX_RETESTS_PER_TICK = 4;
// Maximum time without a ping before we start whining about it.
//
// We have a probability of about 0.368* of *not* getting pinged within a ping interval
// (10s), and so the probability of not getting a ping for 2 minutes (i.e. 12 test spans)
// just because we haven't been selected is extremely small (0.0000061). It also coincides
// nicely with blockchain time (i.e. two minutes) and our max testing backoff.
//
// * = approx value of ((n-1)/n)^n for non-tiny values of n
inline static constexpr auto MAX_TIME_WITHOUT_PING = 2min;
// How often we whine in the logs about being unreachable.
inline static constexpr auto WHINING_INTERVAL = 2min;
private:
// Queue of pubkeys of service nodes to test; we pop off the back of this until the queue
// empties, then we refill it with a shuffled list of all pubkeys and pull off of it until
// it is empty again, etc.
std::vector<RouterID> testing_queue;
// The next time for a general test. Starts at the minimum representable time point
// (presumably so that the first general test is due immediately -- confirm in next_random).
time_point_t next_general_test = time_point_t::min();
// Time at which this object was constructed; intended to let us hold off on whining about
// missing pings for a while after startup.
const time_point_t startup = clock_t::now();
// Pubkeys, next test times, and sequential failure counts of service nodes that are
// currently in "failed" status. The queue is ordered on the next test time (tuple element 1)
// using nth_greater, so the entry that is due soonest is on top. Failing nodes are re-tested
// with a linear backoff (see TESTING_BACKOFF) up to a maximum re-test interval (see
// TESTING_BACKOFF_MAX) until we get a successful response.
using FailingPK = std::tuple<RouterID, time_point_t, int>;
std::priority_queue<FailingPK, std::vector<FailingPK>, detail::nth_greater<FailingPK, 1>>
failing_queue;
// Pubkeys currently considered failing (presumably kept alongside failing_queue to allow
// lookup/removal by pubkey, which a priority_queue does not offer -- confirm in the
// implementation).
std::unordered_set<RouterID> failing;
// Track the last time *this node* was tested by other network nodes; used to detect and
// warn about possible network issues.
detail::incoming_test_state last;
public:
// If it is time to perform another random test, this returns the next node to test from the
// testing queue, also updating the timer for the next test. If it is not yet time, or if the
// queue is empty and cannot currently be replenished, returns std::nullopt. If the queue
// empties then this builds a new one by shuffling current public keys in the swarm's "all
// nodes" then starts using the new queue for this and subsequent calls.
//
// `requeue` is mainly for internal use: if false it avoids rebuilding the queue if we run
// out (and instead just returns nullopt).
std::optional<RouterID> next_random(
Router* router, const time_point_t& now = clock_t::now(), bool requeue = true);
// Removes and returns up to MAX_RETESTS_PER_TICK nodes that are due to be tested (i.e.
// next-testing-time <= now). Returns [pubkey, #previous-failures] for each.
std::vector<std::pair<RouterID, int>> get_failing(const time_point_t& now = clock_t::now());
// Adds a bad node pubkey to the failing list, to be re-tested soon (with a backoff
// depending on `previous_failures`; see TESTING_BACKOFF). `previous_failures` should be the
// number of previous failures *before* this one, i.e. 0 for a random general test; or the
// failure count returned by `get_failing` for repeated failures.
void add_failing_node(const RouterID& pk, int previous_failures = 0);
// Removes the public key from the failing set.
void remove_node_from_failing(const RouterID& pk);
// Called when this router receives an incoming session.
void incoming_ping(const time_point_t& now = clock_t::now());
// Check whether we received incoming pings recently (presumably warning, at most once per
// WHINING_INTERVAL, when none arrived within MAX_TIME_WITHOUT_PING -- confirm in the
// implementation).
void check_incoming_tests(const time_point_t& now = clock_t::now());
};
} // namespace llarp::consensus