#pragma once

#include <llarp/router_id.hpp>
#include <llarp/util/time.hpp>

#include <chrono>
#include <optional>
#include <queue>
#include <random>
#include <unordered_set>
#include <vector>

namespace llarp
{
  struct AbstractRouter;
}

namespace llarp::consensus
{
  namespace detail
  {
    using clock_t = std::chrono::steady_clock;
    using time_point_t = std::chrono::time_point<clock_t>;

    // Returns std::greater on the std::get<N>(v)'th element value.
    template <size_t N, typename T>
    struct nth_greater
    {
      constexpr bool
      operator()(const T& lhs, const T& rhs) const
      {
        return std::greater<std::tuple_element_t<N, T>>{}(std::get<N>(lhs), std::get<N>(rhs));
      }
    };

    struct incoming_test_state
    {
      time_point_t last_test{};
      time_point_t last_whine{};
      bool was_failing = false;
    };
  }  // namespace detail

  using time_point_t = detail::time_point_t;
  using clock_t = detail::clock_t;

  // How often we tick the timer to check whether we need to do any tests.
  constexpr auto REACHABILITY_TESTING_TIMER_INTERVAL = 50ms;

  class reachability_testing
  {
   public:
    // Distribution for the seconds between node tests: we throw in some randomness to avoid
    // potential clustering of tests.  (Note that there is some granularity here as the test
    // timer only runs every REACHABILITY_TESTING_TIMER_INTERVAL).
    std::normal_distribution<float> TESTING_INTERVAL{10.0, 3.0};

    // The linear backoff after each consecutive test failure before we re-test.  Specifically we
    // schedule the next re-test for (TESTING_BACKOFF * previous_failures) + TESTING_INTERVAL(rng).
    inline static constexpr auto TESTING_BACKOFF = 10s;

    // The upper bound for the re-test interval.
    inline static constexpr auto TESTING_BACKOFF_MAX = 2min;

    // The maximum number of nodes that we will re-test at once (i.e. per
    // REACHABILITY_TESTING_TIMER_INTERVAL); mainly intended to throttle ourselves if, for
    // instance, our own connectivity loss makes us accumulate tons of nodes to test all at once.
    // (Despite the random intervals, this can happen if we also get decommissioned, during which
    // we can't test at all but still have lots of failing nodes we want to test right away when
    // we get recommissioned).
    inline static constexpr int MAX_RETESTS_PER_TICK = 4;

    // Maximum time without a ping before we start whining about it.
    //
    // We have a probability of about 0.368* of *not* getting pinged within a ping interval
    // (10s), and so the probability of not getting a ping for 2 minutes (i.e. 12 test spans)
    // just because we haven't been selected is extremely small (0.0000061).  It also coincides
    // nicely with blockchain time (i.e. two minutes) and our max testing backoff.
    //
    // * = approx value of ((n-1)/n)^n for non-tiny values of n
    inline static constexpr auto MAX_TIME_WITHOUT_PING = 2min;

    // How often we whine in the logs about being unreachable
    inline static constexpr auto WHINING_INTERVAL = 2min;

   private:
    // Queue of pubkeys of service nodes to test: we pop off the back of this until the queue
    // empties, then we refill it with a shuffled list of all pubkeys and pull off of it until it
    // is empty again, etc.
    std::vector<RouterID> testing_queue;

    // The next time for a general test
    time_point_t next_general_test = time_point_t::min();

    // When we started, so that we know to hold off on whining about missing pings for a while
    // after startup.
    const time_point_t startup = clock_t::now();

    // Pubkeys, next test times, and sequential failure counts of service nodes that are
    // currently in "failed" status, along with the last time they failed; we retest them first
    // after 10s, then back off linearly by an additional 10s, up to a max testing interval of
    // 2m30s, until we get a successful response.
    using FailingPK = std::tuple<RouterID, time_point_t, int>;
    std::priority_queue<FailingPK, std::vector<FailingPK>, detail::nth_greater<1, FailingPK>>
        failing_queue;
    std::unordered_set<RouterID> failing;

    // Track the last time *this node* was tested by other network nodes; used to detect and warn
    // about possible network issues.
    detail::incoming_test_state last;

   public:
    // If it is time to perform another random test, this pops the next node to test off the
    // testing queue and returns it, also updating the timer for the next test.  If it is not yet
    // time, or if the queue is empty and cannot currently be replenished, returns std::nullopt.
    // If the queue empties then this builds a new one by shuffling the current public keys in
    // the swarm's "all nodes", then starts using the new queue for this and subsequent calls.
    //
    // `requeue` is mainly for internal use: if false it avoids rebuilding the queue if we run
    // out (and instead just returns nullopt).
    std::optional<RouterID>
    next_random(
        AbstractRouter* router, const time_point_t& now = clock_t::now(), bool requeue = true);

    // Removes and returns up to MAX_RETESTS_PER_TICK nodes that are due to be tested (i.e.
    // next-testing-time <= now).  Returns [pubkey, #previous-failures] for each.
    std::vector<std::pair<RouterID, int>>
    get_failing(const time_point_t& now = clock_t::now());

    // Adds a bad node pubkey to the failing list, to be re-tested soon (with a backoff depending
    // on `previous_failures`; see TESTING_BACKOFF).  `previous_failures` should be the number of
    // previous failures *before* this one, i.e. 0 for a random general test, or the failure
    // count returned by `get_failing` for repeated failures.
    void
    add_failing_node(const RouterID& pk, int previous_failures = 0);

    /// removes the public key from the failing set
    void
    remove_node_from_failing(const RouterID& pk);

    // Called when this router receives an incoming session
    void
    incoming_ping(const time_point_t& now = clock_t::now());

    // Check whether we received incoming pings recently
    void
    check_incoming_tests(const time_point_t& now = clock_t::now());
  };
}  // namespace llarp::consensus
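
// Example usage (an illustrative sketch only, not part of this header): a router could drive
// this class from a timer that fires every REACHABILITY_TESTING_TIMER_INTERVAL, roughly as
// below.  The names `m_reachability`, `TestRouter`, and `HandleTestResult` are hypothetical:
//
//   void Router::TickReachabilityTests()
//   {
//     const auto now = consensus::clock_t::now();
//     // Maybe start one random general test:
//     if (auto pk = m_reachability.next_random(this, now))
//       TestRouter(*pk, /*previous_failures=*/0);
//     // Re-test up to MAX_RETESTS_PER_TICK nodes whose backoff has expired:
//     for (const auto& [pk, fails] : m_reachability.get_failing(now))
//       TestRouter(pk, fails);
//     // Whine (at most once per WHINING_INTERVAL) if nobody has pinged us lately:
//     m_reachability.check_incoming_tests(now);
//   }
//
//   void Router::HandleTestResult(const RouterID& pk, int previous_failures, bool success)
//   {
//     if (success)
//       m_reachability.remove_node_from_failing(pk);
//     else
//       m_reachability.add_failing_node(pk, previous_failures);
//   }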
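//
// For intuition, the re-test delay described by the TESTING_BACKOFF comment above works out to
// roughly the following (a sketch derived from the documented formula, not necessarily the
// exact implementation; `rng` is an assumed random engine):
//
//   using fseconds = std::chrono::duration<float>;
//   auto delay = std::chrono::duration_cast<consensus::clock_t::duration>(
//       previous_failures * reachability_testing::TESTING_BACKOFF
//       + fseconds{TESTING_INTERVAL(rng)});
//   // Cap the re-test interval at TESTING_BACKOFF_MAX:
//   delay = std::min(delay, consensus::clock_t::duration{reachability_testing::TESTING_BACKOFF_MAX});
//   // The node is then scheduled in failing_queue for re-test at (now + delay).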