lokinet/llarp/consensus/reachability_testing.cpp


#include "reachability_testing.hpp"
#include <chrono>
#include <llarp/router/abstractrouter.hpp>
#include <llarp/util/logging.hpp>
#include <llarp/crypto/crypto.hpp>

using std::chrono::steady_clock;

namespace llarp::consensus
{
  using fseconds = std::chrono::duration<float, std::chrono::seconds::period>;
  using fminutes = std::chrono::duration<float, std::chrono::minutes::period>;

  static void
  check_incoming_tests_impl(
      std::string_view name,
      const time_point_t& now,
      const time_point_t& startup,
      detail::incoming_test_state& incoming)
  {
    const auto elapsed = now - std::max(startup, incoming.last_test);
    bool failing = elapsed > reachability_testing::MAX_TIME_WITHOUT_PING;
    bool whine = failing != incoming.was_failing
        || (failing && now - incoming.last_whine > reachability_testing::WHINING_INTERVAL);

    incoming.was_failing = failing;

    if (whine)
    {
      incoming.last_whine = now;
      if (!failing)
      {
        LogInfo(name, " ping received; port is likely reachable again");
      }
      else
      {
        if (incoming.last_test.time_since_epoch() == 0s)
        {
          LogWarn("Have NEVER received ", name, " pings!");
        }
        else
        {
          LogWarn(
              "Have not received ",
              name,
              " pings for a long time: ",
              fminutes{elapsed}.count(),
              " minutes");
        }
        LogWarn(
            "Please check your ",
            name,
            " port. Not being reachable "
            "over ",
            name,
            " may result in a deregistration!");
      }
    }
  }

  void
  reachability_testing::check_incoming_tests(const time_point_t& now)
  {
    check_incoming_tests_impl("lokinet", now, startup, last);
  }

  void
  reachability_testing::incoming_ping(const time_point_t& now)
  {
    last.last_test = now;
  }

  std::optional<RouterID>
  reachability_testing::next_random(AbstractRouter* router, const time_point_t& now, bool requeue)
  {
    if (next_general_test > now)
      return std::nullopt;
    CSRNG rng;
    next_general_test =
        now + std::chrono::duration_cast<time_point_t::duration>(fseconds(TESTING_INTERVAL(rng)));

    // Pull the next element off the queue, but skip ourself, any that are no longer registered, and
    // any that are currently known to be failing (those are queued for testing separately).
    RouterID my_pk{router->pubkey()};
    while (!testing_queue.empty())
    {
      auto& pk = testing_queue.back();
      std::optional<RouterID> sn;
      if (pk != my_pk && !failing.count(pk))
        sn = pk;
      testing_queue.pop_back();
      if (sn)
        return sn;
    }
    if (!requeue)
      return std::nullopt;

    // FIXME: when a *new* node comes online we need to inject it into a random position in the SN
    // list with probability (L/N) [L = current list size, N = potential list size]
    //
    // (FIXME: put this FIXME in a better place ;-) )

    // We exhausted the queue so repopulate it and try again

    testing_queue.clear();
    const auto all = router->GetRouterWhitelist();
    testing_queue.insert(testing_queue.begin(), all.begin(), all.end());

    std::shuffle(testing_queue.begin(), testing_queue.end(), rng);

    // Recurse with the rebuilt list, but don't let it try rebuilding again
    return next_random(router, now, false);
  }

  std::vector<std::pair<RouterID, int>>
  reachability_testing::get_failing(const time_point_t& now)
  {
    // Our failing_queue puts the oldest retest times at the top, so pop them off into our result
    // until the top node should be retested sometime in the future
    std::vector<std::pair<RouterID, int>> result;
    while (result.size() < MAX_RETESTS_PER_TICK && !failing_queue.empty())
    {
      auto& [pk, retest_time, failures] = failing_queue.top();
      if (retest_time > now)
        break;
      if (failing.count(pk))
        result.emplace_back(pk, failures);
      failing_queue.pop();
    }
    return result;
  }

  void
  reachability_testing::add_failing_node(const RouterID& pk, int previous_failures)
  {
    using namespace std::chrono;

    if (previous_failures < 0)
      previous_failures = 0;
    CSRNG rng;
    auto next_test_in = duration_cast<time_point_t::duration>(
        previous_failures * TESTING_BACKOFF + fseconds{TESTING_INTERVAL(rng)});
    if (next_test_in > TESTING_BACKOFF_MAX)
      next_test_in = TESTING_BACKOFF_MAX;

    failing.insert(pk);
    failing_queue.emplace(pk, steady_clock::now() + next_test_in, previous_failures + 1);
  }

  void
  reachability_testing::remove_node_from_failing(const RouterID& pk)
  {
    failing.erase(pk);
  }

}  // namespace llarp::consensus