diff --git a/llarp/peerstats/peer_db.cpp b/llarp/peerstats/peer_db.cpp index f1f62ee9c..8b3cb26ab 100644 --- a/llarp/peerstats/peer_db.cpp +++ b/llarp/peerstats/peer_db.cpp @@ -145,32 +145,85 @@ namespace llarp return itr->second; } + /// Assume we receive an RC at some point `R` in time which was signed at some point `S` in time + /// and expires at some point `E` in time, as depicted below: + /// + /// +-----------------------------+ + /// | signed rc | <- useful lifetime of RC + /// +-----------------------------+ + /// ^ [ . . . . . . . . ] <----------- window in which we receive this RC gossiped to us + /// | ^ ^ + /// | | | + /// S R E + /// + /// One useful metric from this is the difference between (E - R), the useful contact time of this + /// RC. As we track this metric over time, the high and low watermarks serve to tell us how + /// quickly we receive signed RCs from a given router and how close to expiration they are when + /// we receive them. The latter is particularly useful, and should always be a positive number for + /// a healthy router. A negative number indicates that we are receiving an expired RC. + /// + /// TODO: we actually discard expired RCs, so we currently would not detect a negative value for + /// (E - R) + /// + /// Another related metric is the distance between a newly received RC and the previous RC's + /// expiration, which represents how close we came to having no useful RC to work with. This + /// should be a high (positive) number for a healthy router, and if negative indicates that we + /// had no way to contact this router for a period of time. + /// + /// E1 E2 E3 + /// | | | + /// v | | + /// +-----------------------------+ | | + /// | signed rc 1 | | | + /// +-----------------------------+ | | + /// [ . . . . . ] v | + /// ^ +-----------------------------+ | + /// | | signed rc 2 | | + /// | +-----------------------------+ | + /// | [ . . . . . . . . . . ] v + /// | ^ +-----------------------------+ + /// | | | signed rc 3 | + /// | | +-----------------------------+ + /// | | [ . . ] + /// | | ^ + /// | | | + /// R1 R2 R3 + /// + /// Example: the delta between (E1 - R2) is healthy, but the delta between (E2 - R3) is indicates + /// that we had a brief period of time where we had no valid (non-expired) RC for this router + /// (because it is negative). void PeerDb::handleGossipedRC(const RouterContact& rc, llarp_time_t now) { std::lock_guard gaurd(m_statsLock); + LogWarn("Handling gossiped RC", rc); + RouterID id(rc.pubkey); auto& stats = m_peerStats[id]; stats.routerId = id.ToString(); - if (stats.lastRCUpdated < rc.last_updated.count()) + const bool isNewRC = (stats.lastRCUpdated < rc.last_updated.count()); + + if (isNewRC) { - if (stats.numDistinctRCsReceived > 0) + stats.numDistinctRCsReceived++; + + if (stats.numDistinctRCsReceived > 1) { - // we track max expiry as the delta between (time received - last expiration time), - // and this value will often be negative for a healthy router + const int64_t prevRCExpiration = (stats.lastRCUpdated + RouterContact::Lifetime.count()); + + // we track max expiry as the delta between (last expiration time - time received), + // and this value will be negative for an unhealthy router // TODO: handle case where new RC is also expired? just ignore? - int64_t expiry = (now.count() - (stats.lastRCUpdated + RouterContact::Lifetime.count())); - stats.mostExpiredRCMs = std::max(stats.mostExpiredRCMs, expiry); + int64_t expiry = (prevRCExpiration - now.count()); - if (stats.numDistinctRCsReceived == 1) + if (stats.numDistinctRCsReceived == 2) stats.mostExpiredRCMs = expiry; else - stats.mostExpiredRCMs = std::max(stats.mostExpiredRCMs, expiry); + stats.mostExpiredRCMs = std::min(stats.mostExpiredRCMs, expiry); } - stats.numDistinctRCsReceived++; stats.lastRCUpdated = rc.last_updated.count(); stats.stale = true; } @@ -206,10 +259,8 @@ namespace llarp std::vector statsObjs; statsObjs.reserve(m_peerStats.size()); - LogInfo("Building peer stats..."); for (const auto& pair : m_peerStats) { - LogInfo("Stat here"); statsObjs.push_back(pair.second.toJson()); } diff --git a/test/peerstats/test_peer_db.cpp b/test/peerstats/test_peer_db.cpp index d6d2f5918..414b49fda 100644 --- a/test/peerstats/test_peer_db.cpp +++ b/test/peerstats/test_peer_db.cpp @@ -3,6 +3,7 @@ #include #include +#include "peerstats/types.hpp" #include "router_contact.hpp" #include "util/time.hpp" @@ -147,8 +148,70 @@ TEST_CASE("Test PeerDb handleGossipedRC", "[PeerDb]") db.handleGossipedRC(rc, now); stats = db.getCurrentPeerStats(id); - // new RC received at 9sec, making it (expiration time - 9 sec) expired (a negative number) - CHECK(stats.value().mostExpiredRCMs == (9s - (now + rcLifetime)).count()); + // should be (previous expiration time - new received time) + CHECK(stats.value().mostExpiredRCMs == ((10s + rcLifetime) - now).count()); CHECK(stats.value().numDistinctRCsReceived == 2); CHECK(stats.value().lastRCUpdated == 11000); } + +TEST_CASE("Test PeerDb handleGossipedRC expiry calcs", "[PeerDb]") +{ + const llarp::RouterID id = llarp::test::makeBuf(0xF9); + + // see comments in peer_db.cpp above PeerDb::handleGossipedRC() for some context around these + // tests and esp. these numbers + const llarp_time_t ref = 48h; + const llarp_time_t rcLifetime = llarp::RouterContact::Lifetime; + + // rc1, first rc received + const llarp_time_t s1 = ref; + const llarp_time_t r1 = s1 + 30s; + const llarp_time_t e1 = s1 + rcLifetime; + llarp::RouterContact rc1; + rc1.pubkey = llarp::PubKey(id); + rc1.last_updated = s1; + + // rc2, second rc received + // received "healthily", with lots of room to spare before rc1 expires + const llarp_time_t s2 = s1 + 8h; + const llarp_time_t r2 = s2 + 30s; // healthy recv time + const llarp_time_t e2 = s2 + rcLifetime; + llarp::RouterContact rc2; + rc2.pubkey = llarp::PubKey(id); + rc2.last_updated = s2; + + // rc3, third rc received + // received "unhealthily" (after rc2 expires) + const llarp_time_t s3 = s2 + 8h; + const llarp_time_t r3 = e2 + 1h; // received after e2 + const llarp_time_t e3 = s3 + rcLifetime; + llarp::RouterContact rc3; + rc3.pubkey = llarp::PubKey(id); + rc3.last_updated = s3; + + llarp::PeerDb db; + + db.handleGossipedRC(rc1, r1); + auto stats1 = db.getCurrentPeerStats(id); + CHECK(stats1.has_value()); + CHECK(stats1.value().mostExpiredRCMs == 0); + CHECK(stats1.value().numDistinctRCsReceived == 1); + CHECK(stats1.value().lastRCUpdated == s1.count()); + + db.handleGossipedRC(rc2, r2); + auto stats2 = db.getCurrentPeerStats(id); + CHECK(stats2.has_value()); + CHECK(stats2.value().mostExpiredRCMs == (e1 - r2).count()); + CHECK(stats2.value().mostExpiredRCMs > 0); // ensure positive indicates healthy + CHECK(stats2.value().numDistinctRCsReceived == 2); + CHECK(stats2.value().lastRCUpdated == s2.count()); + + db.handleGossipedRC(rc3, r3); + auto stats3 = db.getCurrentPeerStats(id); + CHECK(stats3.has_value()); + CHECK(stats3.value().mostExpiredRCMs == (e2 - r3).count()); + CHECK( + stats3.value().mostExpiredRCMs < 0); // ensure negative indicates unhealthy and we use min() + CHECK(stats3.value().numDistinctRCsReceived == 3); + CHECK(stats3.value().lastRCUpdated == s3.count()); +}