diff --git a/llarp/dht/context.cpp b/llarp/dht/context.cpp index d2b0e7129..06c67953d 100644 --- a/llarp/dht/context.cpp +++ b/llarp/dht/context.cpp @@ -441,7 +441,7 @@ namespace llarp Context::CleanupTX() { auto now = Now(); - llarp::LogDebug("DHT tick"); + llarp::LogTrace("DHT tick"); pendingRouterLookups().Expire(now); _pendingIntrosetLookups.Expire(now); diff --git a/llarp/router/abstractrouter.hpp b/llarp/router/abstractrouter.hpp index 022e6767a..7f6a368a7 100644 --- a/llarp/router/abstractrouter.hpp +++ b/llarp/router/abstractrouter.hpp @@ -199,14 +199,12 @@ namespace llarp virtual bool IsServiceNode() const = 0; - virtual bool - IsActiveServiceNode() const = 0; - - /// If we are running as a service node and appear active, i.e. registered and not - /// decommissioned, we should *not* ping core if we know of too few peers, to indicate to core - /// we are not in a good state. - virtual bool - ShouldPingOxen() const = 0; + /// Called to determine if we're in a bad state (which gets reported to our oxend) that should + /// prevent uptime proofs from going out to the network (so that the error state gets noticed). + /// Currently this means we require a decent number of peers whenever we are fully staked + /// (active or decommed). + virtual std::optional + OxendErrorState() const = 0; virtual bool StartRpcServer() = 0; @@ -315,7 +313,9 @@ namespace llarp /// set router's service node whitelist virtual void SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) = 0; + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& unfundedlist) = 0; virtual std::unordered_set GetRouterWhitelist() const = 0; diff --git a/llarp/router/i_rc_lookup_handler.hpp b/llarp/router/i_rc_lookup_handler.hpp index ffc5a3a4b..4efee04c6 100644 --- a/llarp/router/i_rc_lookup_handler.hpp +++ b/llarp/router/i_rc_lookup_handler.hpp @@ -34,7 +34,9 @@ namespace llarp virtual void SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) = 0; + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& greenlist) = 0; virtual void GetRC(const RouterID& router, RCRequestCallback callback, bool forceLookup = false) = 0; @@ -48,6 +50,12 @@ namespace llarp virtual bool IsGreylisted(const RouterID& remote) const = 0; + virtual bool + IsGreenlisted(const RouterID& remote) const = 0; + + virtual bool + IsRegistered(const RouterID& remote) const = 0; + virtual bool CheckRC(const RouterContact& rc) const = 0; diff --git a/llarp/router/rc_lookup_handler.cpp b/llarp/router/rc_lookup_handler.cpp index 48150090f..ca9169e65 100644 --- a/llarp/router/rc_lookup_handler.cpp +++ b/llarp/router/rc_lookup_handler.cpp @@ -32,26 +32,28 @@ namespace llarp whitelistRouters.erase(router); } + static void + loadColourList(std::unordered_set& beigelist, const std::vector& new_beige) + { + beigelist.clear(); + beigelist.insert(new_beige.begin(), new_beige.end()); + } + void RCLookupHandler::SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& greenlist) { if (whitelist.empty()) return; util::Lock l(_mutex); - whitelistRouters.clear(); - greylistRouters.clear(); - for (auto& router : whitelist) - { - whitelistRouters.emplace(router); - } - for (auto& router : greylist) - { - greylistRouters.emplace(router); - } + loadColourList(whitelistRouters, whitelist); + loadColourList(greylistRouters, greylist); + loadColourList(greenlistRouters, greenlist); - LogInfo("lokinet service node list now has ", whitelistRouters.size(), " routers"); + LogInfo("lokinet service node list now has ", whitelistRouters.size(), " active routers"); } bool @@ -140,6 +142,21 @@ namespace llarp return greylistRouters.count(remote); } + bool + RCLookupHandler::IsGreenlisted(const RouterID& remote) const + { + util::Lock lock{_mutex}; + return greenlistRouters.count(remote); + } + + bool + RCLookupHandler::IsRegistered(const RouterID& remote) const + { + util::Lock lock{_mutex}; + return whitelistRouters.count(remote) || greylistRouters.count(remote) + || greenlistRouters.count(remote); + } + bool RCLookupHandler::PathIsAllowed(const RouterID& remote) const { diff --git a/llarp/router/rc_lookup_handler.hpp b/llarp/router/rc_lookup_handler.hpp index 8beee74b7..184b8d102 100644 --- a/llarp/router/rc_lookup_handler.hpp +++ b/llarp/router/rc_lookup_handler.hpp @@ -42,8 +42,11 @@ namespace llarp void SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) override - EXCLUDES(_mutex); + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& greenlist + + ) override EXCLUDES(_mutex); bool HaveReceivedWhitelist() const override; @@ -61,6 +64,16 @@ namespace llarp bool IsGreylisted(const RouterID& remote) const override EXCLUDES(_mutex); + // "greenlist" = new routers (i.e. "green") that aren't fully funded yet + bool + IsGreenlisted(const RouterID& remote) const override EXCLUDES(_mutex); + + // registered just means that there is at least an operator stake, but doesn't require the node + // be fully funded, active, or not decommed. (In other words: it is any of the white, grey, or + // green list). + bool + IsRegistered(const RouterID& remote) const override EXCLUDES(_mutex); + bool CheckRC(const RouterContact& rc) const override; @@ -134,8 +147,12 @@ namespace llarp bool useWhitelist = false; bool isServiceNode = false; + // whitelist = active routers std::unordered_set whitelistRouters GUARDED_BY(_mutex); + // greylist = fully funded, but decommissioned routers std::unordered_set greylistRouters GUARDED_BY(_mutex); + // greenlist = registered but not fully-staked routers + std::unordered_set greenlistRouters GUARDED_BY(_mutex); using TimePoint = std::chrono::steady_clock::time_point; std::unordered_map _routerLookupTimes; diff --git a/llarp/router/router.cpp b/llarp/router/router.cpp index 975c6074b..b272e36e8 100644 --- a/llarp/router/router.cpp +++ b/llarp/router/router.cpp @@ -471,16 +471,14 @@ namespace llarp return nodedb()->NumLoaded() < KnownPeerWarningThreshold; } - bool - Router::IsActiveServiceNode() const + std::optional + Router::OxendErrorState() const { - return IsServiceNode() and not(LooksDeregistered() or LooksDecommissioned()); - } - - bool - Router::ShouldPingOxen() const - { - return IsActiveServiceNode() and not TooFewPeers(); + // If we're in the white or gray list then we *should* be establishing connections to other + // routers, so if we have almost no peers then something is almost certainly wrong. + if (LooksFunded() and TooFewPeers()) + return "too few peer connections; lokinet is not adequately connected to the network"; + return std::nullopt; } void @@ -508,10 +506,17 @@ namespace llarp } bool - Router::LooksDeregistered() const + Router::LooksFunded() const { return IsServiceNode() and whitelistRouters and _rcLookupHandler.HaveReceivedWhitelist() - and not _rcLookupHandler.SessionIsAllowed(pubkey()); + and _rcLookupHandler.SessionIsAllowed(pubkey()); + } + + bool + Router::LooksRegistered() const + { + return IsServiceNode() and whitelistRouters and _rcLookupHandler.HaveReceivedWhitelist() + and _rcLookupHandler.IsRegistered(pubkey()); } bool @@ -973,7 +978,7 @@ namespace llarp // don't purge bootstrap nodes from nodedb if (IsBootstrapNode(rc.pubkey)) { - log::debug(logcat, "Not removing {}: is bootstrap node", rc.pubkey); + log::trace(logcat, "Not removing {}: is bootstrap node", rc.pubkey); return false; } // if for some reason we stored an RC that isn't a valid router @@ -1061,12 +1066,16 @@ namespace llarp if (now >= m_NextDecommissionWarn) { constexpr auto DecommissionWarnInterval = 5min; - if (auto dereg = LooksDeregistered(); dereg or decom) + if (auto registered = LooksRegistered(), funded = LooksFunded(); + not(registered and funded and not decom)) { - // complain about being deregistered - LogError( - "We are running as a service node but we seem to be ", - dereg ? "deregistered" : "decommissioned"); + // complain about being deregistered/decommed/unfunded + log::error( + logcat, + "We are running as a service node but we seem to be {}", + not registered ? "deregistered" + : decom ? "decommissioned" + : "not fully staked"); m_NextDecommissionWarn = now + DecommissionWarnInterval; } else if (isSvcNode and TooFewPeers()) @@ -1081,7 +1090,7 @@ namespace llarp // if we need more sessions to routers and we are not a service node kicked from the network // we shall connect out to others - if (connected < connectToNum and not LooksDeregistered()) + if (connected < connectToNum and LooksFunded()) { size_t dlt = connectToNum - connected; LogDebug("connecting to ", dlt, " random routers to keep alive"); @@ -1233,9 +1242,11 @@ namespace llarp void Router::SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& unfundedlist) { - _rcLookupHandler.SetRouterWhitelist(whitelist, greylist); + _rcLookupHandler.SetRouterWhitelist(whitelist, greylist, unfundedlist); } bool diff --git a/llarp/router/router.hpp b/llarp/router/router.hpp index 8eb751410..53b24e4aa 100644 --- a/llarp/router/router.hpp +++ b/llarp/router/router.hpp @@ -143,7 +143,9 @@ namespace llarp void SetRouterWhitelist( - const std::vector& whitelist, const std::vector& greylist) override; + const std::vector& whitelist, + const std::vector& greylist, + const std::vector& unfunded) override; std::unordered_set GetRouterWhitelist() const override @@ -203,9 +205,16 @@ namespace llarp bool LooksDecommissioned() const; - /// return true if we look like we are a deregistered service node + /// return true if we look like we are a registered, fully-staked service node (either active or + /// decommissioned). This condition determines when we are allowed to (and attempt to) connect + /// to other peers when running as a service node. bool - LooksDeregistered() const; + LooksFunded() const; + + /// return true if we a registered service node; not that this only requires a partial stake, + /// and does not imply that this service node is *active* or fully funded. + bool + LooksRegistered() const; /// return true if we look like we are allowed and able to test other routers bool @@ -378,12 +387,8 @@ namespace llarp bool IsServiceNode() const override; - /// return true if service node *and* not deregistered or decommissioned - bool - IsActiveServiceNode() const override; - - bool - ShouldPingOxen() const override; + std::optional + OxendErrorState() const override; void Close(); @@ -556,8 +561,11 @@ namespace llarp bool m_isServiceNode = false; + // Delay warning about being decommed/dereged until we've had enough time to sync up with oxend + static constexpr auto DECOMM_WARNING_STARTUP_DELAY = 15s; + llarp_time_t m_LastStatsReport = 0s; - llarp_time_t m_NextDecommissionWarn = 0s; + llarp_time_t m_NextDecommissionWarn = time_now_ms() + DECOMM_WARNING_STARTUP_DELAY; std::shared_ptr m_keyManager; std::shared_ptr m_peerDb; diff --git a/llarp/rpc/lokid_rpc_client.cpp b/llarp/rpc/lokid_rpc_client.cpp index 7048188fd..a40c33383 100644 --- a/llarp/rpc/lokid_rpc_client.cpp +++ b/llarp/rpc/lokid_rpc_client.cpp @@ -112,35 +112,57 @@ namespace llarp void LokidRpcClient::UpdateServiceNodeList() { - nlohmann::json request, fields; - fields["pubkey_ed25519"] = true; - fields["service_node_pubkey"] = true; - fields["funded"] = true; - fields["active"] = true; - request["fields"] = fields; - m_UpdatingList = true; + if (m_UpdatingList.exchange(true)) + return; // update already in progress + + nlohmann::json request{ + {"fields", + { + {"pubkey_ed25519", true}, + {"service_node_pubkey", true}, + {"funded", true}, + {"active", true}, + {"block_hash", true}, + }}, + }; + if (!m_LastUpdateHash.empty()) + request["fields"]["poll_block_hash"] = m_LastUpdateHash; + Request( "rpc.get_service_nodes", [self = shared_from_this()](bool success, std::vector data) { - self->m_UpdatingList = false; if (not success) - { LogWarn("failed to update service node list"); - return; - } - if (data.size() < 2) - { - LogWarn("lokid gave empty reply for service node list"); - return; - } - try + else if (data.size() < 2) + LogWarn("oxend gave empty reply for service node list"); + else { - self->HandleGotServiceNodeList(std::move(data[1])); - } - catch (std::exception& ex) - { - LogError("failed to process service node list: ", ex.what()); + try + { + auto json = nlohmann::json::parse(std::move(data[1])); + if (json.at("status") != "OK") + throw std::runtime_error{"get_service_nodes did not return 'OK' status"}; + if (auto it = json.find("unchanged"); + it != json.end() and it->is_boolean() and it->get()) + LogDebug("service node list unchanged"); + else + { + self->HandleNewServiceNodeList(json.at("service_node_states")); + if (auto it = json.find("block_hash"); it != json.end() and it->is_string()) + self->m_LastUpdateHash = it->get(); + else + self->m_LastUpdateHash.clear(); + } + } + catch (const std::exception& ex) + { + LogError("failed to process service node list: ", ex.what()); + } } + + // set down here so that the 1) we don't start updating until we're completely finished + // with the previous update; and 2) so that m_UpdatingList also guards m_LastUpdateHash + self->m_UpdatingList = false; }, request.dump()); } @@ -152,25 +174,27 @@ namespace llarp auto makePingRequest = [self = shared_from_this()]() { // send a ping PubKey pk{}; - bool should_ping = false; - if (auto r = self->m_Router.lock()) - { - pk = r->pubkey(); - should_ping = r->ShouldPingOxen(); - } - if (should_ping) - { - nlohmann::json payload = { - {"pubkey_ed25519", oxenc::to_hex(pk.begin(), pk.end())}, - {"version", {VERSION[0], VERSION[1], VERSION[2]}}}; - self->Request( - "admin.lokinet_ping", - [](bool success, std::vector data) { - (void)data; - LogDebug("Received response for ping. Successful: ", success); - }, - payload.dump()); - } + auto r = self->m_Router.lock(); + if (not r) + return; // router has gone away, maybe shutting down? + + pk = r->pubkey(); + + nlohmann::json payload = { + {"pubkey_ed25519", oxenc::to_hex(pk.begin(), pk.end())}, + {"version", {VERSION[0], VERSION[1], VERSION[2]}}}; + + if (auto err = r->OxendErrorState()) + payload["error"] = *err; + + self->Request( + "admin.lokinet_ping", + [](bool success, std::vector data) { + (void)data; + LogDebug("Received response for ping. Successful: ", success); + }, + payload.dump()); + // subscribe to block updates self->Request("sub.block", [](bool success, std::vector data) { if (data.empty() or not success) @@ -180,52 +204,53 @@ namespace llarp } LogDebug("subscribed to new blocks: ", data[0]); }); + // Trigger an update on a regular timer as well in case we missed a block notify for some + // reason (e.g. oxend restarts and loses the subscription); we poll using the last known + // hash so that the poll is very cheap (basically empty) if the block hasn't advanced. + self->UpdateServiceNodeList(); }; + // Fire one ping off right away to get things going. + makePingRequest(); m_lokiMQ->add_timer(makePingRequest, PingInterval); - // initial fetch of service node list - UpdateServiceNodeList(); } void - LokidRpcClient::HandleGotServiceNodeList(std::string data) + LokidRpcClient::HandleNewServiceNodeList(const nlohmann::json& j) { - auto j = nlohmann::json::parse(std::move(data)); - if (const auto itr = j.find("unchanged"); itr != j.end() and itr->get()) - { - LogDebug("service node list unchanged"); - return; - } std::unordered_map keymap; - std::vector activeNodeList, nonActiveNodeList; - if (const auto itr = j.find("service_node_states"); itr != j.end() and itr->is_array()) + std::vector activeNodeList, decommNodeList, unfundedNodeList; + if (not j.is_array()) + throw std::runtime_error{ + "Invalid service node list: expected array of service node states"}; + + for (auto& snode : j) { - for (auto& snode : *itr) - { - // Skip unstaked snodes: - if (const auto funded_itr = snode.find("funded"); funded_itr == snode.end() - or not funded_itr->is_boolean() or not funded_itr->get()) - continue; - - const auto ed_itr = snode.find("pubkey_ed25519"); - if (ed_itr == snode.end() or not ed_itr->is_string()) - continue; - const auto svc_itr = snode.find("service_node_pubkey"); - if (svc_itr == snode.end() or not svc_itr->is_string()) - continue; - const auto active_itr = snode.find("active"); - if (active_itr == snode.end() or not active_itr->is_boolean()) - continue; - const bool active = active_itr->get(); - - RouterID rid; - PubKey pk; - if (not rid.FromHex(ed_itr->get()) - or not pk.FromHex(svc_itr->get())) - continue; - - keymap[rid] = pk; - (active ? activeNodeList : nonActiveNodeList).push_back(std::move(rid)); - } + const auto ed_itr = snode.find("pubkey_ed25519"); + if (ed_itr == snode.end() or not ed_itr->is_string()) + continue; + const auto svc_itr = snode.find("service_node_pubkey"); + if (svc_itr == snode.end() or not svc_itr->is_string()) + continue; + const auto active_itr = snode.find("active"); + if (active_itr == snode.end() or not active_itr->is_boolean()) + continue; + const bool active = active_itr->get(); + const auto funded_itr = snode.find("funded"); + if (funded_itr == snode.end() or not funded_itr->is_boolean()) + continue; + const bool funded = funded_itr->get(); + + RouterID rid; + PubKey pk; + if (not rid.FromHex(ed_itr->get()) + or not pk.FromHex(svc_itr->get())) + continue; + + keymap[rid] = pk; + (active ? activeNodeList + : funded ? decommNodeList + : unfundedNodeList) + .push_back(std::move(rid)); } if (activeNodeList.empty()) @@ -233,17 +258,19 @@ namespace llarp LogWarn("got empty service node list, ignoring."); return; } + // inform router about the new list if (auto router = m_Router.lock()) { auto& loop = router->loop(); loop->call([this, active = std::move(activeNodeList), - inactive = std::move(nonActiveNodeList), + decomm = std::move(decommNodeList), + unfunded = std::move(unfundedNodeList), keymap = std::move(keymap), router = std::move(router)]() mutable { m_KeyMap = std::move(keymap); - router->SetRouterWhitelist(active, inactive); + router->SetRouterWhitelist(active, decomm, unfunded); }); } else diff --git a/llarp/rpc/lokid_rpc_client.hpp b/llarp/rpc/lokid_rpc_client.hpp index b2614bd10..bfc267480 100644 --- a/llarp/rpc/lokid_rpc_client.hpp +++ b/llarp/rpc/lokid_rpc_client.hpp @@ -55,6 +55,8 @@ namespace llarp void Command(std::string_view cmd); + /// triggers a service node list refresh from oxend; thread-safe and will do nothing if an + /// update is already in progress. void UpdateServiceNodeList(); @@ -72,8 +74,10 @@ namespace llarp m_lokiMQ->request(*m_Connection, std::move(cmd), std::move(func)); } + // Handles a service node list update; takes the "service_node_states" object of an oxend + // "get_service_nodes" rpc request. void - HandleGotServiceNodeList(std::string json); + HandleNewServiceNodeList(const nlohmann::json& json); // Handles request from lokid for peer stats on a specific peer void @@ -88,6 +92,7 @@ namespace llarp std::weak_ptr m_Router; std::atomic m_UpdatingList; + std::string m_LastUpdateHash; std::unordered_map m_KeyMap;