From e0cef7375785c7433e2803509162fc7dfb2791d6 Mon Sep 17 00:00:00 2001 From: Alexander Borzunov Date: Wed, 15 Mar 2023 17:21:30 +0400 Subject: [PATCH] Hotfix: Increase daemon_startup_timeout (#292) For some reason, right now 15 sec is not enough to connect to the bootstrap peers in the public swarm, as reported by multiple users and observed by me. Increasing it to 120 sec until we find the root cause of the issue. --- src/petals/cli/run_server.py | 5 +++++ src/petals/client/remote_model.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index 5fb700d..57761fd 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -47,6 +47,9 @@ def main(): parser.add_argument('--announce_maddrs', nargs='+', required=False, help='Visible multiaddrs the host announces for external connections from other peers') + parser.add_argument('--daemon_startup_timeout', type=float, default=120, + help='Timeout for the libp2p daemon connecting to initial peers') + parser.add_argument('--compression', type=str, default='NONE', required=False, help='Tensor compression communication') parser.add_argument('--num_handlers', type=int, default=8, required=False, @@ -167,6 +170,8 @@ def main(): assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)" announce_maddrs = [f"/ip4/{public_ip}/tcp/{port}"] + args["startup_timeout"] = args.pop("daemon_startup_timeout") + if args.pop("increase_file_limit"): increase_file_limit() diff --git a/src/petals/client/remote_model.py b/src/petals/client/remote_model.py index 2ec127f..3b16abe 100644 --- a/src/petals/client/remote_model.py +++ b/src/petals/client/remote_model.py @@ -32,7 +32,7 @@ class DistributedBloomConfig(BloomConfig): initial_peers: List[str] = PUBLIC_INITIAL_PEERS # a list of initial peers for hivemind DHT dht_prefix: str # a prefix for all dht keys that correspond to this model (usually equal to 
model name) - daemon_startup_timeout: int = 30 + daemon_startup_timeout: int = 120 # timeout for the libp2p daemon connecting to initial peers dht: Optional[hivemind.DHT] = None # a running DHT instance, e.g. when using the same DHT for multiple models request_timeout: int = 3 * 60 # a number of seconds for waiting result from each node max_retries: Optional[int] = None # max number retries before the client raises an exception (default: inf)