Hotfix: Increase daemon_startup_timeout (#292)

For some reason, 15 seconds is currently not enough to connect to the bootstrap peers in the public swarm, as reported by multiple users and observed by me. Increasing the timeout to 120 seconds until we find the root cause of the issue.
Alexander Borzunov 1 year ago committed by GitHub
parent a7d3d02194
commit e0cef73757

@@ -47,6 +47,9 @@ def main():
     parser.add_argument('--announce_maddrs', nargs='+', required=False,
                         help='Visible multiaddrs the host announces for external connections from other peers')
+    parser.add_argument('--daemon_startup_timeout', type=float, default=120,
+                        help='Timeout for the libp2p daemon connecting to initial peers')
+
     parser.add_argument('--compression', type=str, default='NONE', required=False, help='Tensor compression communication')
     parser.add_argument('--num_handlers', type=int, default=8, required=False,
@@ -167,6 +170,8 @@ def main():
         assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)"
         announce_maddrs = [f"/ip4/{public_ip}/tcp/{port}"]
 
+    args["startup_timeout"] = args.pop("daemon_startup_timeout")
+
     if args.pop("increase_file_limit"):
         increase_file_limit()
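
For reference, the renaming above exists because the CLI exposes the flag as --daemon_startup_timeout while the underlying daemon/DHT setup expects a keyword named startup_timeout. A minimal, self-contained sketch of that pattern follows; the start_daemon() function is a hypothetical stand-in for the real server/DHT constructor and is not part of the repository:

    import argparse


    def start_daemon(startup_timeout: float):
        # Hypothetical stand-in for the real daemon/DHT setup that consumes `startup_timeout`.
        print(f"The libp2p daemon may take up to {startup_timeout:.0f} s to reach the initial peers")


    def main():
        parser = argparse.ArgumentParser()
        # Same flag and default as in the diff above.
        parser.add_argument('--daemon_startup_timeout', type=float, default=120,
                            help='Timeout for the libp2p daemon connecting to initial peers')
        args = vars(parser.parse_args())

        # Rename the user-facing flag to the keyword the daemon-facing API expects.
        args["startup_timeout"] = args.pop("daemon_startup_timeout")
        start_daemon(**args)


    if __name__ == "__main__":
        main()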

@@ -32,7 +32,7 @@ class DistributedBloomConfig(BloomConfig):
     initial_peers: List[str] = PUBLIC_INITIAL_PEERS  # a list of initial peers for hivemind DHT
     dht_prefix: str  # a prefix for all dht keys that correspond to this model (usually equal to model name)
-    daemon_startup_timeout: int = 30
+    daemon_startup_timeout: int = 120  # timeout for the libp2p daemon connecting to initial peers
     dht: Optional[hivemind.DHT] = None  # a running DHT instance, e.g. when using the same DHT for multiple models
     request_timeout: int = 3 * 60  # a number of seconds for waiting result from each node
     max_retries: Optional[int] = None  # max number retries before the client raises an exception (default: inf)
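
On the client side, the new 120-second default can still be overridden per run through the config field changed above. A hedged sketch, assuming the class lives in src/client/remote_model.py and using a placeholder checkpoint name; only the daemon_startup_timeout field itself comes from this diff:

    from src.client.remote_model import DistributedBloomConfig  # import path is an assumption

    # from_pretrained() is inherited from the transformers config machinery via BloomConfig;
    # "bigscience/bloom-petals" is used here only as an illustrative checkpoint name.
    config = DistributedBloomConfig.from_pretrained("bigscience/bloom-petals")
    config.daemon_startup_timeout = 180  # give the libp2p daemon even more time on a slow network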
