diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index 5fb700d..57761fd 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -47,6 +47,9 @@ def main(): parser.add_argument('--announce_maddrs', nargs='+', required=False, help='Visible multiaddrs the host announces for external connections from other peers') + parser.add_argument('--daemon_startup_timeout', type=float, default=120, + help='Timeout for the libp2p daemon connecting to initial peers') + parser.add_argument('--compression', type=str, default='NONE', required=False, help='Tensor compression communication') parser.add_argument('--num_handlers', type=int, default=8, required=False, @@ -167,6 +170,8 @@ def main(): assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)" announce_maddrs = [f"/ip4/{public_ip}/tcp/{port}"] + args["startup_timeout"] = args.pop("daemon_startup_timeout") + if args.pop("increase_file_limit"): increase_file_limit() diff --git a/src/petals/client/remote_model.py b/src/petals/client/remote_model.py index 2ec127f..3b16abe 100644 --- a/src/petals/client/remote_model.py +++ b/src/petals/client/remote_model.py @@ -32,7 +32,7 @@ class DistributedBloomConfig(BloomConfig): initial_peers: List[str] = PUBLIC_INITIAL_PEERS # a list of initial peers for hivemind DHT dht_prefix: str # a prefix for all dht keys that correspond to this model (usually equal to model name) - daemon_startup_timeout: int = 30 + daemon_startup_timeout: int = 120 # timeout for the libp2p daemon connecting to initial peers dht: Optional[hivemind.DHT] = None # a running DHT instance, e.g. when using the same DHT for multiple models request_timeout: int = 3 * 60 # a number of seconds for waiting result from each node max_retries: Optional[int] = None # max number retries before the client raises an exception (default: inf)