Hotfix: Increase daemon_startup_timeout (#292)

For some reason, 15 seconds is currently not enough to connect to the bootstrap peers in the public swarm, as reported by multiple users and observed by me. Increasing the timeout to 120 seconds until we find the root cause of the issue.
Alexander Borzunov 1 year ago committed by GitHub
parent a7d3d02194
commit e0cef73757

@@ -47,6 +47,9 @@ def main():
     parser.add_argument('--announce_maddrs', nargs='+', required=False,
                         help='Visible multiaddrs the host announces for external connections from other peers')
+    parser.add_argument('--daemon_startup_timeout', type=float, default=120,
+                        help='Timeout for the libp2p daemon connecting to initial peers')
+
     parser.add_argument('--compression', type=str, default='NONE', required=False, help='Tensor compression communication')
     parser.add_argument('--num_handlers', type=int, default=8, required=False,
@@ -167,6 +170,8 @@ def main():
         assert port != 0, "Please specify a fixed non-zero --port when you use --public_ip (e.g., --port 31337)"
         announce_maddrs = [f"/ip4/{public_ip}/tcp/{port}"]
 
+    args["startup_timeout"] = args.pop("daemon_startup_timeout")
+
     if args.pop("increase_file_limit"):
         increase_file_limit()
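
For reference, the renaming above exists because the CLI exposes the flag as --daemon_startup_timeout while the underlying daemon/DHT setup expects a keyword named startup_timeout. A minimal, self-contained sketch of that pattern follows; the start_daemon() function is a hypothetical stand-in for the real server/DHT constructor and is not part of the repository:

    import argparse


    def start_daemon(startup_timeout: float):
        # Hypothetical stand-in for the real daemon/DHT setup that consumes `startup_timeout`.
        print(f"The libp2p daemon may take up to {startup_timeout:.0f} s to reach the initial peers")


    def main():
        parser = argparse.ArgumentParser()
        # Same flag and default as in the diff above.
        parser.add_argument('--daemon_startup_timeout', type=float, default=120,
                            help='Timeout for the libp2p daemon connecting to initial peers')
        args = vars(parser.parse_args())

        # Rename the user-facing flag to the keyword the daemon-facing API expects.
        args["startup_timeout"] = args.pop("daemon_startup_timeout")
        start_daemon(**args)


    if __name__ == "__main__":
        main()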

@@ -32,7 +32,7 @@ class DistributedBloomConfig(BloomConfig):
     initial_peers: List[str] = PUBLIC_INITIAL_PEERS  # a list of initial peers for hivemind DHT
     dht_prefix: str  # a prefix for all dht keys that correspond to this model (usually equal to model name)
-    daemon_startup_timeout: int = 30
+    daemon_startup_timeout: int = 120  # timeout for the libp2p daemon connecting to initial peers
     dht: Optional[hivemind.DHT] = None  # a running DHT instance, e.g. when using the same DHT for multiple models
     request_timeout: int = 3 * 60  # a number of seconds for waiting result from each node
     max_retries: Optional[int] = None  # max number retries before the client raises an exception (default: inf)
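
On the client side, the new 120-second default can still be overridden per run through the config field changed above. A hedged sketch, assuming the class lives in src/client/remote_model.py and using a placeholder checkpoint name; only the daemon_startup_timeout field itself comes from this diff:

    from src.client.remote_model import DistributedBloomConfig  # import path is an assumption

    # from_pretrained() is inherited from the transformers config machinery via BloomConfig;
    # "bigscience/bloom-petals" is used here only as an illustrative checkpoint name.
    config = DistributedBloomConfig.from_pretrained("bigscience/bloom-petals")
    config.daemon_startup_timeout = 180  # give the libp2p daemon even more time on a slow network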
