Support libp2p relays for NAT traversal (#186)

- Added relay options to servers
- Enabled relay options by default
- Changed hivemind version to 1.1.5
- Moved reachability check to be performed after blocks are loaded

Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com>
1 year ago · 93bed7da5a
parent 16b69d6050
commit 93bed7da5a
5 changed files with 66 additions and 30 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -37,7 +37,7 @@ install_requires =
    huggingface-hub==0.11.1
    transformers==4.25.1
    speedtest-cli==2.1.3
-    hivemind==1.1.3
+    hivemind==1.1.5
    tensor_parallel==1.0.23
    humanfriendly
    async-timeout>=4.0.2
--- a/src/petals/cli/run_server.py
+++ b/src/petals/cli/run_server.py
@ -38,6 +38,9 @@ def main():
                             'This is a simplified way to set the --announce_maddrs option (see below).'
                             'Default: server announces IPv4/IPv6 addresses of your network interfaces')
    parser.add_argument("--no_auto_relay", action="store_false", dest="use_auto_relay",
                        help="Do not look for libp2p relays to reach peers behind NATs/firewalls")
    parser.add_argument('--host_maddrs', nargs='+', required=False,
                        help='Multiaddrs to listen for external connections from other peers')
    parser.add_argument('--announce_maddrs', nargs='+', required=False,
--- a/src/petals/client/remote_model.py
+++ b/src/petals/client/remote_model.py
@ -107,6 +107,8 @@ class DistributedBloomModel(_LowCPUMemoryMixin, BloomModel):
                num_workers=n_layer,
                startup_timeout=config.daemon_startup_timeout,
                start=True,
                use_relay=True,
                use_auto_relay=True,
            )
        )
        assert isinstance(dht, hivemind.DHT) and dht.is_alive(), "dht must be a running hivemind.DHT instance"
--- a/src/petals/server/reachability.py
+++ b/src/petals/server/reachability.py
@ -0,0 +1,39 @@
 import math
 import time
 import requests
 from hivemind.utils.logging import get_logger
 logger = get_logger(__file__)
 def check_reachability(peer_id, wait_time: float = 7 * 60, retry_delay: float = 15) -> None:
    """Poll health.petals.ml until it reports that ``peer_id`` is reachable from the Internet.

    The health service is queried up to ``floor(wait_time / retry_delay) + 1`` times,
    sleeping ``retry_delay`` seconds between consecutive attempts (but not after the
    last one, so the total sleep time never exceeds ``wait_time``).

    The check is best-effort: if the request fails or the reply cannot be parsed,
    a warning is logged and the function returns without raising.

    :param peer_id: peer ID that is substituted into the health service URL
    :param wait_time: total time budget (in seconds) for the server to become reachable
    :param retry_delay: pause (in seconds) between consecutive attempts; a non-positive
        value results in a single attempt
    :raises RuntimeError: if the health service answered every attempt and never
        reported success
    """
    # Always make at least one attempt so that `response` is bound when we build the error below,
    # and avoid dividing by zero (or sleeping for a negative time) on a non-positive retry_delay.
    if retry_delay > 0:
        num_attempts = max(math.floor(wait_time / retry_delay), 0) + 1
    else:
        num_attempts = 1

    for attempt_no in range(num_attempts):
        try:
            r = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{peer_id}", timeout=10)
            r.raise_for_status()
            response = r.json()
            is_reachable = response["success"]
        except Exception as e:
            logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(e)}")
            return

        if is_reachable:
            logger.info("Server is reachable from the Internet. It will appear at http://health.petals.ml soon")
            return

        if attempt_no == 0:
            # Usually, libp2p manages to set up relays before we finish loading blocks.
            # In other cases, we may need to wait for up to `wait_time` seconds before it's done.
            logger.info("Detected a NAT or a firewall, connecting to libp2p relays. This takes a few minutes")

        # Sleeping after the final attempt would only postpone the error message
        if attempt_no + 1 < num_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        f"Server has not become reachable from the Internet:\n\n"
        f"{response['message']}\n\n"
        f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n"
        f"    1. Choose a specific port for the Petals server, for example, 31337.\n"
        f"    2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n"
        f"    3. Add these arguments to explicitly announce your IP address and port to other peers:\n"
        f"        python -m petals.cli.run_server ... --public_ip {response['your_ip']} --port 31337\n"
        f"    4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n"
    )
--- a/src/petals/server/server.py
+++ b/src/petals/server/server.py
@ -10,7 +10,6 @@ from typing import Dict, List, Optional, Sequence, Union
 import numpy as np
 import psutil
 import requests
 import torch
 from hivemind import DHT, MAX_DHT_TIME_DISCREPANCY_SECONDS, BatchTensorDescriptor, get_dht_time
 from hivemind.moe.server.layers import add_custom_models_from_file
@ -28,6 +27,7 @@ from petals.server.backend import TransformerBackend
 from petals.server.block_utils import get_block_size
 from petals.server.handler import TransformerConnectionHandler
 from petals.server.memory_cache import MemoryCache
 from petals.server.reachability import check_reachability
 from petals.server.throughput import get_host_throughput
 from petals.utils.convert_block import check_device_balance, convert_block
 from petals.utils.disk_cache import DEFAULT_CACHE_DIR
@ -78,6 +78,8 @@ class Server:
        load_in_8bit: Optional[bool] = None,
        tensor_parallel_devices: Optional[Sequence[torch.device]] = None,
        skip_reachability_check: bool = False,
        use_relay: bool = True,
        use_auto_relay: bool = True,
        **kwargs,
    ):
        """Create a server with one or more bloom blocks. See run_server.py for documentation."""
@ -117,14 +119,20 @@ class Server:
        )
        self.module_uids = [f"{self.prefix}.{block_index}" for block_index in range(self.block_config.n_layer)]
-        self.dht = DHT(initial_peers=initial_peers, start=True, num_workers=self.block_config.n_layer, **kwargs)
+        self.dht = DHT(
            initial_peers=initial_peers,
            start=True,
            num_workers=self.block_config.n_layer,
            use_relay=use_relay,
            use_auto_relay=use_auto_relay,
            **kwargs,
        )
        visible_maddrs_str = [str(a) for a in self.dht.get_visible_maddrs()]
        if initial_peers == PUBLIC_INITIAL_PEERS:
            logger.info(f"Connecting to the public swarm, peer_id = {self.dht.peer_id}")
            if not skip_reachability_check:
                self._check_reachability()
        else:
            logger.info(f"Running DHT node on {visible_maddrs_str}, initial peers = {initial_peers}")
        self.need_reachability_check = not skip_reachability_check and initial_peers == PUBLIC_INITIAL_PEERS
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
@ -196,35 +204,14 @@ class Server:
        self.stop = threading.Event()
    def _check_reachability(self):
        """Ask health.petals.ml once whether this server's DHT peer is reachable from the Internet.

        The check is best-effort: if the request fails or the reply is not valid JSON,
        a warning is logged and the method returns without raising.

        :raises RuntimeError: if the health service replied and reported that the peer is not reachable
        """
        try:
            reply = requests.get(f"http://health.petals.ml/api/v1/is_reachable/{self.dht.peer_id}", timeout=10)
            reply.raise_for_status()
            verdict = reply.json()
        except Exception as exc:
            logger.warning(f"Skipping reachability check because health.petals.ml is down: {repr(exc)}")
            return

        if verdict["success"]:
            logger.info("Server is reachable from the Internet, it will appear at http://health.petals.ml soon")
            return

        # We get here only if health.petals.ml is up and explicitly told us that we are unreachable
        how_to_fix = (
            f"Server is not reachable from the Internet:\n\n"
            f"{verdict['message']}\n\n"
            f"You need to fix your port forwarding and/or firewall settings. How to do that:\n\n"
            f"    1. Choose a specific port for the Petals server, for example, 31337.\n"
            f"    2. Ensure that this port is accessible from the Internet and not blocked by your firewall.\n"
            f"    3. Add these arguments to explicitly announce your IP address and port to other peers:\n"
            f"        python -m petals.cli.run_server ... --public_ip {verdict['your_ip']} --port 31337\n"
            f"    4. If it does not help, ask for help in our Discord: https://discord.gg/Wuk8BnrEPH\n"
        )
        raise RuntimeError(how_to_fix)
    def _choose_num_blocks(self) -> int:
        assert (
            self.converted_model_name_or_path == "bigscience/bloom-petals"
        ), "If you use a model other than bigscience/bloom-petals, please specify --num_blocks manually"
-        assert self.device.type == "cuda", "If you run a non-GPU server, please specify --num_blocks manually"
+        assert self.device.type == "cuda", (
            "GPU is not available. If you want to run a CPU-only server, please specify --num_blocks. "
            "CPU-only servers in the public swarm are discouraged since they are much slower"
        )
        num_devices = len(self.tensor_parallel_devices) if self.tensor_parallel_devices else 1
        if num_devices > 1:
@ -287,6 +274,7 @@ class Server:
                use_auth_token=self.use_auth_token,
                load_in_8bit=self.load_in_8bit,
                tensor_parallel_devices=self.tensor_parallel_devices,
                need_reachability_check=self.need_reachability_check,
                start=True,
            )
            try:
@ -380,6 +368,7 @@ class ModuleContainer(threading.Thread):
        use_auth_token: Optional[str],
        load_in_8bit: bool,
        tensor_parallel_devices: Sequence[torch.device],
        need_reachability_check: bool,
        **kwargs,
    ) -> ModuleContainer:
        module_uids = [f"{prefix}.{block_index}" for block_index in block_indices]
@ -433,6 +422,9 @@ class ModuleContainer(threading.Thread):
                    min_batch_size=min_batch_size,
                    max_batch_size=max_batch_size,
                )
            if need_reachability_check:
                check_reachability(dht.peer_id)
        except:
            logger.debug("Shutting down backends")
            for backend in blocks.values():