@@ -16,8 +16,9 @@ from hivemind.proto.runtime_pb2 import CompressionType
 from hivemind.utils.logging import get_logger
 from transformers import PretrainedConfig
 
+import petals
 from petals.constants import DTYPE_MAP, PUBLIC_INITIAL_PEERS
-from petals.data_structures import CHAIN_DELIMITER, UID_DELIMITER, ServerState
+from petals.data_structures import CHAIN_DELIMITER, UID_DELIMITER, ServerInfo, ServerState
 from petals.dht_utils import declare_active_modules, get_remote_module_infos
 from petals.server import block_selection
 from petals.server.backend import TransformerBackend, merge_inference_pools_inplace
@@ -29,7 +30,6 @@ from petals.server.reachability import ReachabilityProtocol, check_direct_reacha
 from petals.server.throughput import get_dtype_name, get_server_throughput
 from petals.utils.auto_config import AutoDistributedConfig
 from petals.utils.convert_block import QuantType, check_device_balance, convert_block
 from petals.utils.disk_cache import DEFAULT_CACHE_DIR
 from petals.utils.version import get_compatible_model_repo
 
 logger = get_logger(__name__)
@@ -81,7 +81,7 @@ class Server:
         dht_client_mode: Optional[bool] = None,
         use_relay: bool = True,
         use_auto_relay: bool = True,
-        adapters: Optional[List[str]] = None,
+        adapters: Sequence[str] = (),
         **kwargs,
     ):
         """Create a server with one or more bloom blocks. See run_server.py for documentation."""
@@ -215,7 +215,15 @@ class Server:
             force_eval=(throughput == "eval"),
             cache_dir=cache_dir,
         )
-        self.throughput = throughput
+        self.server_info = ServerInfo(
+            state=ServerState.JOINING,
+            throughput=throughput,
+            adapters=tuple(adapters),
+            version=petals.__version__,
+            torch_dtype=str(torch_dtype).lstrip("torch."),
+            quant_type=quant_type.name.lower(),
+            using_relay=self.dht.client_mode,
+        )
 
         self.balance_quality = balance_quality
         self.mean_balance_check_period = mean_balance_check_period
@@ -283,7 +291,7 @@ class Server:
             block_config=self.block_config,
             attn_cache_bytes=self.attn_cache_bytes,
             alloc_timeout=self.alloc_timeout,
-            throughput=self.throughput,
+            server_info=self.server_info,
             block_indices=block_indices,
             num_handlers=self.num_handlers,
             min_batch_size=self.min_batch_size,
@@ -307,7 +315,6 @@ class Server:
             quant_type=self.quant_type,
             tensor_parallel_devices=self.tensor_parallel_devices,
             should_validate_reachability=self.should_validate_reachability,
-            adapters=self.adapters,
             start=True,
         )
         try:
@@ -385,7 +392,7 @@ class ModuleContainer(threading.Thread):
         block_config: PretrainedConfig,
         attn_cache_bytes: int,
         alloc_timeout: float,
-        throughput: float,
+        server_info: ServerInfo,
         block_indices: List[int],
         min_batch_size: int,
         max_batch_size: int,
@@ -401,16 +408,18 @@ class ModuleContainer(threading.Thread):
         quant_type: QuantType,
         tensor_parallel_devices: Sequence[torch.device],
         should_validate_reachability: bool,
-        adapters: Optional[List[str]] = None,
         **kwargs,
     ) -> ModuleContainer:
         module_uids = [f"{dht_prefix}{UID_DELIMITER}{block_index}" for block_index in block_indices]
+        memory_cache = MemoryCache(attn_cache_bytes, alloc_timeout)
+
+        server_info.state = ServerState.JOINING
         joining_announcer = ModuleAnnouncerThread(
             module_uids,
             dht,
-            ServerState.JOINING,
-            adapters=adapters,
-            throughput=throughput,
+            server_info,
+            block_config=block_config,
+            memory_cache=memory_cache,
             update_period=update_period,
             expiration=expiration,
             daemon=True,
@@ -420,7 +429,6 @@ class ModuleContainer(threading.Thread):
         assert len(tensor_parallel_devices) >= 1 and all(isinstance(d, torch.device) for d in tensor_parallel_devices)
 
-        memory_cache = MemoryCache(attn_cache_bytes, alloc_timeout)
         blocks = {}
         try:
             for module_uid, block_index in zip(module_uids, block_indices):
@@ -441,7 +449,7 @@ class ModuleContainer(threading.Thread):
                     tensor_parallel_devices,
                     device,
                     quant_type,
-                    adapters=adapters,
+                    adapters=server_info.adapters,
                     freeze=True,
                     use_auth_token=use_auth_token,
                     cache_dir=cache_dir,
@@ -477,13 +485,12 @@ class ModuleContainer(threading.Thread):
 
             joining_announcer.stop.set()
             joining_announcer.join()
+            server_info.state = ServerState.OFFLINE
             declare_active_modules(
                 dht,
                 module_uids,
+                server_info,
                 expiration_time=get_dht_time() + expiration,
-                state=ServerState.OFFLINE,
-                throughput=throughput,
-                adapters=adapters,
             )
             logger.info(f"Announced that blocks {module_uids} are offline")
             raise
@@ -497,8 +504,9 @@ class ModuleContainer(threading.Thread):
             dht,
             dht_prefix,
             blocks,
-            adapters=adapters,
-            throughput=throughput,
+            block_config=block_config,
+            memory_cache=memory_cache,
+            server_info=server_info,
             update_period=update_period,
             expiration=expiration,
             **kwargs,
@@ -510,10 +518,11 @@ class ModuleContainer(threading.Thread):
         dht_prefix: str,
         module_backends: Dict[str, TransformerBackend],
         *,
+        block_config: PretrainedConfig,
+        memory_cache: MemoryCache,
         inference_max_length: int,
         num_handlers: int,
-        throughput: float,
-        adapters: Optional[Sequence[str]],
+        server_info: ServerInfo,
         update_period: float,
         expiration: Optional[float] = None,
         request_timeout: float,
@@ -525,7 +534,7 @@ class ModuleContainer(threading.Thread):
         super().__init__()
 
         self.dht, self.module_backends = dht, module_backends
-        self.throughput, self.update_period, self.expiration = throughput, update_period, expiration
+        self.server_info, self.update_period, self.expiration = server_info, update_period, expiration
 
         self.push_manager = mp.Manager()
         self.push_manager.__enter__()
@@ -534,7 +543,7 @@ class ModuleContainer(threading.Thread):
             TransformerConnectionHandler(
                 dht,
                 self.module_backends,
-                adapters=adapters,
+                adapters=server_info.adapters,
                 dht_prefix=dht_prefix,
                 push_manager=self.push_manager,
                 session_queues=session_queues,
@@ -548,12 +557,14 @@ class ModuleContainer(threading.Thread):
 
         self.runtime = RuntimeWithDeduplicatedPools(self.module_backends, device=None, **kwargs)
         # note: We set device=None in runtime to avoid moving all modules to device 0 in runtime.run(). tensor_parallel has already moved it as needed.
+
+        self.server_info.state = ServerState.ONLINE
         self.online_announcer = ModuleAnnouncerThread(
             list(self.module_backends.keys()),
             dht,
-            ServerState.ONLINE,
-            adapters=adapters,
-            throughput=throughput,
+            self.server_info,
+            block_config=block_config,
+            memory_cache=memory_cache,
             update_period=update_period,
             expiration=expiration,
             daemon=True,
@@ -613,12 +624,12 @@ class ModuleContainer(threading.Thread):
         self.online_announcer.stop.set()
         self.online_announcer.join()
 
+        self.server_info.state = ServerState.OFFLINE
         declare_active_modules(
             self.dht,
             self.module_backends.keys(),
+            self.server_info,
             expiration_time=get_dht_time() + self.expiration,
-            state=ServerState.OFFLINE,
-            throughput=self.throughput,
         )
         logger.info(f"Announced that blocks {list(self.module_backends.keys())} are offline")
 
@@ -651,10 +662,10 @@ class ModuleAnnouncerThread(threading.Thread):
         self,
         module_uids: List[str],
         dht: DHT,
-        state: ServerState,
-        adapters: Optional[Sequence[str]],
+        server_info: ServerInfo,
         *,
-        throughput: float,
+        block_config: PretrainedConfig,
+        memory_cache: MemoryCache,
         update_period: float = 30,
         expiration: float,
         **kwargs,
@@ -662,22 +673,21 @@ class ModuleAnnouncerThread(threading.Thread):
         super().__init__(**kwargs)
         self.module_uids = module_uids
         self.dht = dht
-        self.state = state
-        self.adapters = adapters
-        self.throughput = throughput
+        self.server_info = server_info
+        self.memory_cache = memory_cache
+        self.bytes_per_token = block_config.hidden_size * torch.finfo(DTYPE_MAP[server_info.torch_dtype]).bits // 8
         self.update_period = update_period
         self.expiration = expiration
         self.stop = threading.Event()
 
     def run(self) -> None:
         while True:
+            self.server_info.cache_tokens_left = self.memory_cache.bytes_left // self.bytes_per_token
             declare_active_modules(
                 self.dht,
                 self.module_uids,
+                self.server_info,
                 expiration_time=get_dht_time() + self.expiration,
-                state=self.state,
-                throughput=self.throughput,
-                adapters=self.adapters,
             )
             if self.stop.wait(self.update_period):
                 break
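
For context, a minimal, self-contained sketch of the announcement pattern this patch introduces. It is not part of the diff above and not the Petals API: `AnnouncerThread` and `publish` are placeholder names standing in for `ModuleAnnouncerThread` and `declare_active_modules`, and the `ServerInfo` below is a simplified stand-in for `petals.data_structures.ServerInfo`. The point it illustrates: one mutable `ServerInfo` object is shared between the server and its announcer thread, so state flips (JOINING -> ONLINE -> OFFLINE) and refreshed fields like `cache_tokens_left` are picked up on the next periodic re-announce without restarting the thread.

import dataclasses
import threading
import time
from typing import Callable, Optional, Tuple


@dataclasses.dataclass
class ServerInfo:  # simplified stand-in for petals.data_structures.ServerInfo
    state: str
    throughput: float
    adapters: Tuple[str, ...] = ()
    cache_tokens_left: Optional[int] = None


class AnnouncerThread(threading.Thread):  # hypothetical; mimics ModuleAnnouncerThread
    def __init__(self, server_info: ServerInfo, publish: Callable[[ServerInfo], None], update_period: float):
        super().__init__(daemon=True)
        self.server_info, self.publish, self.update_period = server_info, publish, update_period
        self.stop = threading.Event()

    def run(self) -> None:
        while True:
            # Re-publish the *current* contents of the shared ServerInfo;
            # in Petals this is declare_active_modules(dht, uids, server_info, ...).
            self.publish(self.server_info)
            if self.stop.wait(self.update_period):  # sleep, but wake early if stopped
                break


info = ServerInfo(state="JOINING", throughput=100.0)
announcer = AnnouncerThread(info, publish=print, update_period=0.1)
announcer.start()
info.state = "ONLINE"  # the mutation is visible to the announcer on its next cycle
time.sleep(0.25)
announcer.stop.set()
announcer.join()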