petals/src/petals/client/from_pretrained.py

import contextlib
import json
import os
import re
import tempfile
import threading
from typing import List, Optional, Tuple, Union

import torch
from hivemind.utils.logging import get_logger
from transformers import BloomPreTrainedModel, modeling_utils

from petals.utils.version import get_compatible_model_repo

logger = get_logger(__name__)


class FromPretrainedMixin:
    """Mixin that overrides `from_pretrained()` to apply Petals-specific defaults.

    The call is forwarded through `super()`, so this mixin must precede a class that provides
    `from_pretrained()` in the MRO. While the parent loads the model, the patterns stored in
    `cls._keys_to_ignore_on_load_unexpected` are registered via `ignore_keys()`.
    """

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: Union[str, os.PathLike, None],
        *args,
        low_cpu_mem_usage: Optional[bool] = None,
        torch_dtype: Optional[Union[str, torch.dtype]] = None,
        **kwargs,
    ):
        # NOTE: intentionally no docstring here; `__doc__` is filled in below from the upstream docstring.
        model_name_or_path = get_compatible_model_repo(model_name_or_path)
        if low_cpu_mem_usage is None:
            low_cpu_mem_usage = True
        if torch_dtype is None:
            # torch_dtype=None gives torch.float32 in transformers>=4.26.0. In contrast,
            # torch_dtype="auto" attempts to (1) use config.torch_dtype (if exists), (2) use dtype of the weights.
            torch_dtype = "auto"

        # The patterns are only active for the duration of the parent's loading routine
        with ignore_keys(cls._keys_to_ignore_on_load_unexpected):
            return super().from_pretrained(
                model_name_or_path, *args, low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, **kwargs
            )

    # Reuse the upstream docstring, amended to mention the defaults that differ in Petals.
    _upstream_doc = BloomPreTrainedModel.from_pretrained.__doc__
    if _upstream_doc is not None:  # docstrings are stripped (None) when Python runs with `-OO`
        _patched_doc = _upstream_doc.replace(
            "low_cpu_mem_usage(`bool`, *optional*)",
            "low_cpu_mem_usage(`bool`, *optional*, defaults to `True` in Petals)",
        ).replace(
            "torch_dtype (`str` or `torch.dtype`, *optional*)",
            'torch_dtype (`str` or `torch.dtype`, *optional*, defaults to `"auto"` in Petals)',
        )
        from_pretrained.__doc__ = _patched_doc
        # A bound method takes `__doc__` from its underlying function, not from the classmethod wrapper,
        # so the function must be updated too for `Cls.from_pretrained.__doc__` to show the patched text.
        from_pretrained.__func__.__doc__ = _patched_doc
        del _patched_doc
    del _upstream_doc  # do not leak the temporary into the class namespace


# Per-thread holder for the key patterns whose checkpoint shards should be skipped.
# NOTE: attributes of a `threading.local()` are visible only in the thread that assigned them, so the
# assignment below initializes the importing thread only; in any other thread the attribute is absent
# until that thread sets it.
_shard_config = threading.local()
_shard_config.ignored_keys = None


@contextlib.contextmanager
def ignore_keys(patterns: List[str]):
    """Set the current thread's ignored-key patterns for the duration of the `with` block.

    The previous value is restored on exit (including on exceptions), so the context manager can be nested.

    :param patterns: regex patterns stored in `_shard_config.ignored_keys` while the block is active
    """
    # Read the previous value *before* entering `try`: a thread that never set the attribute would otherwise
    # raise AttributeError here and then UnboundLocalError in `finally`, hiding the original problem.
    prev_patterns = getattr(_shard_config, "ignored_keys", None)
    _shard_config.ignored_keys = patterns
    try:
        yield
    finally:
        _shard_config.ignored_keys = prev_patterns


def patched_get_checkpoint_shard_files(
    pretrained_model_name_or_path, index_filename, *args, **kwargs
) -> Tuple[List[str], dict]:
    """Same as modeling_utils.get_checkpoint_shard_files(), but does not download shards for the ignored keys.

    When the current thread has ignored-key patterns set (see `ignore_keys()`), the checkpoint index is rewritten
    without the parameters matching any pattern, and the original function is called on the rewritten index.
    Otherwise, the call is forwarded unchanged.

    :param pretrained_model_name_or_path: forwarded as is to the original function
    :param index_filename: path to the JSON checkpoint index containing a "weight_map" entry
    :returns: whatever the original `get_checkpoint_shard_files()` returns
    """

    # `getattr` with a default: the attribute of a `threading.local()` is missing in threads that never set it
    ignored_patterns = getattr(_shard_config, "ignored_keys", None)
    if ignored_patterns is None:
        return original_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs)

    with open(index_filename, encoding="utf-8") as f:
        index = json.load(f)
    n_original_shards = len(set(index["weight_map"].values()))

    compiled_patterns = [re.compile(pattern) for pattern in ignored_patterns]
    index["weight_map"] = {
        param_name: filename
        for param_name, filename in index["weight_map"].items()
        if not any(pattern.search(param_name) for pattern in compiled_patterns)
    }
    n_loaded_shards = len(set(index["weight_map"].values()))
    logger.debug(f"Loading {n_loaded_shards} shards out of {n_original_shards}")

    # The temporary directory must stay alive until the original function has read the patched index
    with tempfile.TemporaryDirectory() as tempdir:
        # Replace the original index with a patched JSON, where ignored keys are removed
        patched_index_filename = os.path.join(tempdir, "pytorch_model.bin.index.json")
        with open(patched_index_filename, "w", encoding="utf-8") as f:
            json.dump(index, f)

        return original_get_checkpoint_shard_files(
            pretrained_model_name_or_path, patched_index_filename, *args, **kwargs
        )


# Install the patch into `transformers.modeling_utils`, keeping a handle to the genuine upstream function.
# If this module is executed again (e.g. via `importlib.reload()`), `modeling_utils` already holds a previously
# installed wrapper. Unwrap it, so that `original_get_checkpoint_shard_files` never points to a wrapper
# (otherwise, the wrapper would end up calling itself until RecursionError).
_installed_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
original_get_checkpoint_shard_files = getattr(
    _installed_get_checkpoint_shard_files, "_petals_original", _installed_get_checkpoint_shard_files
)
patched_get_checkpoint_shard_files._petals_original = original_get_checkpoint_shard_files
modeling_utils.get_checkpoint_shard_files = patched_get_checkpoint_shard_files
Add LLaMA support (#323) This PR: 1. Abolishes the model conversion procedure. Now, models are downloaded directly from original repositories like https://huggingface.co/bigscience/bloom. Servers download only shards with blocks to be hosted, and clients download only shards with input/output embeddings and layernorms. - BLOOM is loaded from `bigscience/bloom`, but we use the DHT prefix `bigscience/bloom-petals` for backward compatibility. Same with smaller BLOOMs and BLOOMZ. - LLaMA can be loaded from any repo like `username/llama-65b-hf`, but we use the DHT prefix `llama-65b-hf` (without the username) to accommodate blocks from different repos (there are a few of them with minor differences, such as `Llama` vs. `LLaMA` in the class name). 2. Refactors the client to generalize it for multiple models. Now, we have `petals.models` packages that contain model-specific code (e.g. `petals.models.bloom`, `petals.models.llama`). General code (e.g. CPU-efficient LM head, p-tuning) is kept in `petals.client`. 3. Introduces `WrappedLlamaBlock`, `DistributedLlamaConfig`, `DistributedLlamaForCausalLM`, `DistributedLlamaForSequenceClassification`, and `DistributedLlamaModel` compatible with Petals functionality (p-tuning, adapters, etc.). 4. Introduces `AutoDistributedConfig` that automatically chooses the correct config class (`DistributedLlamaConfig` or `DistributedBloomConfig`). The refactored configs contain all model-specific info for both clients and servers. Upgrade instructions: - Remove disk caches for blocks in old (converted) format to save disk space. That is, remove `~/.cache/petals/model--bigscience--bloom-petals` and `~/.cache/petals/model--bigscience--bloomz-petals` directories (if present). 11 months ago			`import contextlib`
			`import json`
			`import os`
			`import re`
			`import tempfile`
			`import threading`
			`from typing import List, Optional, Tuple, Union`

			`import torch`
			`from hivemind.utils.logging import get_logger`
			`from transformers import BloomPreTrainedModel, modeling_utils`

			`from petals.utils.version import get_compatible_model_repo`

			`logger = get_logger(__name__)`


			`class FromPretrainedMixin:`
			`@classmethod`
			`def from_pretrained(`
			`cls,`
			`model_name_or_path: Union[str, os.PathLike, None],`
			`*args,`
			`low_cpu_mem_usage: Optional[bool] = None,`
			`torch_dtype: Optional[Union[str, torch.dtype]] = None,`
			`**kwargs,`
			`):`
			`model_name_or_path = get_compatible_model_repo(model_name_or_path)`
			`if low_cpu_mem_usage is None:`
			`low_cpu_mem_usage = True`
			`if torch_dtype is None:`
			`# torch_dtype=None gives torch.float32 in transformers>=4.26.0. In contrast,`
			`# torch_dtype="auto" attempts to (1) use config.torch_dtype (if exists), (2) use dtype of the weights.`
			`torch_dtype = "auto"`

			`with ignore_keys(cls._keys_to_ignore_on_load_unexpected):`
			`return super().from_pretrained(`
			`model_name_or_path, args, low_cpu_mem_usage=low_cpu_mem_usage, torch_dtype=torch_dtype, *kwargs`
			`)`

			`from_pretrained.__doc__ = BloomPreTrainedModel.from_pretrained.__doc__.replace(`
			"low_cpu_mem_usage(`bool`, optional)",
			"low_cpu_mem_usage(`bool`, optional, defaults to `True` in Petals)",
			`).replace(`
			"torch_dtype (`str` or `torch.dtype`, optional)",
			'torch_dtype (`str` or `torch.dtype`, optional, defaults to `"auto"` in Petals)',
			`)`


			`_shard_config = threading.local()`
			`_shard_config.ignored_keys = None`


			`@contextlib.contextmanager`
			`def ignore_keys(patterns: List[str]):`
			`try:`
			`prev_patterns = _shard_config.ignored_keys`
			`_shard_config.ignored_keys = patterns`
			`yield`
			`finally:`
			`_shard_config.ignored_keys = prev_patterns`


			`def patched_get_checkpoint_shard_files(`
			`pretrained_model_name_or_path, index_filename, args, *kwargs`
			`) -> Tuple[List[str], dict]:`
			`"""Same as modeling_utils.get_checkpoint_shard_files(), but does not download shards for the ignored keys."""`

			`should_ignore_keys = _shard_config.ignored_keys is not None`
			`tempdir_ctx = tempfile.TemporaryDirectory() if should_ignore_keys else contextlib.nullcontext()`
			`with tempdir_ctx as tempdir:`
			`if should_ignore_keys:`
			`with open(index_filename) as f:`
			`index = json.load(f)`
			`n_original_shards = len(set(index["weight_map"].values()))`

			`index["weight_map"] = {`
			`param_name: filename`
			`for param_name, filename in index["weight_map"].items()`
			`if all(re.search(pattern, param_name) is None for pattern in _shard_config.ignored_keys)`
			`}`
			`n_loaded_shards = len(set(index["weight_map"].values()))`
			`logger.debug(f"Loading {n_loaded_shards} shards out of {n_original_shards}")`

			`# Replace the original index with a patched JSON, where ignored keys are removed`
			`index_filename = os.path.join(tempdir, "pytorch_model.bin.index.json")`
			`with open(index_filename, "w") as f:`
			`json.dump(index, f)`

			`return original_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, args, *kwargs)`


			`original_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files`
			`modeling_utils.get_checkpoint_shard_files = patched_get_checkpoint_shard_files`