import subprocess
import sys

import pytest
import torch
from hivemind import nested_compare, nested_flatten

from petals import AutoDistributedConfig
from petals.server.throughput import measure_compute_rps
from petals.utils.convert_block import QuantType
from petals.utils.misc import DUMMY, is_dummy
from petals.utils.packaging import pack_args_kwargs, unpack_args_kwargs
from test_utils import MODEL_NAME


def test_bnb_not_imported_when_unnecessary():
    """
    We avoid importing bitsandbytes when it's not used,
    since bitsandbytes doesn't always find the correct CUDA libs and may raise exceptions because of that.

    If this test fails, please change your code to import bitsandbytes and/or petals.utils.peft
    inside the function or method where they are actually needed instead of importing them at the beginning of the file.
    This won't slow down the code - importing a module for the 2nd time doesn't rerun module code.
    """

    subprocess.check_call([sys.executable, "-c", "import petals, sys; assert 'bitsandbytes' not in sys.modules"])
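

# A minimal, hypothetical sketch (not used by the tests) of the lazy-import pattern that the docstring of
# test_bnb_not_imported_when_unnecessary asks for: defer `import bitsandbytes` into the function that needs it,
# so that `import petals` (or importing this module) alone never loads the library.
def _load_bitsandbytes_when_needed():
    import bitsandbytes as bnb  # deferred import: executed only when this helper is actually called

    return bnb

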
@pytest.mark.forked
@pytest.mark.parametrize("inference", [False, True])
@pytest.mark.parametrize("n_tokens", [1, 16])
@pytest.mark.parametrize("tensor_parallel", [False, True])
def test_compute_throughput(inference: bool, n_tokens: int, tensor_parallel: bool):
    config = AutoDistributedConfig.from_pretrained(MODEL_NAME)
    if tensor_parallel and config.model_type != "bloom":
        pytest.skip("Tensor parallelism is implemented only for BLOOM for now")

    tensor_parallel_devices = ("cpu", "cpu") if tensor_parallel else ()
    compute_rps = measure_compute_rps(
        config,
        device=torch.device("cpu"),
        dtype=torch.bfloat16,
        quant_type=QuantType.NONE,
        tensor_parallel_devices=tensor_parallel_devices,
        n_tokens=n_tokens,
        n_steps=5,
        inference=inference,
    )
    assert isinstance(compute_rps, float) and compute_rps > 0


@pytest.mark.forked
def test_pack_inputs():
    x = torch.ones(3)
    y = torch.arange(5)
    z = DUMMY

    args = (x, z, None, (y, y), z)
    kwargs = dict(foo=torch.zeros(1, 1), bar={"l": "i", "g": "h", "t": ("y", "e", "a", "r", torch.rand(1), x, y)})

    flat_tensors, args_structure = pack_args_kwargs(*args, **kwargs)
    # 5 *unique* tensors are expected (x, y, DUMMY, foo, and the random tensor inside bar):
    # repeated tensor objects are packed only once.
    assert len(flat_tensors) == 5
    assert all(isinstance(t, torch.Tensor) for t in flat_tensors)

    restored_args, restored_kwargs = unpack_args_kwargs(flat_tensors, args_structure)

    assert len(restored_args) == len(args)
    assert torch.all(restored_args[0] == x).item() and restored_args[2] is None
    assert nested_compare((args, kwargs), (restored_args, restored_kwargs))
    for original, restored in zip(nested_flatten((args, kwargs)), nested_flatten((restored_args, restored_kwargs))):
        if isinstance(original, torch.Tensor):
            assert torch.all(original == restored)
        else:
            assert original == restored