petals/tests/test_aux_functions.py

import subprocess
import sys

import pytest
import torch
from hivemind import nested_compare, nested_flatten

from petals import AutoDistributedConfig
from petals.server.throughput import measure_compute_rps
from petals.utils.convert_block import QuantType
from petals.utils.misc import DUMMY, is_dummy
from petals.utils.packaging import pack_args_kwargs, unpack_args_kwargs
from test_utils import MODEL_NAME


def test_bnb_not_imported_when_unnecessary():
    """
    Check that a plain ``import petals`` does not load bitsandbytes.

    bitsandbytes doesn't always find the correct CUDA libs and may raise exceptions because of that,
    so we avoid importing it unless it is really used.

    If this test fails, move the import of bitsandbytes and/or petals.utils.peft out of the module's
    top level and into the function/method that actually needs it. Doing so won't slow the code down:
    importing a module for the 2nd time doesn't rerun the module code.
    """

    # The check runs in a separate interpreter so that it inspects that process's own sys.modules
    # rather than the modules already loaded by the current test session.
    probe_code = "import petals, sys; assert 'bitsandbytes' not in sys.modules"
    command = [sys.executable, "-c", probe_code]
    subprocess.check_call(command)


@pytest.mark.forked
@pytest.mark.parametrize("inference", [False, True])
@pytest.mark.parametrize("n_tokens", [1, 16])
@pytest.mark.parametrize("tensor_parallel", [False, True])
def test_compute_throughput(inference: bool, n_tokens: int, tensor_parallel: bool):
    """
    Smoke-test ``measure_compute_rps`` on CPU for every parametrized combination.

    The test only verifies that the measurement returns a positive float;
    the tensor-parallel variants are skipped for configs whose model type is not "bloom".
    """
    config = AutoDistributedConfig.from_pretrained(MODEL_NAME)
    if tensor_parallel:
        if config.model_type != "bloom":
            pytest.skip("Tensor parallelism is implemented only for BLOOM for now")

    # Two "cpu" entries request tensor parallelism; an empty tuple is passed otherwise.
    if tensor_parallel:
        tp_devices = ("cpu", "cpu")
    else:
        tp_devices = ()

    measure_options = {
        "device": torch.device("cpu"),
        "dtype": torch.bfloat16,
        "quant_type": QuantType.NONE,
        "tensor_parallel_devices": tp_devices,
        "n_tokens": n_tokens,
        "n_steps": 5,
        "inference": inference,
    }
    rps = measure_compute_rps(config, **measure_options)

    assert isinstance(rps, float)
    assert rps > 0


@pytest.mark.forked
def test_pack_inputs():
    """
    Round-trip nested args/kwargs through ``pack_args_kwargs`` and ``unpack_args_kwargs``.

    The inputs mix tensors, ``DUMMY``, ``None``, strings and nested containers, with some
    tensors appearing more than once. The test expects exactly 5 flat tensors after packing
    and an unpacked result that matches the original both in structure and leaf by leaf.
    """
    ones = torch.ones(3)
    arange = torch.arange(5)
    dummy = DUMMY

    args = (ones, dummy, None, (arange, arange), dummy)
    kwargs = {
        "foo": torch.zeros(1, 1),
        "bar": {"l": "i", "g": "h", "t": ("y", "e", "a", "r", torch.rand(1), ones, arange)},
    }

    packed = pack_args_kwargs(*args, **kwargs)
    flat_tensors, args_structure = packed

    assert len(flat_tensors) == 5
    for tensor in flat_tensors:
        assert isinstance(tensor, torch.Tensor)

    restored_args, restored_kwargs = unpack_args_kwargs(flat_tensors, args_structure)

    assert len(restored_args) == len(args)
    assert torch.all(restored_args[0] == ones).item()
    assert restored_args[2] is None

    # Structure first, then a leaf-by-leaf comparison of the flattened values.
    source_tree = (args, kwargs)
    restored_tree = (restored_args, restored_kwargs)
    assert nested_compare(source_tree, restored_tree)

    source_leaves = nested_flatten(source_tree)
    restored_leaves = nested_flatten(restored_tree)
    for before, after in zip(source_leaves, restored_leaves):
        if not isinstance(before, torch.Tensor):
            assert before == after
            continue
        # Tensors need an element-wise comparison reduced to a single truth value.
        assert torch.all(before == after)
Test that bitsandbytes is not imported when it's not used (#351) We avoid importing bitsandbytes when it's not used, since bitsandbytes doesn't always find correct CUDA libs and may raise exceptions because of that. 10 months ago			`import subprocess`
			`import sys`

Bump transformers to 4.25.1 (#151) - latest accelerate, transformers, huggingface_hub - rearrange attention caches to support https://github.com/huggingface/transformers/pull/18344 - remove unused code - fix edge case where session crashes when receiving seq length 0 - assert transformer version when importing WrappedBloomBlock Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago			`import pytest`
			`import torch`
Add customizable input tensors (#445) 9 months ago			`from hivemind import nested_compare, nested_flatten`
Bump transformers to 4.25.1 (#151) - latest accelerate, transformers, huggingface_hub - rearrange attention caches to support https://github.com/huggingface/transformers/pull/18344 - remove unused code - fix edge case where session crashes when receiving seq length 0 - assert transformer version when importing WrappedBloomBlock Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago
Add LLaMA support (#323) This PR: 1. Abolishes the model conversion procedure. Now, models are downloaded directly from original repositories like https://huggingface.co/bigscience/bloom. Servers download only shards with blocks to be hosted, and clients download only shards with input/output embeddings and layernorms. - BLOOM is loaded from `bigscience/bloom`, but we use the DHT prefix `bigscience/bloom-petals` for backward compatibility. Same with smaller BLOOMs and BLOOMZ. - LLaMA can be loaded from any repo like `username/llama-65b-hf`, but we use the DHT prefix `llama-65b-hf` (without the username) to accomodate blocks from different repos (there're a few of them with minor differences, such as `Llama` vs. `LLaMA` in the class name). 2. Refactors the client to generalize it for multiple models. Now, we have `petals.models` packages that contain model-specific code (e.g. `petals.models.bloom`, `petals.models.llama`). General code (e.g. CPU-efficient LM head, p-tuning) is kept in `petals.client`. 3. Introduces `WrappedLlamaBlock`, `DistributedLlamaConfig`, `DistributedLlamaForCausalLM`, `DistributedLlamaForSequenceClassification`, and `DistributedLlamaModel` compatible with Petals functionality (p-tuning, adapters, etc.). 4. Introduces `AutoDistributedConfig` that automatically chooses the correct config class (`DistributedLlamaConfig` or `DistributedBloomConfig`). The refactored configs contain all model-specific info for both clients and servers. Upgrade instructions: - Remove disk caches for blocks in old (converted) format to save disk space. That is, remove `~/.cache/petals/model--bigscience--bloom-petals` and `~/.cache/petals/model--bigscience--bloomz-petals` directories (if present). 11 months ago			`from petals import AutoDistributedConfig`
Speed up loading blocks using init with meta weights (#285) * Init WrappedBloomBlock with meta weights --------- Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`from petals.server.throughput import measure_compute_rps`
Support loading blocks in 4-bit (QLoRA NF4 format, disabled by default) (#333) 11 months ago			`from petals.utils.convert_block import QuantType`
Add customizable input tensors (#445) 9 months ago			`from petals.utils.misc import DUMMY, is_dummy`
			`from petals.utils.packaging import pack_args_kwargs, unpack_args_kwargs`
Speed up loading blocks using init with meta weights (#285) * Init WrappedBloomBlock with meta weights --------- Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`from test_utils import MODEL_NAME`
Bump transformers to 4.25.1 (#151) - latest accelerate, transformers, huggingface_hub - rearrange attention caches to support https://github.com/huggingface/transformers/pull/18344 - remove unused code - fix edge case where session crashes when receiving seq length 0 - assert transformer version when importing WrappedBloomBlock Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago

Test that bitsandbytes is not imported when it's not used (#351) We avoid importing bitsandbytes when it's not used, since bitsandbytes doesn't always find correct CUDA libs and may raise exceptions because of that. 10 months ago			`def test_bnb_not_imported_when_unnecessary():`
			`"""`
			`We avoid importing bitsandbytes when it's not used,`
			`since bitsandbytes doesn't always find correct CUDA libs and may raise exceptions because of that.`

			`If this test fails, please change your code to import bitsandbytes and/or petals.utils.peft`
			`in the function's/method's code when it's actually needed instead of importing them in the beginning of the file.`
			`This won't slow down the code - importing a module for the 2nd time doesn't rerun module code.`
			`"""`

			`subprocess.check_call([sys.executable, "-c", "import petals, sys; assert 'bitsandbytes' not in sys.modules"])`


Bump transformers to 4.25.1 (#151) - latest accelerate, transformers, huggingface_hub - rearrange attention caches to support https://github.com/huggingface/transformers/pull/18344 - remove unused code - fix edge case where session crashes when receiving seq length 0 - assert transformer version when importing WrappedBloomBlock Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago			`@pytest.mark.forked`
Report inference, forward, and network RPS separately (#358) Inference RPS may be very different from forward RPS. E.g., currently bnb uses a completely different algorithm for NF4 inference. We report detailed RPS info that can be then used for shortest-path routing for inference. 10 months ago			`@pytest.mark.parametrize("inference", [False, True])`
			`@pytest.mark.parametrize("n_tokens", [1, 16])`
Add local tensor-parallel fwd/bwd (#143) This pull request adds an option to run Petals server on multiple local GPUs. It uses https://github.com/BlackSamorez/tensor_parallel - 8bit approximation error same as in main (mean~=2% q0.9~=5%) - TP=1, 2, 3 (see screenshots above) - forward, grad w.r.t. input and inference exact match with main with TP=1 - `>=`80% GPU utilization with 3x 1080ti, batch = 8 tokens - throughput measured with and without TP - TP on 1080Tis has near-linear speedup comparable to the benchmarks (see first message) Co-authored-by: Iaroslav Lisniak <yalisnyak@nes.ru> Co-authored-by: Andrei Panferov <andrei@blacksamorez.ru> Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`@pytest.mark.parametrize("tensor_parallel", [False, True])`
Report inference, forward, and network RPS separately (#358) Inference RPS may be very different from forward RPS. E.g., currently bnb uses a completely different algorithm for NF4 inference. We report detailed RPS info that can be then used for shortest-path routing for inference. 10 months ago			`def test_compute_throughput(inference: bool, n_tokens: int, tensor_parallel: bool):`
Add LLaMA support (#323) This PR: 1. Abolishes the model conversion procedure. Now, models are downloaded directly from original repositories like https://huggingface.co/bigscience/bloom. Servers download only shards with blocks to be hosted, and clients download only shards with input/output embeddings and layernorms. - BLOOM is loaded from `bigscience/bloom`, but we use the DHT prefix `bigscience/bloom-petals` for backward compatibility. Same with smaller BLOOMs and BLOOMZ. - LLaMA can be loaded from any repo like `username/llama-65b-hf`, but we use the DHT prefix `llama-65b-hf` (without the username) to accomodate blocks from different repos (there're a few of them with minor differences, such as `Llama` vs. `LLaMA` in the class name). 2. Refactors the client to generalize it for multiple models. Now, we have `petals.models` packages that contain model-specific code (e.g. `petals.models.bloom`, `petals.models.llama`). General code (e.g. CPU-efficient LM head, p-tuning) is kept in `petals.client`. 3. Introduces `WrappedLlamaBlock`, `DistributedLlamaConfig`, `DistributedLlamaForCausalLM`, `DistributedLlamaForSequenceClassification`, and `DistributedLlamaModel` compatible with Petals functionality (p-tuning, adapters, etc.). 4. Introduces `AutoDistributedConfig` that automatically chooses the correct config class (`DistributedLlamaConfig` or `DistributedBloomConfig`). The refactored configs contain all model-specific info for both clients and servers. Upgrade instructions: - Remove disk caches for blocks in old (converted) format to save disk space. That is, remove `~/.cache/petals/model--bigscience--bloom-petals` and `~/.cache/petals/model--bigscience--bloomz-petals` directories (if present). 11 months ago			`config = AutoDistributedConfig.from_pretrained(MODEL_NAME)`
Test Llama, rebalancing, throughput eval, and all CLI scripts (#452) This PR extends CI to: 1. Test Llama code using [TinyLlama-v0](https://huggingface.co/Maykeye/TinyLLama-v0). 2. Test rebalancing (sets up a situation where the 1st server needs to change its original position). 3. Check if benchmark scripts run (in case someone breaks its code). Note that the benchmark results are meaningless here (since they're measured on a tiny swarm of CPU servers, with low `--n_steps`). 4. Test `petals.cli.run_dht`. 5. Increase swap space and watch free RAM (a common issue is that actions are cancelled without explanation if there's not enough RAM - so it's a useful reminder + debug tool). 6. Fix flapping tests for bloom-560m by increasing tolerance. Other minor changes: fix `--help` messages to show defaults, fix docs, tune rebalancing constants. 9 months ago			`if tensor_parallel and config.model_type != "bloom":`
			`pytest.skip("Tensor parallelism is implemented only for BLOOM for now")`

Add local tensor-parallel fwd/bwd (#143) This pull request adds an option to run Petals server on multiple local GPUs. It uses https://github.com/BlackSamorez/tensor_parallel - 8bit approximation error same as in main (mean~=2% q0.9~=5%) - TP=1, 2, 3 (see screenshots above) - forward, grad w.r.t. input and inference exact match with main with TP=1 - `>=`80% GPU utilization with 3x 1080ti, batch = 8 tokens - throughput measured with and without TP - TP on 1080Tis has near-linear speedup comparable to the benchmarks (see first message) Co-authored-by: Iaroslav Lisniak <yalisnyak@nes.ru> Co-authored-by: Andrei Panferov <andrei@blacksamorez.ru> Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`tensor_parallel_devices = ("cpu", "cpu") if tensor_parallel else ()`
Switch to speedtest-cli (#157) This pullrequest removes custom speed_test code in favour of speedtest-cli module. This is necessary to ensure that random warnings / print-outs do not mess with our outputs. Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago			`compute_rps = measure_compute_rps(`
Add local tensor-parallel fwd/bwd (#143) This pull request adds an option to run Petals server on multiple local GPUs. It uses https://github.com/BlackSamorez/tensor_parallel - 8bit approximation error same as in main (mean~=2% q0.9~=5%) - TP=1, 2, 3 (see screenshots above) - forward, grad w.r.t. input and inference exact match with main with TP=1 - `>=`80% GPU utilization with 3x 1080ti, batch = 8 tokens - throughput measured with and without TP - TP on 1080Tis has near-linear speedup comparable to the benchmarks (see first message) Co-authored-by: Iaroslav Lisniak <yalisnyak@nes.ru> Co-authored-by: Andrei Panferov <andrei@blacksamorez.ru> Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`config,`
			`device=torch.device("cpu"),`
			`dtype=torch.bfloat16,`
Support loading blocks in 4-bit (QLoRA NF4 format, disabled by default) (#333) 11 months ago			`quant_type=QuantType.NONE,`
Add local tensor-parallel fwd/bwd (#143) This pull request adds an option to run Petals server on multiple local GPUs. It uses https://github.com/BlackSamorez/tensor_parallel - 8bit approximation error same as in main (mean~=2% q0.9~=5%) - TP=1, 2, 3 (see screenshots above) - forward, grad w.r.t. input and inference exact match with main with TP=1 - `>=`80% GPU utilization with 3x 1080ti, batch = 8 tokens - throughput measured with and without TP - TP on 1080Tis has near-linear speedup comparable to the benchmarks (see first message) Co-authored-by: Iaroslav Lisniak <yalisnyak@nes.ru> Co-authored-by: Andrei Panferov <andrei@blacksamorez.ru> Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> 1 year ago			`tensor_parallel_devices=tensor_parallel_devices,`
Report inference, forward, and network RPS separately (#358) Inference RPS may be very different from forward RPS. E.g., currently bnb uses a completely different algorithm for NF4 inference. We report detailed RPS info that can be then used for shortest-path routing for inference. 10 months ago			`n_tokens=n_tokens,`
			`n_steps=5,`
			`inference=inference,`
Bump transformers to 4.25.1 (#151) - latest accelerate, transformers, huggingface_hub - rearrange attention caches to support https://github.com/huggingface/transformers/pull/18344 - remove unused code - fix edge case where session crashes when receiving seq length 0 - assert transformer version when importing WrappedBloomBlock Co-authored-by: Alexander Borzunov <borzunov.alexander@gmail.com> Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago			`)`
Switch to speedtest-cli (#157) This pullrequest removes custom speed_test code in favour of speedtest-cli module. This is necessary to ensure that random warnings / print-outs do not mess with our outputs. Co-authored-by: Max Ryabinin <mryabinin0@gmail.com> 1 year ago			`assert isinstance(compute_rps, float) and compute_rps > 0`
Add customizable input tensors (#445) 9 months ago

			`@pytest.mark.forked`
			`def test_pack_inputs():`
			`x = torch.ones(3)`
			`y = torch.arange(5)`
			`z = DUMMY`

			`args = (x, z, None, (y, y), z)`
			`kwargs = dict(foo=torch.zeros(1, 1), bar={"l": "i", "g": "h", "t": ("y", "e", "a", "r", torch.rand(1), x, y)})`

			`flat_tensors, args_structure = pack_args_kwargs(*args, **kwargs)`

			`assert len(flat_tensors) == 5`
			`assert all(isinstance(t, torch.Tensor) for t in flat_tensors)`

			`restored_args, restored_kwargs = unpack_args_kwargs(flat_tensors, args_structure)`

			`assert len(restored_args) == len(args)`
			`assert torch.all(restored_args[0] == x).item() and restored_args[2] is None`
			`assert nested_compare((args, kwargs), (restored_args, restored_kwargs))`
			`for original, restored in zip(nested_flatten((args, kwargs)), nested_flatten((restored_args, restored_kwargs))):`
			`if isinstance(original, torch.Tensor):`
			`assert torch.all(original == restored)`
			`else:`
			`assert original == restored`