From 8c546d988a4205d440a2625523f0a6f69ee24e1c Mon Sep 17 00:00:00 2001
From: Alexander Borzunov
Date: Tue, 8 Aug 2023 19:10:27 +0400
Subject: [PATCH] Test Llama, rebalancing, throughput eval, and all CLI scripts (#452)

This PR extends CI to:

1. Test Llama code using [TinyLlama-v0](https://huggingface.co/Maykeye/TinyLLama-v0).
2. Test rebalancing (sets up a situation where the 1st server needs to change its original position).
3. Check if benchmark scripts run (in case someone breaks their code). Note that the benchmark results are meaningless here (since they're measured on a tiny swarm of CPU servers, with low `--n_steps`).
4. Test `petals.cli.run_dht`.
5. Increase swap space and watch free RAM (a common issue is that Actions jobs are cancelled without explanation if there's not enough RAM, so it's a useful reminder and debugging tool).
6. Fix flapping tests for bloom-560m by increasing tolerance.

Other minor changes: fix `--help` messages to show defaults, fix docs, tune rebalancing constants.
---
 .github/workflows/run-tests.yaml  |  93 ++++++++++++++++++++++--------
 benchmarks/benchmark_forward.py   |  18 +++---
 benchmarks/benchmark_inference.py |  14 ++---
 benchmarks/benchmark_training.py  |  24 ++++----
 src/petals/cli/run_dht.py         |   8 ++-
 src/petals/cli/run_server.py      |   2 +-
 src/petals/server/server.py       |   6 +-
 tests/{test.id => bootstrap.id}   | Bin
 tests/server2.id                  | Bin 0 -> 1197 bytes
 tests/test_aux_functions.py       |   3 +
 tests/test_block_exact_match.py   |   4 +-
 tests/test_chained_calls.py       |   6 +-
 tests/test_full_model.py          |  57 +++++++++---------
 tests/test_remote_sequential.py   |  12 ++--
 tests/test_sequence_manager.py    |   4 +-
 tests/test_server_stats.py        |   8 ++-
 tests/test_tensor_parallel.py     |   5 +-
 17 files changed, 161 insertions(+), 103 deletions(-)
 rename tests/{test.id => bootstrap.id} (100%)
 create mode 100644 tests/server2.id

diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml
index 3bccda3..735fd2a 100644
--- a/.github/workflows/run-tests.yaml
+++ b/.github/workflows/run-tests.yaml
@@ -10,10 +10,20 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10', '3.11' ]
+        include:
+          - { model: 'bigscience/bloom-560m', python-version: '3.8' }
+          - { model: 'bigscience/bloom-560m', python-version: '3.9' }
+          - { model: 'bigscience/bloom-560m', python-version: '3.10' }
+          - { model: 'bigscience/bloom-560m', python-version: '3.11' }
+          - { model: 'Maykeye/TinyLLama-v0', python-version: '3.8' }
+          - { model: 'Maykeye/TinyLLama-v0', python-version: '3.11' }
       fail-fast: false
     timeout-minutes: 15
     steps:
+      - name: Increase swap space
+        uses: pierotofy/set-swap-space@master
+        with:
+          swap-size-gb: 10
       - name: Checkout
         uses: actions/checkout@v3
       - name: Set up Python
@@ -31,44 +41,77 @@ jobs:
         pip install .[dev]
       - name: Test
         run: |
-          export MODEL_NAME=bigscience/bloom-560m
-          export REF_NAME=bigscience/bloom-560m
-          export ADAPTER_NAME=artek0chumak/bloom-560m-safe-peft
-
-          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:12 \
-            --new_swarm --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1 \
-            --torch_dtype float32 --compression NONE --attn_cache_tokens 2048 --max_chunk_size_bytes 1024 \
-            --adapters $ADAPTER_NAME &> server1.log &
-          SERVER1_PID=$!
+          export MODEL_NAME="${{ matrix.model }}"
+          export REF_NAME="${{ matrix.model }}"
+          export ADAPTER_NAME="${{ matrix.model == 'bigscience/bloom-560m' && 'artek0chumak/bloom-560m-safe-peft' || '' }}"
+          export TENSOR_PARALLEL_ARGS="${{ matrix.model == 'bigscience/bloom-560m' && '--tensor_parallel_devices cpu cpu' || '' }}"
+
+          # [Step 1] Watch free RAM (lack of RAM is a common issue in CI)
+
+          bash -c 'while true; do free -h && sleep 30s; done' &
+          RAM_WATCH_PID=$!
 
-          sleep 5  # wait for the first server to initialize DHT
+          # [Step 2] Set up a tiny test swarm (see https://github.com/bigscience-workshop/petals/wiki/Launch-your-own-swarm)
+
+          python -m petals.cli.run_dht \
+            --identity_path tests/bootstrap.id --host_maddrs /ip4/127.0.0.1/tcp/31337 &> bootstrap.log &
+          BOOTSTRAP_PID=$!
 
           export INITIAL_PEERS=/ip4/127.0.0.1/tcp/31337/p2p/QmS9KwZptnVdB9FFV7uGgaTq4sEKBwcYeKZDfSpyKDUd1g
-          # ^-- server 1 multiaddr is determined by --identity and --host_maddrs
+          # ^-- multiaddr in INITIAL_PEERS is determined by --identity_path and --host_maddrs
 
-          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 12:22 \
-            --initial_peers $INITIAL_PEERS --throughput 1 --torch_dtype float32 --adapters $ADAPTER_NAME &> server2.log &
-          SERVER2_PID=$!
+          sleep 5  # wait for DHT init
+
+          python -m petals.cli.run_server $MODEL_NAME --adapters $ADAPTER_NAME --torch_dtype float32 --num_blocks 5 \
+            --mean_balance_check_period 10 \
+            --initial_peers $INITIAL_PEERS --throughput 1 &> server1.log &
+          SERVER1_PID=$!
+          # ^-- rebalancing test: this server chooses blocks 0:5, then sees a gap in the swarm and moves there
 
-          sleep 10  # wait for initial servers to declare blocks, then let server decide which blocks to serve
+          sleep 10  # wait for the 1st server to choose blocks
 
-          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 12:15 \
-            --initial_peers $INITIAL_PEERS --throughput 1 --torch_dtype float32 --tensor_parallel_devices cpu cpu &> server3.log &
+          python -m petals.cli.run_server $MODEL_NAME --adapters $ADAPTER_NAME --torch_dtype float32 --block_indices 0:5 \
+            --identity_path tests/server2.id \
+            --initial_peers $INITIAL_PEERS --throughput 1 &> server2.log &
+          SERVER2_PID=$!
+
+          python -m petals.cli.run_server $MODEL_NAME --adapters $ADAPTER_NAME --torch_dtype float32 --num_blocks 14 \
+            --attn_cache_tokens 2048 --max_chunk_size_bytes 1024 \
+            --initial_peers $INITIAL_PEERS --throughput auto &> server3.log &
           SERVER3_PID=$!
+          # ^-- chunking test
 
-          python -m petals.cli.run_server --converted_model_name_or_path $MODEL_NAME --num_blocks 3 \
-            --initial_peers $INITIAL_PEERS --throughput 1 --torch_dtype float32 --adapters $ADAPTER_NAME &> server4.log &
+          python -m petals.cli.run_server $MODEL_NAME $TENSOR_PARALLEL_ARGS --torch_dtype float32 --block_indices 0:2 \
+            --initial_peers $INITIAL_PEERS --throughput auto &> server4.log &
           SERVER4_PID=$!
+          # ^-- tensor parallelism test (not compatible with adapters yet)
 
-          tail -n 100 -f server*.log &
+          sleep 5  # wait for the log files to appear
+
+          tail -n 100 -f bootstrap.log server*.log &
           LOGGER_PID=$!
- sleep 30 # wait for servers to download layers - kill -0 $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all servers survived init + sleep 30 # wait for servers to eval throughput, download layers, and rebalance + kill -0 $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all peers survived init + + # [Step 3] Run PyTest pytest tests --durations=0 --durations-min=1.0 -v - kill -0 $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all servers survived tests + # [Step 4] Check if benchmarks work (their results here are meaningless since it's a tiny swarm of CPU servers) + + python benchmarks/benchmark_inference.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \ + --seq_len 3 + python benchmarks/benchmark_forward.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \ + --seq_len 3 --batch_size 3 --n_steps 1 + python benchmarks/benchmark_training.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \ + --seq_len 3 --batch_size 3 --pre_seq_len 1 --n_steps 1 --task cls + python benchmarks/benchmark_training.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \ + --seq_len 3 --batch_size 3 --pre_seq_len 1 --n_steps 1 --task causal_lm + + # [Step 5] Clean up + + kill -0 $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all peers survived tests - kill -s SIGINT $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID $LOGGER_PID + kill -s SIGINT $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID $LOGGER_PID $RAM_WATCH_PID echo "Done!" diff --git a/benchmarks/benchmark_forward.py b/benchmarks/benchmark_forward.py index e95c5ec..bf547ec 100755 --- a/benchmarks/benchmark_forward.py +++ b/benchmarks/benchmark_forward.py @@ -15,15 +15,15 @@ logger = get_logger() def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default="bigscience/bloom") - parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS) - parser.add_argument("--torch_dtype", type=str, default="bfloat16") - parser.add_argument("--n_processes", type=str, default=1) - parser.add_argument("--seq_len", type=int, default=128) - parser.add_argument("--n_steps", type=int, default=100) - parser.add_argument("--batch_size", type=int, required=True) - parser.add_argument("--warmup_steps", type=int, default=1) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--model", type=str, required=True, help="Model") + parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS, help="Initial peers") + parser.add_argument("--torch_dtype", type=str, default="bfloat16", help="Torch dtype") + parser.add_argument("--n_processes", type=str, default=1, help="Number of concurrent processes") + parser.add_argument("--seq_len", type=int, default=128, help="Sequence length") + parser.add_argument("--n_steps", type=int, default=100, help="Number of benchmark steps") + parser.add_argument("--batch_size", type=int, required=True, help="Batch size") + parser.add_argument("--warmup_steps", type=int, default=1, help="Number of warmup steps") args = parser.parse_args() if args.n_processes == "n_gpus": diff --git a/benchmarks/benchmark_inference.py b/benchmarks/benchmark_inference.py index 607ff88..e894bb1 100755 --- a/benchmarks/benchmark_inference.py +++ b/benchmarks/benchmark_inference.py @@ -16,13 +16,13 @@ logger = get_logger() def 
main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default="bigscience/bloom") - parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS) - parser.add_argument("--torch_dtype", type=str, default="bfloat16") - parser.add_argument("--n_processes", type=str, default=1) - parser.add_argument("--seq_len", type=int, default=2048) - parser.add_argument("--warmup_steps", type=int, default=1) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--model", type=str, required=True, help="Model") + parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS, help="Initial peers") + parser.add_argument("--torch_dtype", type=str, default="bfloat16", help="Torch dtype") + parser.add_argument("--n_processes", type=str, default=1, help="Number of concurrent processes") + parser.add_argument("--seq_len", type=int, default=2048, help="Sequence length") + parser.add_argument("--warmup_steps", type=int, default=1, help="Number of warmup steps") args = parser.parse_args() if args.n_processes == "n_gpus": diff --git a/benchmarks/benchmark_training.py b/benchmarks/benchmark_training.py index 0853dfc..85061a3 100755 --- a/benchmarks/benchmark_training.py +++ b/benchmarks/benchmark_training.py @@ -15,18 +15,18 @@ logger = get_logger() def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, default="bigscience/bloom") - parser.add_argument("--device", type=str, default="cpu") - parser.add_argument("--task", type=str, default="cls") - parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS) - parser.add_argument("--torch_dtype", type=str, default="bfloat16") - parser.add_argument("--n_processes", type=str, default=1) - parser.add_argument("--seq_len", type=int, default=128) - parser.add_argument("--pre_seq_len", type=int, default=16) - parser.add_argument("--n_steps", type=int, default=10) - parser.add_argument("--batch_size", type=int, required=True) - parser.add_argument("--warmup_steps", type=int, default=1) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--model", type=str, required=True, help="Model") + parser.add_argument("--device", type=str, default="cpu", help="Torch device hosting the client") + parser.add_argument("--task", type=str, default="cls", help="Training task type") + parser.add_argument("--initial_peers", type=str, nargs="+", default=PUBLIC_INITIAL_PEERS, help="Initial peers") + parser.add_argument("--torch_dtype", type=str, default="bfloat16", help="Torch dtype") + parser.add_argument("--n_processes", type=str, default=1, help="Number of concurrent processes") + parser.add_argument("--seq_len", type=int, default=128, help="Sequence length") + parser.add_argument("--pre_seq_len", type=int, default=16, help="Number of trainable tokens") + parser.add_argument("--n_steps", type=int, default=10, help="Number of benchmark steps") + parser.add_argument("--batch_size", type=int, required=True, help="Batch size") + parser.add_argument("--warmup_steps", type=int, default=1, help="Number of warmup steps") args = parser.parse_args() assert args.task in ["cls", "causal_lm"] diff --git a/src/petals/cli/run_dht.py b/src/petals/cli/run_dht.py index 2f30516..777d9d0 100644 --- a/src/petals/cli/run_dht.py +++ b/src/petals/cli/run_dht.py @@ -7,8 +7,8 @@ This script may be used for launching lightweight CPU machines serving as 
bootst This may be eventually merged to the hivemind upstream. """ +import argparse import time -from argparse import ArgumentParser from secrets import token_hex from hivemind.dht import DHT, DHTNode @@ -35,7 +35,7 @@ async def report_status(dht: DHT, node: DHTNode): def main(): - parser = ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--initial_peers", nargs="*", @@ -73,7 +73,9 @@ def main(): help="Disable circuit relay functionality in libp2p (see https://docs.libp2p.io/concepts/nat/circuit-relay/)", ) parser.add_argument( - "--use_auto_relay", action="store_true", help="Look for libp2p relays to reach peers behind NATs/firewalls" + "--use_auto_relay", + action="store_true", + help="Look for libp2p relays to become reachable if we are behind NAT/firewall", ) parser.add_argument( "--refresh_period", type=int, default=30, help="Period (in seconds) for fetching the keys from DHT" diff --git a/src/petals/cli/run_server.py b/src/petals/cli/run_server.py index c82ff44..d85c8ac 100644 --- a/src/petals/cli/run_server.py +++ b/src/petals/cli/run_server.py @@ -158,7 +158,7 @@ def main(): "when connecting to the public swarm. If you connect to a private swarm, " "the check is skipped by default. Use this option only if you know what you are doing") - parser.add_argument("--adapters", nargs='+', default=(), + parser.add_argument("--adapters", nargs='*', default=(), help="List of pre-loaded LoRA adapters that can be used for inference or training") # fmt:on diff --git a/src/petals/server/server.py b/src/petals/server/server.py index 7772fa6..bf7470a 100644 --- a/src/petals/server/server.py +++ b/src/petals/server/server.py @@ -78,7 +78,7 @@ class Server: sender_threads: int = 1, balance_quality: float = 0.75, mean_balance_check_period: float = 120, - mean_block_selection_delay: float = 2.5, + mean_block_selection_delay: float = 5, token: Optional[Union[str, bool]] = None, quant_type: Optional[QuantType] = None, tensor_parallel_devices: Optional[Sequence[torch.device]] = None, @@ -201,6 +201,8 @@ class Server: assert num_blocks is None or block_indices is None, "Please specify num_blocks or block_indices, not both" if num_blocks is None and block_indices is None: num_blocks = self._choose_num_blocks() + if num_blocks is not None: + num_blocks = min(num_blocks, self.block_config.num_hidden_layers) if block_indices is not None: try: first_block_index, last_block_index = block_indices.split(":") @@ -295,7 +297,7 @@ class Server: num_blocks = min(num_blocks, self.block_config.num_hidden_layers) logger.info( - f"Server will fill all your GPU memory with {num_blocks} transformer blocks. " + f"Server will fill your GPU memory with {num_blocks} transformer blocks. 
" f"If you want to leave some free GPU memory, please specify a lesser --num_blocks manually" ) return num_blocks diff --git a/tests/test.id b/tests/bootstrap.id similarity index 100% rename from tests/test.id rename to tests/bootstrap.id diff --git a/tests/server2.id b/tests/server2.id new file mode 100644 index 0000000000000000000000000000000000000000..261555779ed4194b8a3cfb49a7759755f13b270f GIT binary patch literal 1197 zcmV;e1XBA501~JPFoFc60s#O5f&l>l*q);s;IqHH@*TeGz3p8h%ogpB@p~dfJljpA zmte=Ie-Gw8J^pzE;N;A|TySjx1ITHq&Fw2mHDu=LQ2lYRrY>DWrj4QP<2_0oz!S0g z@)w-t(L+;5N{@a1yYto5*dj`NLP!=o%-#^^PAEF0xtrMu4o6pYD^O7tSrS^2Y=S?b z&bw`eDc91oG@pLQ7mY*9HXPSlvGz~uU+CX0ot^Qp`&O>>Z7IQt%Z;o5=UtuvOr?t= z{q_OT^!9R#u?@xC_CRk!lArRsJn+Zu*k#>yxP|dgnrZX3@2(eUjw}Vy!~PsWNai@M zFE6W)xe}tvWxpx%f!JEtX3v{r0s{d60Rn;n0RXKXf^O;&+`i1Cw+q2qYu`ReXp3vg zBPDa0GoZm0s2P;=2J2?hU#U)}#%w4HD*q%12Qamk8UETGupi+4>vms2$u$w&nggMWk z`irkmy|(rs--zp8M;QL=+{)g6n)UQEATa`gfdKwc-hnoGwdV@~_{9}<5=y;u@eOicSvo%SWl}CRVNK@W zn!y03xCc>(6p@G6r94hYn_D2p9ZIR1AS6xTjxgbu-teQuBfVq#kxkN#5m-|W>I(bU zSc?LIfdJYK-1@S;1xE0$a8+)C7ho~FSe5p`prKwzXe9u06xr=CZP>H*7igmqfuv!a zMR|e0l*4N1>0`FPLft$KHMqLrLnnVTT&CS`l0E6u^hk1(_JNN{6pEjOUAh zx45S~IIH=|Y51!r_PZkwx8SC!?7G$}q4eLiKHJqSP(ke?YQ2kQahF*b_zeM>68_+z zUBpQZ1JHooP4O*Et0bYFFHq#MGvx9t7#V%nBkQO@CPax+IL=r+ z1WD4qu>ygB3(`)Nq}2?OF#$9K-LBo&kUFUC&}Y{Th>!Gqbq=~js!&4INySpskZPrk zq~R3*CSsk94xzj4bp79fApKHVMu2JLokz-%9abz$sl%NlhK~+hDRH6M00Nhd<&%bg z5t-KM&saaR*Hs2oAHq)S_KCV(;3cI<)V3lCxX{Xp0)c@5#3qS(2@Ge+yD|^y#pT(_ zHUPiXqU^@iKc0jJ(|7+UE8$9Ujgl7jofv8!K3%fbpE1yr47Mzvdk