Add automated tests (#23)
This PR will run basic tests automatically on each subsequent PR:

- convert a small model on every PR
- run existing tests on every PR
- enforce black / isort
- require checks on merge
- make sure tests are not flaky

Co-authored-by: Alexander Borzunov <hxrussia@gmail.com>
Co-authored-by: Dmitry Baranchuk <dmitrybaranchuk@gmail.com>
parent f5463812ad
commit e2711a033b
@@ -0,0 +1,26 @@
name: Check style

on:
  push:
    branches: [ master ]
  pull_request:

jobs:
  black:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: psf/black@stable
        with:
          options: "--check --diff"
          version: "22.3.0"
  isort:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: 3.8
      - uses: isort/isort-action@master
        with:
          isortVersion: "5.10.1"
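For reference, the black job above amounts to running `black --check --diff .` over the repo. A minimal sketch (not part of this PR) of the same check through black's Python API, assuming black 22.3.0 as pinned in requirements-dev.txt:

import black

# A deliberately mis-formatted snippet; black normalizes quotes and spacing.
source = "x = { 'a':1 }\n"
formatted = black.format_str(source, mode=black.Mode(line_length=120))
assert formatted == 'x = {"a": 1}\n'

In CI the job fails whenever the formatted output differs from the committed source, which is exactly what `--check --diff` reports.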
@@ -0,0 +1,89 @@
name: Tests

on:
  push:
    branches: [ master ]
  pull_request:

jobs:
  convert-model:
    runs-on: ubuntu-latest
    env:
      BLOOM_TESTING_WRITE_TOKEN: ${{ secrets.BLOOM_TESTING_WRITE_TOKEN }}
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.9
      - name: Cache dependencies
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: Key-v1-py3.9-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Delete previous model, if exists
        run: |
          python -c "from huggingface_hub import delete_repo; delete_repo(token='$BLOOM_TESTING_WRITE_TOKEN', \
            name='test-bloomd-350m-$GITHUB_HEAD_REF', organization='bloom-testing')" || true
      - name: Convert model and push to hub
        run: |
          python -m cli.convert_model --model bigscience/bloom-350m --output_path ./converted_model \
            --output_repo bloom-testing/test-bloomd-350m-$GITHUB_HEAD_REF --use_auth_token $BLOOM_TESTING_WRITE_TOKEN

  run-tests:
    runs-on: ubuntu-latest
    needs: convert-model
    strategy:
      matrix:
        python-version: [ 3.7, 3.8, 3.9 ]
      fail-fast: false
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Cache dependencies
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: Key-v1-${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r requirements-dev.txt
      - name: Test
        run: |
          export MODEL_NAME=bloom-testing/test-bloomd-350m-$GITHUB_HEAD_REF
          python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 0:12 \
            --torch_dtype float32 --identity tests/test.id --host_maddrs /ip4/127.0.0.1/tcp/31337 --throughput 1 &
          SERVER1_PID=$!

          export INITIAL_PEERS=/ip4/127.0.0.1/tcp/31337/p2p/QmS9KwZptnVdB9FFV7uGgaTq4sEKBwcYeKZDfSpyKDUd1g
          # ^-- server 1 multiaddr is determined by --identity and --host_maddrs

          python -m cli.run_server --converted_model_name_or_path $MODEL_NAME --block_indices 12:24 \
            --torch_dtype float32 --initial_peers $INITIAL_PEERS --throughput 1 &> server2.log &
          SERVER2_PID=$!

          sleep 30  # wait for server to download layers

          # test individual blocks
          export PYTHONPATH=.
          BLOCK_UID=$MODEL_NAME.0 REF_NAME=$MODEL_NAME REF_INDEX=0 pytest tests/test_block_exact_match.py
          BLOCK_UID=$MODEL_NAME.19 REF_NAME=$MODEL_NAME REF_INDEX=19 pytest tests/test_block_exact_match.py

          REF_NAME=$MODEL_NAME pytest tests/test_chained_calls.py

          REF_NAME=bigscience/bloom-350m pytest tests/test_full_model.py

          kill -s SIGINT $SERVER1_PID $SERVER2_PID
          echo "Done!"
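One judgment call worth flagging against the "tests are not flaky" goal: the fixed `sleep 30` above can both under-wait (layers not downloaded yet) and over-wait. A hedged alternative sketch, not part of this PR, that polls the first server's TCP port (31337, as set via --host_maddrs) using only the Python standard library; note that an open port proves the server process is up, not that every layer finished downloading:

import socket
import time

def wait_for_port(host: str, port: int, timeout: float = 300.0) -> None:
    """Poll until a TCP port accepts connections, instead of a fixed sleep."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return
        except OSError:
            time.sleep(2)  # server not up yet; retry
    raise TimeoutError(f"{host}:{port} did not come up within {timeout}s")

wait_for_port("127.0.0.1", 31337)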
(File diff suppressed because it is too large.)
@@ -0,0 +1,10 @@
[tool.black]
line-length = 120
required-version = "22.3.0"

[tool.isort]
profile = "black"
line_length = 120
combine_as_imports = true
combine_star = true
known_local_folder = ["tests", "cli"]
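To illustrate the isort settings (the imports below are hypothetical, not from the repo): with `combine_as_imports = true`, aliased imports from the same module are merged onto one line, while `profile = "black"` keeps the result compatible with black's output style:

# Before isort:
from src.client import remote_model as rm
from src.client import remote_block as rb

# After isort with combine_as_imports = true:
from src.client import remote_block as rb, remote_model as rm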
@@ -0,0 +1,6 @@
pytest==6.2.5  # see https://github.com/pytest-dev/pytest/issues/9621
pytest-forked
pytest-asyncio==0.16.0
black==22.3.0
isort==5.10.1
psutil
@@ -0,0 +1,6 @@
torch==1.12.0
accelerate==0.10.0
huggingface-hub==0.7.0
bitsandbytes-cuda113==0.26.0
https://github.com/learning-at-home/hivemind/archive/d42c70331da43667da6d9020666df54806d8b561.zip
https://github.com/huggingface/transformers/archive/6589e510fa4e6c442059de2fab84752535de9b23.zip
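Note that huggingface-hub is pinned to 0.7.0, whose `delete_repo` still accepts the `name=` / `organization=` arguments used by the "Delete previous model" CI step above. A hedged sketch of the same call against the newer single-`repo_id` API (repo id and token below are placeholders, not from this PR):

from huggingface_hub import delete_repo

# Equivalent call on huggingface_hub >= 0.8; values are placeholders.
delete_repo(repo_id="bloom-testing/test-bloomd-350m-<branch>", token="<write-token>")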
@@ -1,4 +1,4 @@
from src.client.remote_block import RemoteTransformerBlock, RemoteTransformerBlockInferenceSession
from src.client.remote_model import DistributedBloomConfig, DistributedBloomForCausalLM, DistributedBloomModel
from src.client.remote_sequence_info import RemoteSequenceInfo
from src.client.remote_sequential import RemoteSequential
from src.client.sequence_manager import RemoteSequenceManager
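The diff above re-exports the package's public client API. A purely hypothetical usage sketch, not from this PR; it assumes these classes follow the transformers-style `from_pretrained` convention their names suggest:

from src.client import DistributedBloomForCausalLM

# Hypothetical: load a converted model by hub name as a transformers-style model.
model = DistributedBloomForCausalLM.from_pretrained("bigscience/test-bloomd-6b3")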
Binary file not shown.
@@ -0,0 +1,97 @@
######
# Warning: this test is a work in progress. It will be modified soon.
# - if you want more stable tests, see test_block_exact_match
# - if you want to figure out chained inference, ask yozh

import os

import hivemind
import torch
import transformers
from hivemind.moe.expert_uid import UID_DELIMITER, ExpertInfo

from src.bloom.from_pretrained import load_pretrained_block
from src.client.remote_block import RemoteTransformerBlock
from src.dht_utils import get_remote_module

INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
if not INITIAL_PEERS:
    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
INITIAL_PEERS = INITIAL_PEERS.split()


MODEL_NAME = os.environ.get("MODEL_NAME")
if not MODEL_NAME:
    raise RuntimeError("Must specify MODEL_NAME as a name of a model to be tested")

REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")


def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq_length=1):
    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
    config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
    remote_block = get_remote_module(dht, f"{MODEL_NAME}{UID_DELIMITER}0")
    assert remote_block is not None, f"Could not find {MODEL_NAME}{UID_DELIMITER}0 in DHT"
    assert isinstance(remote_block, RemoteTransformerBlock)

    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
    remote_block._info = ExpertInfo(f"{MODEL_NAME}.3 {MODEL_NAME}.4 {MODEL_NAME}.5", remote_block._info.peer_id)

    ref_blocks = [
        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 5, torch_dtype=torch.float32),
    ]
    inputs = torch.randn(1, seq_length, config.hidden_size, requires_grad=True)
    outputs_rpc = remote_block.forward(inputs)[0]
    outputs_rpc.sum().backward()
    grads_rpc = inputs.grad

    inputs.grad = None
    hidden_states = inputs
    for ref_block in ref_blocks:
        hidden_states = ref_block.forward(hidden_states)[0]
    outputs_ref = hidden_states
    outputs_ref.sum().backward()
    grads_ref = inputs.grad

    assert torch.allclose(outputs_ref, outputs_rpc, rtol=0, atol=atol_forward)
    assert torch.allclose(grads_ref, grads_rpc, rtol=0, atol=atol_backward)


def test_chained_inference_exact_match(atol_inference=1e-4):
    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
    config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
    remote_block = get_remote_module(dht, f"{MODEL_NAME}{UID_DELIMITER}0")
    assert remote_block is not None, f"Could not find {MODEL_NAME}{UID_DELIMITER}0 in DHT"
    assert isinstance(remote_block, RemoteTransformerBlock)

    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
    remote_block._info = ExpertInfo(f"{MODEL_NAME}.3 {MODEL_NAME}.4", remote_block._info.peer_id)

    inputs = torch.randn(1, 8, config.hidden_size)

    outputs_inference = []
    with remote_block.inference_session() as sess:
        for i in range(inputs.shape[1]):
            outputs_inference.append(sess.step(inputs[:, i : i + 1, :]))
    outputs_inference = torch.cat(outputs_inference, dim=1)

    ref_blocks = [
        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
    ]
    outputs_ref = []
    caches = [None, None]
    for i in range(inputs.shape[1]):
        new_caches = []
        hidden_states = inputs[:, i : i + 1, :]
        for ref_block, cache in zip(ref_blocks, caches):
            with torch.no_grad():
                hidden_states, new_cache = ref_block.forward(hidden_states, use_cache=True, layer_past=cache)
                new_caches.append(new_cache)

        outputs_ref.append(hidden_states)
        caches = new_caches
    outputs_ref = torch.cat(outputs_ref, dim=1)
    assert torch.allclose(outputs_ref, outputs_inference, rtol=0, atol=atol_inference)
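The new test file reads its configuration from environment variables, matching how the CI "Test" step invokes pytest. A minimal local-run sketch (the peer multiaddr and model name below are placeholders, not from this PR):

import os
import subprocess

# Placeholders: use the multiaddr printed by your running cli.run_server instance.
os.environ["INITIAL_PEERS"] = "/ip4/127.0.0.1/tcp/31337/p2p/<peer-id>"
os.environ["MODEL_NAME"] = "bloom-testing/test-bloomd-350m-<branch>"
os.environ["REF_NAME"] = os.environ["MODEL_NAME"]  # CI points the reference at the same model

subprocess.run(["pytest", "tests/test_chained_calls.py"], check=True)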
@@ -1,59 +0,0 @@
######
# Warning: this test is a work in progress. It will be modified soon.
# - if you want more stable tests, see test_block_exact_match
# - if you want to figure out chained inference, ask yozh

import os

import hivemind
import torch
from hivemind.moe.expert_uid import ExpertInfo

from src.bloom.from_pretrained import load_pretrained_block
from src.client.remote_block import RemoteTransformerBlock
from src.dht_utils import get_remote_module

INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
if not INITIAL_PEERS:
    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
INITIAL_PEERS = INITIAL_PEERS.split()


BLOCK_UID = os.environ.get("BLOCK_UID")
if not BLOCK_UID:
    raise RuntimeError("Must specify BLOCK_UID as an index of a transformer block to be tested")

REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")


# seq_length > 128: rpc_forward_stream & rpc_backward_stream
# seq_length <= 128: rpc_forward & rpc_backward
def test_forward_backward_exact_match(atol_forward=1e-4, atol_backward=1e-4, seq_length=1):
    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
    (remote_block,) = get_remote_module(dht, BLOCK_UID)
    assert remote_block is not None, f"Could not find {BLOCK_UID} in DHT"
    assert isinstance(remote_block, RemoteTransformerBlock)

    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
    remote_block._info = ExpertInfo("bloom6b3.3 bloom6b3.4 bloom6b3.5", remote_block._info.peer_id)

    ref_blocks = [
        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 5, torch_dtype=torch.float32),
    ]
    inputs = torch.randn(1, seq_length, 4096, requires_grad=True)
    outputs_rpc = remote_block.forward(inputs)[0]
    outputs_rpc.sum().backward()
    grads_rpc = inputs.grad

    inputs.grad = None
    hidden_states = inputs
    for ref_block in ref_blocks:
        hidden_states = ref_block.forward(hidden_states)[0]
    outputs_ref = hidden_states
    outputs_ref.sum().backward()
    grads_ref = inputs.grad

    assert torch.allclose(outputs_ref, outputs_rpc, rtol=0, atol=atol_forward)
    assert torch.allclose(grads_ref, grads_rpc, rtol=0, atol=atol_backward)
@@ -1,64 +0,0 @@
######
# Warning: this test is a work in progress. It will be modified soon.
# - if you want more stable tests, see test_block_exact_match
# - if you want to figure out chained inference, ask yozh

import os

import hivemind
import torch
from hivemind.moe.expert_uid import ExpertInfo

from src.bloom.from_pretrained import load_pretrained_block
from src.client.remote_block import RemoteTransformerBlock
from src.dht_utils import get_remote_module

INITIAL_PEERS = os.environ.get("INITIAL_PEERS")
if not INITIAL_PEERS:
    raise RuntimeError("Must specify INITIAL_PEERS environment variable with one or more peer ids")
INITIAL_PEERS = INITIAL_PEERS.split()


BLOCK_UID = os.environ.get("BLOCK_UID")
if not BLOCK_UID:
    raise RuntimeError("Must specify BLOCK_UID as an index of a transformer block to be tested")

REF_NAME = os.environ.get("REF_NAME", "bigscience/test-bloomd-6b3")
REF_INDEX = int(os.environ.get("REF_INDEX", BLOCK_UID[-1].split(".")[-1]))


def test_remote_block_exact_match(atol_inference=1e-4):
    dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=True, start=True)
    remote_block = get_remote_module(dht, BLOCK_UID)
    assert remote_block is not None, f"Could not find {BLOCK_UID} in DHT"
    assert isinstance(remote_block, RemoteTransformerBlock)

    _ = remote_block.info  # lazy-init info now, because otherwise we will _break_ info init by changing _info
    remote_block._info = ExpertInfo("bloom6b3.3 bloom6b3.4", remote_block._info.peer_id)

    inputs = torch.randn(1, 8, 4096)

    outputs_inference = []
    with remote_block.inference_session() as sess:
        for i in range(inputs.shape[1]):
            outputs_inference.append(sess.step(inputs[:, i : i + 1, :]))
    outputs_inference = torch.cat(outputs_inference, dim=1)

    ref_blocks = [
        load_pretrained_block(REF_NAME, 3, torch_dtype=torch.float32),
        load_pretrained_block(REF_NAME, 4, torch_dtype=torch.float32),
    ]
    outputs_ref = []
    caches = [None, None]
    for i in range(inputs.shape[1]):
        new_caches = []
        hidden_states = inputs[:, i : i + 1, :]
        for ref_block, cache in zip(ref_blocks, caches):
            with torch.no_grad():
                hidden_states, new_cache = ref_block.forward(hidden_states, use_cache=True, layer_past=cache)
                new_caches.append(new_cache)

        outputs_ref.append(hidden_states)
        caches = new_caches
    outputs_ref = torch.cat(outputs_ref, dim=1)
    assert torch.allclose(outputs_ref, outputs_inference, rtol=0, atol=atol_inference)