|
|
|
@ -47,12 +47,7 @@ jobs:
|
|
|
|
|
# The `${{ ... }}` expression is expanded by GitHub Actions before the shell
# runs: ADAPTER_NAME becomes 'artek0chumak/bloom-560m-safe-peft' when
# matrix.model is 'bigscience/bloom-560m', and the empty string otherwise.
export ADAPTER_NAME="${{ matrix.model == 'bigscience/bloom-560m' && 'artek0chumak/bloom-560m-safe-peft' || '' }}"
|
|
|
|
|
# Expanded by GitHub Actions before the shell runs: TENSOR_PARALLEL_ARGS is
# '--tensor_parallel_devices cpu cpu' when matrix.model is
# 'bigscience/bloom-560m', and the empty string otherwise.
export TENSOR_PARALLEL_ARGS="${{ matrix.model == 'bigscience/bloom-560m' && '--tensor_parallel_devices cpu cpu' || '' }}"
|
|
|
|
|
|
|
|
|
|
# [Step 1] Watch free RAM (lack of RAM is a common issue in CI)
|
|
|
|
|
|
|
|
|
|
# Background bash process that prints memory usage (`free -h`) and then
# sleeps 30 seconds, repeating for as long as the process lives.
bash -c 'while true; do free -h && sleep 30s; done' &
|
|
|
|
|
# `$!` is the PID of the most recently started background job; it is stored
# so the watcher can be signalled later.
RAM_WATCH_PID=$!
|
|
|
|
|
|
|
|
|
|
# [Step 2] Set up a tiny test swarm (see https://github.com/bigscience-workshop/petals/wiki/Launch-your-own-swarm)
|
|
|
|
|
# [Step 1] Set up a tiny test swarm (see https://github.com/bigscience-workshop/petals/wiki/Launch-your-own-swarm)
|
|
|
|
|
|
|
|
|
|
# Start `petals.cli.run_dht` in the background (trailing `&`) with the identity
# file tests/bootstrap.id and the host multiaddress /ip4/127.0.0.1/tcp/31337.
# `&>` sends both stdout and stderr to bootstrap.log.
python -m petals.cli.run_dht \
|
|
|
|
|
--identity_path tests/bootstrap.id --host_maddrs /ip4/127.0.0.1/tcp/31337 &> bootstrap.log &
|
|
|
|
@ -95,11 +90,11 @@ jobs:
|
|
|
|
|
# Fixed 30-second pause before the liveness check below.
sleep 30 # wait for servers to eval throughput, download layers, and rebalance
|
|
|
|
|
# Signal 0 delivers nothing: `kill -0` only reports, through its exit status,
# whether every listed PID can still be signalled. The PID variables are
# assigned outside this excerpt.
kill -0 $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all peers survived init
|
|
|
|
|
|
|
|
|
|
# [Step 3] Run PyTest
|
|
|
|
|
# [Step 2] Run PyTest
|
|
|
|
|
|
|
|
|
|
# Run everything under tests/ in verbose mode (-v). `--durations=0` combined
# with `--durations-min=1.0` reports the duration of every test that took at
# least 1.0 s.
pytest tests --durations=0 --durations-min=1.0 -v
|
|
|
|
|
|
|
|
|
|
# [Step 4] Check if benchmarks work (their results here are meaningless since it's a tiny swarm of CPU servers)
|
|
|
|
|
# [Step 3] Check if benchmarks work (their results here are meaningless since it's a tiny swarm of CPU servers)
|
|
|
|
|
|
|
|
|
|
# Run the inference benchmark script with float32 and a sequence length of 3.
# $MODEL_NAME and $INITIAL_PEERS are defined outside this excerpt and are
# expanded unquoted, so they undergo word splitting.
# NOTE(review): presumably $INITIAL_PEERS may hold several space-separated
# addresses, which would make the missing quotes intentional; confirm before
# quoting.
python benchmarks/benchmark_inference.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \
|
|
|
|
|
--seq_len 3
|
|
|
|
@ -110,9 +105,9 @@ jobs:
|
|
|
|
|
# Run the training benchmark script with float32, sequence length 3, batch
# size 3, pre_seq_len 1, a single step (--n_steps 1) and the causal_lm task.
# $MODEL_NAME and $INITIAL_PEERS are defined outside this excerpt and are
# expanded unquoted, so they undergo word splitting.
python benchmarks/benchmark_training.py --model $MODEL_NAME --initial_peers $INITIAL_PEERS --torch_dtype float32 \
|
|
|
|
|
--seq_len 3 --batch_size 3 --pre_seq_len 1 --n_steps 1 --task causal_lm
|
|
|
|
|
|
|
|
|
|
# [Step 5] Clean up
|
|
|
|
|
# [Step 4] Clean up
|
|
|
|
|
|
|
|
|
|
# Signal 0 delivers nothing: `kill -0` exits non-zero if any listed PID can no
# longer be signalled, so a peer that died during the tests is detected here.
kill -0 $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID # ensure all peers survived tests
|
|
|
|
|
|
|
|
|
|
# Send SIGINT to the bootstrap node, the four servers, the logger and the RAM
# watcher. $LOGGER_PID is assigned outside this excerpt.
# NOTE(review): this command and the next `kill -s SIGINT` differ only in
# $RAM_WATCH_PID; they look like the before/after sides of a diff. Confirm
# which one is live.
kill -s SIGINT $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID $LOGGER_PID $RAM_WATCH_PID
|
|
|
|
|
# Same SIGINT request without the RAM watcher PID.
kill -s SIGINT $BOOTSTRAP_PID $SERVER1_PID $SERVER2_PID $SERVER3_PID $SERVER4_PID $LOGGER_PID
|
|
|
|
|
# Final marker printed once the preceding commands have run.
echo "Done!"
|
|
|
|
|