add minimalistic benchmarks

2 years ago · a798ea04a6
parent e3a7d5af30
commit a798ea04a6
4 changed files with 83 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -21,3 +21,14 @@ pip install bitsandbytes-cuda113==0.26.0
 pip install https://github.com/learning-at-home/hivemind/archive/dac8940c324dd612d89c773b51a53e4a04c59064.zip
 pip install https://github.com/huggingface/transformers/archive/224bde91caff4ccfd12277ab5e9bf97c61e22ee9.zip
 ```
+
+
+# tests
+
+```bash
+# run one bloom block for a few steps
+python -m cli.inference_one_block --config cli/config.json  # run with --help to see other args
+
+# minimalistic server
+python -m cli.run_server --block_config bigscience/bloom-6b3 --num_blocks 2
+```
--- a/cli/config.json
+++ b/cli/config.json
@ -0,0 +1,20 @@
+{
+  "apply_residual_connection_post_layernorm": false,
+  "attention_dropout": 0.0,
+  "attention_softmax_in_fp32": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_dropout": 0.0,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "masked_softmax_fusion": true,
+  "model_type": "bloom",
+  "n_embed": 14336,
+  "n_layer": 70,
+  "num_attention_heads": 112,
+  "pretraining_tp": 4,
+  "slow_but_exact": false,
+  "transformers_version": "4.20.0.dev0",
+  "use_cache": true,
+  "vocab_size": 250880
+}
--- a/cli/inference_one_block.py
+++ b/cli/inference_one_block.py
@ -0,0 +1,52 @@
+import argparse
+
+import torch
+from hivemind.utils.logging import use_hivemind_log_handler, get_logger
+
+from src.bloom.model import DistributedBloomConfig
+from src.bloom.block import BloomBlock
+from src.bloom.ops import build_alibi_tensor
+from tqdm.auto import trange
+
+
+# NOTE(review): presumably attaches hivemind's log handler to the root logger so that messages from this
+# script share hivemind's formatting -- confirm against hivemind.utils.logging
+use_hivemind_log_handler("in_root_logger")
+# Module-level logger used by print_device_info below; named after this file's path
+logger = get_logger(__file__)
+
+
+def print_device_info(device=None):
+    """Log the name and current memory usage of a torch device.
+
+    Adapted from https://stackoverflow.com/a/53374933/12891528
+
+    :param device: anything accepted by ``torch.device`` (e.g. ``"cpu"``, ``"cuda"``, ``"cuda:1"``);
+        when falsy, "cuda" is used if CUDA is available and "cpu" otherwise.
+    """
+    device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
+    logger.info(f"Using device: {device}")
+
+    # Additional info is only reported when using cuda
+    if device.type != "cuda":
+        return
+
+    # Query the selected device instead of a hard-coded index 0, so that e.g. "cuda:1" reports its own
+    # statistics; a bare "cuda" (no index) resolves to the current device.
+    logger.info(torch.cuda.get_device_name(device))
+    logger.info("Memory Usage:")
+    logger.info(f"Allocated: {round(torch.cuda.memory_allocated(device) / 1024 ** 3, 1)} GB")
+    # memory_reserved is the replacement for the deprecated memory_cached
+    logger.info(f"Cached:   {round(torch.cuda.memory_reserved(device) / 1024 ** 3, 1)} GB")
+
+
+if __name__ == "__main__":
+    # Minimalistic benchmark: build one BloomBlock from a config file and run it step by step on random
+    # inputs, feeding the returned cache back in on every step, then log device memory usage.
+    parser = argparse.ArgumentParser(description="Run a single bloom block locally on dummy data")
+    parser.add_argument("--config", required=True, type=str, help="Path to a config json file")
+    parser.add_argument("--state_dict", default=None, type=str, help="Optional path to saved block state dict")
+    parser.add_argument("--layer_index", default=0, type=int, help="Layer index passed to the block constructor")
+    parser.add_argument("--num_steps", default=500, type=int, help="How many inference steps to run")
+    parser.add_argument("--device", default=None, type=str, help="Run inference on this device")
+    args = parser.parse_args()
+
+    if args.device is None:
+        args.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    config = DistributedBloomConfig.from_json_file(args.config)
+    block = BloomBlock(config, args.layer_index).to(args.device)
+
+    if args.state_dict is not None:
+        # Previously --state_dict was parsed but silently ignored; load the saved weights into the block.
+        # NOTE: torch.load unpickles the file -- only pass checkpoints that come from a trusted source.
+        block.load_state_dict(torch.load(args.state_dict, map_location=args.device))
+
+    cache = None  # no past state on the first step; afterwards, whatever the block returned last time
+
+    # Inference only: disable autograd once for the whole loop instead of re-entering it on every step
+    with torch.no_grad():
+        for i in trange(args.num_steps):
+            # one random hidden-state tensor of shape (1, 1, hidden_size) per step
+            dummy_input = torch.randn(1, 1, config.hidden_size, device=args.device)
+            # NOTE(review): i + 1 presumably is the total sequence length including cached steps -- confirm
+            alibi = build_alibi_tensor(i + 1, config.num_attention_heads).to(args.device)
+            outputs, cache = block.forward(dummy_input, alibi=alibi, use_cache=True, layer_past=cache)
+
+    print_device_info(args.device)
--- a/cli/run_server.py
+++ b/cli/run_server.py
@ -1,7 +1,3 @@
-import os, sys
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # add path to src
-
 import configargparse

 from hivemind.proto.runtime_pb2 import CompressionType