import fcntl
import json
import os
import subprocess
import time
from hashlib import sha256
from pathlib import Path
from typing import Optional, Union

import torch
from hivemind.utils.logging import get_logger, use_hivemind_log_handler
from transformers import BloomConfig

from petals.bloom.block import WrappedBloomBlock
from petals.server.block_utils import resolve_block_dtype
from petals.utils.convert_8bit import replace_8bit_linear
from petals.utils.disk_cache import DEFAULT_CACHE_DIR

use_hivemind_log_handler("in_root_logger")
logger = get_logger(__file__)


def get_host_throughput(
    config: BloomConfig,
    device: torch.device,
    dtype: Union[str, torch.dtype],
    *,
    load_in_8bit: bool,
    force_eval: bool = False,
    cache_dir: Optional[str] = None,
) -> float:
    dtype = resolve_block_dtype(config, dtype)

    if cache_dir is None:
        cache_dir = DEFAULT_CACHE_DIR
    lock_path = Path(cache_dir, "throughput.lock")
    cache_path = Path(cache_dir, "throughput_v2.json")

    # We use the system-wide lock since only one process at a time can measure the host throughput
    os.makedirs(lock_path.parent, exist_ok=True)
    with open(lock_path, "wb") as lock_fd:
        logger.info("Loading throughput info")
        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX)
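        # Without LOCK_NB, flock(..., LOCK_EX) blocks until the current holder releases the lock,
        # so concurrent servers on the same machine measure one at a time rather than in parallel.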
        # The OS will release the lock when lock_fd is closed or the process is killed

        cache_key = f"config_{sha256(str(config).encode()).hexdigest()[-16:]}"
        cache_key += f"_device_{get_device_name(device).replace(' ', '_')}"
        cache_key += f"_dtype_{get_dtype_name(dtype, load_in_8bit)}"
        cache = {}
        try:
            if not force_eval and os.path.exists(cache_path):
                with open(cache_path) as cache_fd:
                    cache = json.load(cache_fd)
                assert isinstance(cache, dict)
        except Exception:
            logger.exception(f"Failed to read throughput info from {cache_path}")
            cache = {}

        if cache_key not in cache:
            cache[cache_key] = measure_throughput_info(config, device, dtype, load_in_8bit=load_in_8bit)

            try:
                os.makedirs(cache_path.parent, exist_ok=True)
                with open(cache_path, "w") as cache_fd:
                    json.dump(cache, cache_fd)
            except Exception:
                logger.exception(f"Failed to save throughput info in {cache_path}")

    return cache[cache_key]


def measure_throughput_info(
    config: BloomConfig,
    device: torch.device,
    dtype: torch.dtype,
    *,
    load_in_8bit: bool,
) -> float:
    """Measure network and compute throughput in forward pass tokens per second"""

    logger.info(
        "Measuring network and compute throughput. This takes about a minute and will be cached for future runs"
    )
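    # The host can serve requests no faster than its slowest resource, so the result is the
    # minimum of the network estimate and the compute estimate.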
    return min(
        measure_network_rps(config),
        measure_compute_rps(config, device, dtype, load_in_8bit=load_in_8bit),
    )


def measure_network_rps(config: BloomConfig) -> float:
    proc = subprocess.run("python3 -m petals.cli.speed_test --json", shell=True, capture_output=True)
    if proc.returncode != 0:
        raise RuntimeError(f"Failed to measure network throughput (stdout: {proc.stdout}, stderr: {proc.stderr})")
    network_info = json.loads(proc.stdout)

    bits_per_request = config.hidden_size * 16  # Clients usually send 16-bit tensors for forward/backward
    network_rps = min(network_info["download"], network_info["upload"]) / bits_per_request
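    # Worked example (assuming hidden_size = 14336, as in bigscience/bloom-petals, and a
    # symmetric 100 Mbit/s link): bits_per_request = 14336 * 16 = 229,376 bits, so
    # network_rps ≈ 100e6 / 229,376 ≈ 436 RPS.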
    logger.info(
        f"Network throughput: "
        f"{network_info['download'] / 1e6:.2f} Mbit/s on download, "
        f"{network_info['upload'] / 1e6:.2f} Mbit/s on upload, "
        f"{network_rps:.1f} RPS"
    )
    return network_rps


def measure_compute_rps(
    config: BloomConfig,
    device: torch.device,
    dtype: torch.dtype,
    *,
    load_in_8bit: bool,
    n_tokens: int = 16,
    n_steps: int = 500,
) -> float:
    with torch.inference_mode():
        block = WrappedBloomBlock(config).to(dtype)
        if load_in_8bit:
            block = replace_8bit_linear(block)
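            # replace_8bit_linear presumably swaps the block's linear layers for their 8-bit
            # counterparts, so the benchmark matches how the server will actually run this block.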
        block = block.to(device)

        cache = None
        elapsed = 0
        for step in range(n_steps + 1):
            dummy_input = torch.randn(n_tokens, 1, config.hidden_size, device=device, dtype=dtype)

            start_time = time.perf_counter()
            _, cache = block.forward(dummy_input, use_cache=True, layer_past=cache)
            if step >= 1:  # Skip the 1st step to exclude the initialization time
                elapsed += time.perf_counter() - start_time
        device_rps = n_steps * n_tokens / elapsed
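        # The loop above runs n_steps + 1 forward passes but times only the last n_steps of them,
        # so this is (timed tokens) / (timed seconds), i.e. tokens per second on this device.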
    logger.info(
        f"Forward pass throughput ({get_device_name(device)}, {get_dtype_name(dtype, load_in_8bit)}): "
        f"{device_rps:.1f} RPS"
    )
    return device_rps


def get_device_name(device: torch.device) -> str:
    return f"{torch.cuda.get_device_name(device)} GPU" if device.type == "cuda" else "CPU"


def get_dtype_name(dtype: torch.dtype, load_in_8bit: bool) -> str:
    return "8-bit" if load_in_8bit else str(dtype)
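# Example usage (a minimal sketch; the model name and exact arguments are assumptions,
# not part of this module):
#
#     config = BloomConfig.from_pretrained("bigscience/bloom-petals")
#     rps = get_host_throughput(config, torch.device("cuda"), torch.bfloat16, load_in_8bit=True)
#     print(f"Host throughput: {rps:.1f} RPS")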