|
|
|
@ -207,14 +207,12 @@ def measure_compute_rps(
|
|
|
|
|
elapsed = 0
|
|
|
|
|
dummy_input = torch.randn(1, n_tokens, config.hidden_size, device=device, dtype=dtype)
|
|
|
|
|
_, cache = block.forward(dummy_input, use_cache=True) # Skip the 1st step to exclude the initialization time
|
|
|
|
|
if device.type == "cuda":
|
|
|
|
|
torch.cuda.synchronize(device)
|
|
|
|
|
synchronize(device)
|
|
|
|
|
|
|
|
|
|
start_time = time.perf_counter()
|
|
|
|
|
for step in range(n_steps):
|
|
|
|
|
for _ in range(n_steps):
|
|
|
|
|
_, cache = block.forward(dummy_input, use_cache=True, layer_past=cache if inference else None)
|
|
|
|
|
if device.type == "cuda":
|
|
|
|
|
torch.cuda.synchronize(device)
|
|
|
|
|
synchronize(device)
|
|
|
|
|
elapsed = time.perf_counter() - start_time
|
|
|
|
|
device_rps = n_steps * n_tokens / elapsed
|
|
|
|
|
|
|
|
|
@ -230,8 +228,15 @@ def measure_compute_rps(
|
|
|
|
|
return device_rps
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def synchronize(device: torch.device):
|
|
|
|
|
if device.type == "cuda":
|
|
|
|
|
torch.cuda.synchronize(device)
|
|
|
|
|
elif device.type == "mps":
|
|
|
|
|
torch.mps.synchronize(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_device_name(device: torch.device) -> str:
|
|
|
|
|
return f"{torch.cuda.get_device_name(device)} GPU" if device.type == "cuda" else "CPU"
|
|
|
|
|
return f"{torch.cuda.get_device_name(device)} GPU" if device.type == "cuda" else device.type.upper()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_dtype_name(dtype: torch.dtype, quant_type: QuantType) -> str:
|
|
|
|
|