|
|
@ -186,7 +186,7 @@ class Server:
|
|
|
|
|
|
|
|
|
|
|
|
# For attention cache in GPU or RAM
|
|
|
|
# For attention cache in GPU or RAM
|
|
|
|
if attn_cache_tokens is None:
|
|
|
|
if attn_cache_tokens is None:
|
|
|
|
attn_cache_tokens = 32768 if is_multiquery_attn else 2048
|
|
|
|
attn_cache_tokens = 32768 if is_multiquery_attn else 8192
|
|
|
|
cache_values_per_block = 2 * self.block_config.hidden_size * attn_cache_tokens
|
|
|
|
cache_values_per_block = 2 * self.block_config.hidden_size * attn_cache_tokens
|
|
|
|
cache_values_per_block //= self.block_config.num_key_value_groups
|
|
|
|
cache_values_per_block //= self.block_config.num_key_value_groups
|
|
|
|
self._cache_bytes_per_block = cache_values_per_block * torch.finfo(self.torch_dtype).bits // 8
|
|
|
|
self._cache_bytes_per_block = cache_values_per_block * torch.finfo(self.torch_dtype).bits // 8
|
|
|
|