From 398a384075d17aad1ded769b876e659d8c15802a Mon Sep 17 00:00:00 2001 From: justheuristic Date: Wed, 19 Jul 2023 13:08:52 +0300 Subject: [PATCH] Inherit bitsandbytes compute dtype correctly (override peft quirk) (#377) --- src/petals/client/routing/sequence_manager.py | 3 +-- src/petals/utils/peft.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/petals/client/routing/sequence_manager.py b/src/petals/client/routing/sequence_manager.py index 5b1ab3f..9230185 100644 --- a/src/petals/client/routing/sequence_manager.py +++ b/src/petals/client/routing/sequence_manager.py @@ -212,7 +212,6 @@ class RemoteSequenceManager: end_index: int, *, cache_tokens_needed: Optional[int], - overhead_coeff: float = 1.82, # Backend overhead (empirically measured) overhead_delay: float = 0.018, # Serialization overhead (empirically measured) default_inference_rps: float = 300, # If inference RPS unknown alloc_delay: float = 10, # If not enough cache left, we penalize the edge @@ -266,7 +265,7 @@ class RemoteSequenceManager: inference_rps = span.server_info.inference_rps if inference_rps is None: inference_rps = default_inference_rps - graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), overhead_coeff / inference_rps) + graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), 1.0 / inference_rps) return graph diff --git a/src/petals/utils/peft.py b/src/petals/utils/peft.py index bbad779..23661ae 100644 --- a/src/petals/utils/peft.py +++ b/src/petals/utils/peft.py @@ -198,6 +198,7 @@ def create_lora_adapter(block, quant_type: QuantType): child.out_features, **kwargs, ) + lora_wrapped_child.compute_dtype = child.compute_dtype else: bias = hasattr(child, "bias") and child.bias is not None lora_wrapped_child = LoraLinear(