From 398a384075d17aad1ded769b876e659d8c15802a Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Wed, 19 Jul 2023 13:08:52 +0300
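Subject: [PATCH] Inherit bitsandbytes compute dtype correctly (override peft
 quirk) (#377)

Copy compute_dtype from the original child module onto its LoRA-wrapped
replacement in create_lora_adapter.

Also drop the overhead_coeff parameter in RemoteSequenceManager, so the
weight of a same-peer edge between consecutive blocks is now
1.0 / inference_rps.

---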
 src/petals/client/routing/sequence_manager.py | 3 +--
 src/petals/utils/peft.py                      | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/petals/client/routing/sequence_manager.py b/src/petals/client/routing/sequence_manager.py
index 5b1ab3f..9230185 100644
--- a/src/petals/client/routing/sequence_manager.py
+++ b/src/petals/client/routing/sequence_manager.py
@@ -212,7 +212,6 @@ class RemoteSequenceManager:
         end_index: int,
         *,
         cache_tokens_needed: Optional[int],
-        overhead_coeff: float = 1.82,  # Backend overhead (empirically measured)
         overhead_delay: float = 0.018,  # Serialization overhead (empirically measured)
         default_inference_rps: float = 300,  # If inference RPS unknown
         alloc_delay: float = 10,  # If not enough cache left, we penalize the edge
@@ -266,7 +265,7 @@ class RemoteSequenceManager:
                 inference_rps = span.server_info.inference_rps
                 if inference_rps is None:
                     inference_rps = default_inference_rps
-                graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), overhead_coeff / inference_rps)
+                graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), 1.0 / inference_rps)
 
         return graph
 
diff --git a/src/petals/utils/peft.py b/src/petals/utils/peft.py
index bbad779..23661ae 100644
--- a/src/petals/utils/peft.py
+++ b/src/petals/utils/peft.py
@@ -198,6 +198,7 @@ def create_lora_adapter(block, quant_type: QuantType):
                     child.out_features,
                     **kwargs,
                 )
+                lora_wrapped_child.compute_dtype = child.compute_dtype
             else:
                 bias = hasattr(child, "bias") and child.bias is not None
                 lora_wrapped_child = LoraLinear(