Inherit bitsandbytes compute dtype correctly (override peft quirk) (#377)

pull/378/head
justheuristic 10 months ago committed by GitHub
parent 5a8de2f1f8
commit 398a384075
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -212,7 +212,6 @@ class RemoteSequenceManager:
end_index: int,
*,
cache_tokens_needed: Optional[int],
-overhead_coeff: float = 1.82, # Backend overhead (empirically measured)
overhead_delay: float = 0.018, # Serialization overhead (empirically measured)
default_inference_rps: float = 300, # If inference RPS unknown
alloc_delay: float = 10, # If not enough cache left, we penalize the edge
@@ -266,7 +265,7 @@ class RemoteSequenceManager:
inference_rps = span.server_info.inference_rps
if inference_rps is None:
inference_rps = default_inference_rps
-graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), overhead_coeff / inference_rps)
+graph.add_edge((span.peer_id, block_idx), (span.peer_id, block_idx + 1), 1.0 / inference_rps)
return graph

@@ -198,6 +198,7 @@ def create_lora_adapter(block, quant_type: QuantType):
child.out_features,
**kwargs,
)
+lora_wrapped_child.compute_dtype = child.compute_dtype
else:
bias = hasattr(child, "bias") and child.bias is not None
lora_wrapped_child = LoraLinear(

Loading…
Cancel
Save