Don't prioritize short inference steps (it's too much given that they use a merged pool)

pull/464/head
Aleksandr Borzunov, 10 months ago
parent 5ed96a44b1
commit fe88505e59

@@ -196,7 +196,7 @@ async def iterate_rpc_inference(
             hidden_states,
             hypo_ids,
             points=point_per_piece,
             requested_uids=requested_uids,
-            type="short_inference" if can_merge_pools else "inference",
+            type="inference",
         )
         # A client may pass a tensor with 0 tokens. This is a special case that occurs, e.g.
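For illustration, the call in this hunk can be exercised in isolation. The snippet below is a sketch with placeholder tensors standing in for the handler's real state; the import path and the placeholder values are assumptions, only the keyword arguments and the prioritize() signature come from this diff.

import torch

# Assumed import path; the diff does not show file locations.
from petals.server.task_prioritizer import DummyTaskPrioritizer

prioritizer = DummyTaskPrioritizer()
priority = prioritizer.prioritize(
    torch.zeros(1, 4, 1024),      # placeholder hidden states
    torch.arange(1),              # placeholder hypo_ids
    points=0.0,                   # placeholder for point_per_piece
    requested_uids=("block.0",),  # placeholder for requested_uids
    type="inference",             # after this commit, the only inference type
)
print(priority)  # 1.0 -> sorted ahead of forward/backward tasks (2.0)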

@@ -14,9 +14,7 @@ class TaskPrioritizerBase(ABC):
 class DummyTaskPrioritizer(TaskPrioritizerBase):
     def prioritize(self, *input: torch.Tensor, points: float = 0.0, **kwargs) -> float:
-        # Inference steps (especially short ones) go first since they are more latency-sensitive
-        if kwargs.get("type") == "short_inference":
-            return 1.0
+        # Inference steps go first since they are more latency-sensitive
         if kwargs.get("type") == "inference":
-            return 2.0
-        return 3.0  # Forward, backward
+            return 1.0
+        return 2.0  # Forward, backward
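To see the effect of the new priority values end to end, here is a self-contained sketch that pairs the post-change rule with a toy min-heap queue. The real server pool is not shown in this diff, so the queue below is only an assumption about how the returned value is consumed (lower value = dequeued earlier); the priority rule itself is copied from the hunk above, with the tensor arguments omitted for brevity.

import heapq
import itertools

def prioritize(**kwargs) -> float:
    # Post-change rule: inference steps go first since they are more latency-sensitive.
    if kwargs.get("type") == "inference":
        return 1.0
    return 2.0  # Forward, backward

counter = itertools.count()  # tie-breaker keeps insertion order within a priority class
queue = []
for task_type in ["backward", "inference", "forward", "inference"]:
    heapq.heappush(queue, (prioritize(type=task_type), next(counter), task_type))

while queue:
    _, _, task_type = heapq.heappop(queue)
    print(task_type)  # inference, inference, backward, forward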
