Upgrade tests

pull/464/head
Aleksandr Borzunov 10 months ago
parent 2958b3cb63
commit aa9a0cc18d

@@ -87,7 +87,7 @@ class _ServerInferenceSession:
self,
inputs: torch.Tensor,
prompts: Optional[torch.Tensor] = None,
hypo_ids: Optional[torch.LongTensor] = None,
hypo_ids: Optional[torch.Tensor] = None,
*,
step_id: str,
) -> torch.Tensor:
@@ -276,8 +276,6 @@ class InferenceSession:
return self
def step(self, inputs: torch.Tensor, prompts: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
logger.warning(f"inference_session.step: {inputs.shape=} {self.position=}")
assert not self._closed
if torch.is_grad_enabled():
logger.warning("Running inference session with grad enabled. Gradients will *not* be propagated correctly.")

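Note on the hunk above: the warning fires whenever InferenceSession.step() runs with autograd enabled. A minimal sketch of the intended call pattern follows; `model` and `embs` are illustrative names, not part of this diff.

# Hedged sketch: run the token-by-token loop without autograd so the
# "grad enabled" warning in InferenceSession.step() is never triggered.
import torch

with torch.inference_mode():
    with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
        outputs = [sess.step(embs[:, t : t + 1, :]) for t in range(embs.shape[1])]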
@@ -54,7 +54,7 @@ class RemoteSequential(nn.Module):
def forward(self, inputs: torch.Tensor, prompts: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
assert inputs.ndim == 3, "inputs must be a tensor of shape [batch_size, seq_length, hidden_size]"
if self._thread_local.active_session is None:
assert any(v is None for v in kwargs.values()), f"Extra kwargs are not supported in forward: {kwargs}"
assert all(v is None for v in kwargs.values()), f"Extra kwargs are not supported in forward: {kwargs}"
return _RemoteSequentialAutogradFunction.apply(inputs, prompts, self.sequence_manager)
else:
return self._thread_local.active_session.step(inputs, prompts, **kwargs)
@@ -65,7 +65,7 @@ class RemoteSequential(nn.Module):
@contextmanager
def use_session(self, session: InferenceSession) -> InferenceSession:
""" Inside this context, forward() will use the specified InferenceSession. """
"""Inside this context, forward() will use the specified InferenceSession."""
try:
prev_session = self._thread_local.active_session
@@ -76,7 +76,7 @@ class RemoteSequential(nn.Module):
@contextmanager
def inference_session(self, **kwargs) -> InferenceSession:
""" Inside this context, forward() will use a new InferenceSession created with given parameters. """
"""Inside this context, forward() will use a new InferenceSession created with given parameters."""
with self.use_session(InferenceSession(self.sequence_manager, **kwargs)) as session:
yield session

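Usage note for the two context managers above, as a hedged sketch (`sequence` and `hidden_states` are illustrative names): inside the context, RemoteSequential.forward() is routed to the active session's step(); outside it, calls go through _RemoteSequentialAutogradFunction.

# Hedged sketch: route forward() calls of a RemoteSequential through one session.
sess = InferenceSession(sequence.sequence_manager, max_length=128)
with sequence.use_session(sess):
    hidden_states = sequence(hidden_states)  # dispatched to sess.step(hidden_states)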
@@ -85,7 +85,11 @@ class DistributedLlamaModel(FromPretrainedMixin, PTuneMixin, LlamaModel):
hidden_states = inputs_embeds
output_shape = input_shape + (hidden_states.size(-1),)
hidden_states = self.layers(hidden_states, prompts=intermediate_prompts, hypo_ids=past_key_values.hypo_ids if past_key_values is not None else None)
hidden_states = self.layers(
hidden_states,
prompts=intermediate_prompts,
hypo_ids=past_key_values.hypo_ids if past_key_values is not None else None,
)
# Remove prefix
if self.config.tuning_mode and "ptune" in self.config.tuning_mode:

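For context on the `hypo_ids` argument threaded through above: as far as this diff shows, it is an integer index over the batch dimension taken from past_key_values, so cached states can follow reordered hypotheses (e.g. after a beam-search step). A self-contained, hedged illustration of that reordering, with made-up names and shapes:

# Hedged illustration only: select surviving hypotheses from a cached tensor
# along the batch dimension, the way an integer hypo_ids tensor would.
import torch

cache = torch.randn(4, 10, 64)           # [batch, seq_len, hidden], made-up shape
hypo_ids = torch.tensor([0, 0, 2, 3])    # hypothesis 1 dropped, hypothesis 0 duplicated
reordered = cache.index_select(0, hypo_ids)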
@@ -16,17 +16,27 @@ def tokenizer():
return transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
@pytest.mark.forked
@pytest.mark.parametrize("use_peft", (True, False) if ADAPTER_NAME else (False,))
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
@pytest.fixture(scope="module", params=[None, ADAPTER_NAME] if ADAPTER_NAME else [None])
def models(request):
active_adapter = request.param
model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME,
initial_peers=INITIAL_PEERS,
torch_dtype=torch.float32,
active_adapter=ADAPTER_NAME if use_peft else None,
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32, active_adapter=active_adapter
)
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
config = model.config
if active_adapter is not None:
ref_model = peft.PeftModel.from_pretrained(ref_model, active_adapter)
ref_model.train(False)
return model, ref_model
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(tokenizer, models, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
model, ref_model = models
assert len(model.transformer.h) == model.config.num_hidden_layers
test_inputs = tokenizer("A quick brown fox was minding its own buisness", return_tensors="pt")["input_ids"]
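The hunk above replaces the per-test `use_peft` parametrization with a module-scoped, parametrized `models` fixture. A standalone, hedged sketch of the same pytest pattern, using toy objects only:

# Hedged sketch: one fixture instance per param value, shared by all tests in the module.
import pytest

@pytest.fixture(scope="module", params=[None, "some-adapter"])
def models(request):
    active_adapter = request.param
    return f"model({active_adapter})", f"ref_model({active_adapter})"

def test_uses_models(models):
    model, ref_model = models
    assert model.startswith("model(") and ref_model.startswith("ref_model(")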
@@ -41,7 +51,7 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs = []
with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
if pass_empty_tensors:
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
for t in range(embs.shape[1]):
if t == 4:
@@ -52,8 +62,8 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
if t == 2 and pass_empty_tensors:
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
@@ -62,36 +72,15 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs, parallel_outputs, rtol=0, atol=atol_inference
), "Inference differs from forward pass"
del model, embs, recurrent_outputs
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
if use_peft:
ref_model = peft.PeftModel.from_pretrained(ref_model, ADAPTER_NAME)
ref_model.train(False)
ref_outputs = ref_model.forward(test_inputs).logits.float()
assert torch.allclose(
ref_outputs, parallel_outputs, rtol=0, atol=atol_forward
), "Outputs are not identical to HF"
@pytest.fixture
def model():
return AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
def test_greedy_generation(tokenizer, models, max_new_tokens=4):
model, ref_model = models
@pytest.fixture
def ref_model():
return transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
@pytest.mark.forked
def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
if tokenizer.pad_token_id is None:
@@ -101,13 +90,11 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
]
for inputs in [inputs_single, inputs_batch]:
logger.warning(f"test_greedy_generation: {inputs=}")
outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
assert torch.allclose(outputs, ref_outputs), f"Greedy generation is not identical to HF with {inputs.shape=}"
@pytest.mark.forked
@pytest.mark.parametrize(
"sampling_options",
[
@@ -117,7 +104,8 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
dict(do_sample=True, top_p=0.9),
],
)
def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=4):
def test_sampling(tokenizer, models, sampling_options, max_new_tokens=4):
model, ref_model = models
torch.manual_seed(0)
inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
@@ -129,17 +117,18 @@ def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=
]
for inputs in [inputs_single, inputs_batch]:
with torch.random.fork_rng():
with torch.random.fork_rng([model.device]):
outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
with torch.random.fork_rng():
with torch.random.fork_rng([ref_model.device]):
ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens)
assert torch.allclose(
outputs, ref_outputs
), f"Sampling is not identical to HF with {inputs.shape=}, {sampling_options=}"
@pytest.mark.forked
def test_beam_search_generation(tokenizer, model, ref_model, max_new_tokens=4, num_beams=2):
def test_beam_search_generation(tokenizer, models, max_new_tokens=4, num_beams=2):
model, ref_model = models
inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
outputs = model.generate(inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)

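On the fork_rng change in the sampling test above: torch.random.fork_rng() snapshots the RNG state on entry and restores it on exit, so both generate() calls start from the same random stream seeded by torch.manual_seed(0); passing a device list restricts which per-device states are forked. A self-contained, hedged sketch of the state-restoring behaviour:

# Hedged sketch: randomness consumed inside fork_rng() does not leak outside it.
import torch

torch.manual_seed(0)
before = torch.get_rng_state()
with torch.random.fork_rng(devices=[]):  # [] = fork only the CPU RNG state
    torch.rand(10)                       # consumes randomness inside the fork
assert torch.equal(before, torch.get_rng_state())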