Upgrade tests

pull/464/head
Aleksandr Borzunov 10 months ago
parent 2958b3cb63
commit aa9a0cc18d

@@ -87,7 +87,7 @@ class _ServerInferenceSession:
self,
inputs: torch.Tensor,
prompts: Optional[torch.Tensor] = None,
hypo_ids: Optional[torch.LongTensor] = None,
hypo_ids: Optional[torch.Tensor] = None,
*,
step_id: str,
) -> torch.Tensor:
@@ -276,8 +276,6 @@ class InferenceSession:
return self
def step(self, inputs: torch.Tensor, prompts: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
logger.warning(f"inference_session.step: {inputs.shape=} {self.position=}")
assert not self._closed
if torch.is_grad_enabled():
logger.warning("Running inference session with grad enabled. Gradients will *not* be propagated correctly.")

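Note on the hunk above: the warning fires whenever InferenceSession.step() runs with autograd enabled. A minimal sketch of the intended call pattern follows; `model` and `embs` are illustrative names, not part of this diff.

# Hedged sketch: run the token-by-token loop without autograd so the
# "grad enabled" warning in InferenceSession.step() is never triggered.
import torch

with torch.inference_mode():
    with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
        outputs = [sess.step(embs[:, t : t + 1, :]) for t in range(embs.shape[1])]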
@@ -54,7 +54,7 @@ class RemoteSequential(nn.Module):
def forward(self, inputs: torch.Tensor, prompts: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
assert inputs.ndim == 3, "inputs must be a tensor of shape [batch_size, seq_length, hidden_size]"
if self._thread_local.active_session is None:
assert any(v is None for v in kwargs.values()), f"Extra kwargs are not supported in forward: {kwargs}"
assert all(v is None for v in kwargs.values()), f"Extra kwargs are not supported in forward: {kwargs}"
return _RemoteSequentialAutogradFunction.apply(inputs, prompts, self.sequence_manager)
else:
return self._thread_local.active_session.step(inputs, prompts, **kwargs)
@@ -65,7 +65,7 @@ class RemoteSequential(nn.Module):
@contextmanager
def use_session(self, session: InferenceSession) -> InferenceSession:
""" Inside this context, forward() will use the specified InferenceSession. """
"""Inside this context, forward() will use the specified InferenceSession."""
try:
prev_session = self._thread_local.active_session
@@ -76,7 +76,7 @@ class RemoteSequential(nn.Module):
@contextmanager
def inference_session(self, **kwargs) -> InferenceSession:
""" Inside this context, forward() will use a new InferenceSession created with given parameters. """
"""Inside this context, forward() will use a new InferenceSession created with given parameters."""
with self.use_session(InferenceSession(self.sequence_manager, **kwargs)) as session:
yield session

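Usage note for the two context managers above, as a hedged sketch (`sequence` and `hidden_states` are illustrative names): inside the context, RemoteSequential.forward() is routed to the active session's step(); outside it, calls go through _RemoteSequentialAutogradFunction.

# Hedged sketch: route forward() calls of a RemoteSequential through one session.
sess = InferenceSession(sequence.sequence_manager, max_length=128)
with sequence.use_session(sess):
    hidden_states = sequence(hidden_states)  # dispatched to sess.step(hidden_states)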
@@ -85,7 +85,11 @@ class DistributedLlamaModel(FromPretrainedMixin, PTuneMixin, LlamaModel):
hidden_states = inputs_embeds
output_shape = input_shape + (hidden_states.size(-1),)
hidden_states = self.layers(hidden_states, prompts=intermediate_prompts, hypo_ids=past_key_values.hypo_ids if past_key_values is not None else None)
hidden_states = self.layers(
hidden_states,
prompts=intermediate_prompts,
hypo_ids=past_key_values.hypo_ids if past_key_values is not None else None,
)
# Remove prefix
if self.config.tuning_mode and "ptune" in self.config.tuning_mode:

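For context on the `hypo_ids` argument threaded through above: as far as this diff shows, it is an integer index over the batch dimension taken from past_key_values, so cached states can follow reordered hypotheses (e.g. after a beam-search step). A self-contained, hedged illustration of that reordering, with made-up names and shapes:

# Hedged illustration only: select surviving hypotheses from a cached tensor
# along the batch dimension, the way an integer hypo_ids tensor would.
import torch

cache = torch.randn(4, 10, 64)           # [batch, seq_len, hidden], made-up shape
hypo_ids = torch.tensor([0, 0, 2, 3])    # hypothesis 1 dropped, hypothesis 0 duplicated
reordered = cache.index_select(0, hypo_ids)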
@@ -16,17 +16,27 @@ def tokenizer():
return transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
@pytest.mark.forked
@pytest.mark.parametrize("use_peft", (True, False) if ADAPTER_NAME else (False,))
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
@pytest.fixture(scope="module", params=[None, ADAPTER_NAME] if ADAPTER_NAME else [None])
def models(request):
active_adapter = request.param
model = AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME,
initial_peers=INITIAL_PEERS,
torch_dtype=torch.float32,
active_adapter=ADAPTER_NAME if use_peft else None,
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32, active_adapter=active_adapter
)
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
config = model.config
if active_adapter is not None:
ref_model = peft.PeftModel.from_pretrained(ref_model, active_adapter)
ref_model.train(False)
return model, ref_model
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(tokenizer, models, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
model, ref_model = models
assert len(model.transformer.h) == model.config.num_hidden_layers
test_inputs = tokenizer("A quick brown fox was minding its own buisness", return_tensors="pt")["input_ids"]
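The hunk above replaces the per-test `use_peft` parametrization with a module-scoped, parametrized `models` fixture. A standalone, hedged sketch of the same pytest pattern, using toy objects only:

# Hedged sketch: one fixture instance per param value, shared by all tests in the module.
import pytest

@pytest.fixture(scope="module", params=[None, "some-adapter"])
def models(request):
    active_adapter = request.param
    return f"model({active_adapter})", f"ref_model({active_adapter})"

def test_uses_models(models):
    model, ref_model = models
    assert model.startswith("model(") and ref_model.startswith("ref_model(")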
@@ -41,7 +51,7 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs = []
with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
if pass_empty_tensors:
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
for t in range(embs.shape[1]):
if t == 4:
@@ -52,8 +62,8 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
if t == 2 and pass_empty_tensors:
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
@@ -62,36 +72,15 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
recurrent_outputs, parallel_outputs, rtol=0, atol=atol_inference
), "Inference differs from forward pass"
del model, embs, recurrent_outputs
ref_model = transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
if use_peft:
ref_model = peft.PeftModel.from_pretrained(ref_model, ADAPTER_NAME)
ref_model.train(False)
ref_outputs = ref_model.forward(test_inputs).logits.float()
assert torch.allclose(
ref_outputs, parallel_outputs, rtol=0, atol=atol_forward
), "Outputs are not identical to HF"
@pytest.fixture
def model():
return AutoDistributedModelForCausalLM.from_pretrained(
MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
)
def test_greedy_generation(tokenizer, models, max_new_tokens=4):
model, ref_model = models
@pytest.fixture
def ref_model():
return transformers.AutoModelForCausalLM.from_pretrained(
REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
)
@pytest.mark.forked
def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
if tokenizer.pad_token_id is None:
@@ -101,13 +90,11 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
]
for inputs in [inputs_single, inputs_batch]:
logger.warning(f"test_greedy_generation: {inputs=}")
outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
assert torch.allclose(outputs, ref_outputs), f"Greedy generation is not identical to HF with {inputs.shape=}"
@pytest.mark.forked
@pytest.mark.parametrize(
"sampling_options",
[
@@ -117,7 +104,8 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
dict(do_sample=True, top_p=0.9),
],
)
def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=4):
def test_sampling(tokenizer, models, sampling_options, max_new_tokens=4):
model, ref_model = models
torch.manual_seed(0)
inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
@@ -129,17 +117,18 @@ def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=
]
for inputs in [inputs_single, inputs_batch]:
with torch.random.fork_rng():
with torch.random.fork_rng([model.device]):
outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
with torch.random.fork_rng():
with torch.random.fork_rng([ref_model.device]):
ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens)
assert torch.allclose(
outputs, ref_outputs
), f"Sampling is not identical to HF with {inputs.shape=}, {sampling_options=}"
@pytest.mark.forked
def test_beam_search_generation(tokenizer, model, ref_model, max_new_tokens=4, num_beams=2):
def test_beam_search_generation(tokenizer, models, max_new_tokens=4, num_beams=2):
model, ref_model = models
inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
outputs = model.generate(inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)

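On the fork_rng change in the sampling test above: torch.random.fork_rng() snapshots the RNG state on entry and restores it on exit, so both generate() calls start from the same random stream seeded by torch.manual_seed(0); passing a device list restricts which per-device states are forked. A self-contained, hedged sketch of the state-restoring behaviour:

# Hedged sketch: randomness consumed inside fork_rng() does not leak outside it.
import torch

torch.manual_seed(0)
before = torch.get_rng_state()
with torch.random.fork_rng(devices=[]):  # [] = fork only the CPU RNG state
    torch.rand(10)                       # consumes randomness inside the fork
assert torch.equal(before, torch.get_rng_state())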