@@ -16,17 +16,27 @@ def tokenizer():
     return transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 
 
-@pytest.mark.forked
-@pytest.mark.parametrize("use_peft", (True, False) if ADAPTER_NAME else (False,))
-@pytest.mark.parametrize("pass_empty_tensors", (True, False))
-def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
+@pytest.fixture(scope="module", params=[None, ADAPTER_NAME] if ADAPTER_NAME else [None])
+def models(request):
+    active_adapter = request.param
+
     model = AutoDistributedModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        initial_peers=INITIAL_PEERS,
-        torch_dtype=torch.float32,
-        active_adapter=ADAPTER_NAME if use_peft else None,
+        MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32, active_adapter=active_adapter
     )
-    config = model.config
+    ref_model = transformers.AutoModelForCausalLM.from_pretrained(
+        REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
+    )
+    if active_adapter is not None:
+        ref_model = peft.PeftModel.from_pretrained(ref_model, active_adapter)
+        ref_model.train(False)
+
+    return model, ref_model
+
+
+@pytest.mark.forked
+@pytest.mark.parametrize("pass_empty_tensors", (True, False))
+def test_full_model_exact_match(tokenizer, models, pass_empty_tensors, atol_forward=1e-3, atol_inference=1e-3):
+    model, ref_model = models
     assert len(model.transformer.h) == model.config.num_hidden_layers
 
     test_inputs = tokenizer("A quick brown fox was minding its own buisness", return_tensors="pt")["input_ids"]
@@ -41,7 +51,7 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
     recurrent_outputs = []
     with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
         if pass_empty_tensors:
-            recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
+            recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
 
         for t in range(embs.shape[1]):
             if t == 4:
@@ -52,8 +62,8 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
                 recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
 
             if t == 2 and pass_empty_tensors:
-                recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
-                recurrent_outputs.append(sess.step(torch.empty(1, 0, config.hidden_size)))
+                recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
+                recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
 
     recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
     recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
@@ -62,36 +72,15 @@ def test_full_model_exact_match(tokenizer, use_peft, pass_empty_tensors, atol_fo
         recurrent_outputs, parallel_outputs, rtol=0, atol=atol_inference
     ), "Inference differs from forward pass"
 
-    del model, embs, recurrent_outputs
-
-    ref_model = transformers.AutoModelForCausalLM.from_pretrained(
-        REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
-    )
-    if use_peft:
-        ref_model = peft.PeftModel.from_pretrained(ref_model, ADAPTER_NAME)
-        ref_model.train(False)
     ref_outputs = ref_model.forward(test_inputs).logits.float()
     assert torch.allclose(
         ref_outputs, parallel_outputs, rtol=0, atol=atol_forward
     ), "Outputs are not identical to HF"
 
 
-@pytest.fixture
-def model():
-    return AutoDistributedModelForCausalLM.from_pretrained(
-        MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
-    )
-
-
-@pytest.fixture
-def ref_model():
-    return transformers.AutoModelForCausalLM.from_pretrained(
-        REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
-    )
-
-
 @pytest.mark.forked
-def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
+def test_greedy_generation(tokenizer, models, max_new_tokens=4):
+    model, ref_model = models
     inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
 
     if tokenizer.pad_token_id is None:
@@ -101,13 +90,11 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
     ]
 
     for inputs in [inputs_single, inputs_batch]:
-        logger.warning(f"test_greedy_generation: {inputs=}")
         outputs = model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
         ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens, do_sample=False)
         assert torch.allclose(outputs, ref_outputs), f"Greedy generation is not identical to HF with {inputs.shape=}"
 
 
 @pytest.mark.forked
 @pytest.mark.parametrize(
     "sampling_options",
     [
@@ -117,7 +104,8 @@ def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
         dict(do_sample=True, top_p=0.9),
     ],
 )
-def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=4):
+def test_sampling(tokenizer, models, sampling_options, max_new_tokens=4):
+    model, ref_model = models
     torch.manual_seed(0)
 
     inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
@@ -129,17 +117,18 @@ def test_sampling(tokenizer, model, ref_model, sampling_options, max_new_tokens=
     ]
 
     for inputs in [inputs_single, inputs_batch]:
-        with torch.random.fork_rng():
+        with torch.random.fork_rng([model.device]):
             outputs = model.generate(inputs, max_new_tokens=max_new_tokens, **sampling_options)
-        with torch.random.fork_rng():
+        with torch.random.fork_rng([ref_model.device]):
             ref_outputs = ref_model.generate(inputs, max_new_tokens=max_new_tokens, **sampling_options)
         assert torch.allclose(
             outputs, ref_outputs
         ), f"Sampling is not identical to HF with {inputs.shape=}, {sampling_options=}"
 
 
 @pytest.mark.forked
-def test_beam_search_generation(tokenizer, model, ref_model, max_new_tokens=4, num_beams=2):
+def test_beam_search_generation(tokenizer, models, max_new_tokens=4, num_beams=2):
+    model, ref_model = models
     inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
 
     outputs = model.generate(inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)
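Note: the diff above replaces per-test model construction with a single module-scoped, parametrized `models` fixture that is built once per adapter value and shared by every test in the file. The following is a minimal, self-contained sketch of that pytest pattern, not code from the patch; `FakeModel` and `ADAPTERS` are hypothetical stand-ins for the expensive `model`/`ref_model` objects and the `ADAPTER_NAME` parametrization used in the real tests.

import pytest


class FakeModel:
    """Hypothetical stand-in for an expensive-to-load model."""

    def __init__(self, active_adapter=None):
        self.active_adapter = active_adapter

    def generate(self, prompt):
        suffix = f" [{self.active_adapter}]" if self.active_adapter else ""
        return prompt.upper() + suffix


ADAPTERS = ["demo-adapter"]  # stand-in for ADAPTER_NAME being set in the environment


@pytest.fixture(scope="module", params=[None] + ADAPTERS)
def models(request):
    # Built once per parameter value and reused by every test in this module,
    # mirroring how the patch shares `model` and `ref_model` across tests.
    active_adapter = request.param
    model = FakeModel(active_adapter)
    ref_model = FakeModel(active_adapter)
    return model, ref_model


def test_outputs_match(models):
    model, ref_model = models
    assert model.generate("a cat sat on a mat") == ref_model.generate("a cat sat on a mat")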