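# Integration tests for the Petals client. Each test compares the distributed model
# against a local Hugging Face reference model: exact forward/inference logits,
# greedy decoding, sampling, and beam search. MODEL_NAME, REF_NAME, INITIAL_PEERS,
# and ADAPTER_NAME are expected to be provided by test_utils.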
import peft
import pytest
import torch
import transformers
from hivemind import get_logger

from petals import AutoDistributedModelForCausalLM
from test_utils import *

logger = get_logger(__name__)


@pytest.fixture
def tokenizer():
    # We set use_fast=False since LlamaTokenizerFast is slow on load
    return transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)


@pytest.fixture
def model():
    return AutoDistributedModelForCausalLM.from_pretrained(
        MODEL_NAME, initial_peers=INITIAL_PEERS, torch_dtype=torch.float32
    )


@pytest.fixture
def ref_model():
    return transformers.AutoModelForCausalLM.from_pretrained(
        REF_NAME, low_cpu_mem_usage=True, torch_dtype=torch.float32
    )


@pytest.mark.forked
@pytest.mark.parametrize("use_peft", (True, False) if ADAPTER_NAME else (False,))
@pytest.mark.parametrize("pass_empty_tensors", (True, False))
def test_full_model_exact_match(tokenizer, model, ref_model, use_peft, pass_empty_tensors, atol=1e-3):
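    # Compare three ways of computing logits, which must agree within `atol`:
    # (1) a parallel forward pass through the distributed model,
    # (2) step-by-step inference through an inference session,
    # (3) a forward pass through the local reference model.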
    if use_peft:
        model.config.active_adapter = ADAPTER_NAME

        ref_model = peft.PeftModel.from_pretrained(ref_model, ADAPTER_NAME)
        ref_model.train(False)
    test_inputs = tokenizer("A quick brown fox was minding its own business", return_tensors="pt")["input_ids"]

    with torch.inference_mode():
        parallel_outputs = model.forward(test_inputs).logits
        assert torch.all(torch.isfinite(parallel_outputs))
        logger.info("Forward outputs are finite")
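        # Re-run the same inputs through an inference session over the remote blocks,
        # then check that step-by-step logits match the parallel forward pass.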
        embs = model.transformer.word_embeddings(test_inputs)
        embs = model.transformer.word_embeddings_layernorm(embs)
        recurrent_outputs = []
        with model.transformer.h.inference_session(max_length=embs.shape[1]) as sess:
            if pass_empty_tensors:
                recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
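            # Feed most positions one token at a time, but send positions 4..8 as a single
            # multi-token chunk, so that steps of different lengths are exercised.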
            for t in range(embs.shape[1]):
                if t == 4:
                    recurrent_outputs.append(sess.step(embs[:, 4:9, :]))
                elif 4 < t < 9:
                    continue
                else:
                    recurrent_outputs.append(sess.step(embs[:, t : t + 1, :]))
                if t == 2 and pass_empty_tensors:
                    recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
                    recurrent_outputs.append(sess.step(torch.empty(1, 0, model.config.hidden_size)))
        recurrent_outputs = torch.cat(recurrent_outputs, dim=1)
        recurrent_outputs = model.transformer.ln_f(recurrent_outputs)
        recurrent_outputs = model.lm_head(recurrent_outputs)
        assert torch.allclose(
            recurrent_outputs, parallel_outputs, rtol=0, atol=atol
        ), "Inference differs from forward pass"

        ref_outputs = ref_model.forward(test_inputs).logits.float()
        assert torch.allclose(ref_outputs, parallel_outputs, rtol=0, atol=atol), "Outputs are not identical to HF"


def make_generate_calls(model, inputs, *, max_new_tokens, multiple_calls=False, **kwargs):
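    # Helper: run either a single .generate() call or several shorter calls that resume
    # the same inference session, mixing explicitly and implicitly passed sessions.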
    if not multiple_calls:
        return model.generate(inputs, max_new_tokens=max_new_tokens, **kwargs)
    with model.inference_session(max_length=inputs.shape[1] + max_new_tokens) as sess:
        return torch.cat(
            [
                # Sessions provided both explicitly and implicitly should work
                model.generate(inputs, max_new_tokens=1, **kwargs, session=sess),
                model.generate(None, max_new_tokens=max_new_tokens - 2, **kwargs),
                model.generate(None, max_new_tokens=1, **kwargs),
            ],
            dim=1,
        )


@pytest.mark.forked
def test_greedy_generation(tokenizer, model, ref_model, max_new_tokens=4):
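    # Greedy decoding must match the reference model exactly, for a single prompt
    # and a padded batch, with both single-call and resumed multi-call generation.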
    inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    inputs_batch = tokenizer(["A cat sat on a mat", "A dog sat on a mat"], return_tensors="pt", padding=True)[
        "input_ids"
    ]
    options = dict(max_new_tokens=max_new_tokens, do_sample=False)
    for multiple_calls in [False, True]:
        for inputs in [inputs_single, inputs_batch]:
            outputs = make_generate_calls(model, inputs, multiple_calls=multiple_calls, **options)
            ref_outputs = ref_model.generate(inputs, **options)
            assert torch.allclose(
                outputs, ref_outputs
            ), f"Greedy generation is not identical to HF with {multiple_calls=}, {inputs.shape=}"


@pytest.mark.forked
def test_sampling(tokenizer, model, ref_model, max_new_tokens=10):
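    # With identical torch seeds, sampling with top_k/top_p and repetition_penalty
    # should reproduce the reference model's outputs token for token.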
    inputs_single = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    inputs_batch = tokenizer(["A cat sat on a mat", "A dog sat on a mat"], return_tensors="pt", padding=True)[
        "input_ids"
    ]
    for options in [
        dict(do_sample=True, temperature=0.5, top_k=5, top_p=0.9),
        dict(do_sample=True, temperature=0.5, repetition_penalty=1.2),
    ]:
        options.update(max_new_tokens=max_new_tokens)
        for multiple_calls in [False, True]:
            for inputs in [inputs_single, inputs_batch]:
                torch.manual_seed(0)
                outputs = make_generate_calls(model, inputs, multiple_calls=multiple_calls, **options)

                torch.manual_seed(0)
                ref_outputs = ref_model.generate(inputs, **options)

                assert torch.allclose(
                    outputs, ref_outputs
                ), f"Sampling is not identical to HF with {options=}, {multiple_calls=}, {inputs.shape=}"


@pytest.mark.forked
def test_beam_search_generation(tokenizer, model, ref_model, max_new_tokens=4, num_beams=5):
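    # Beam search is delegated to transformers' GenerationMixin, so the distributed
    # model should return the same beams as the local reference model.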
    inputs = tokenizer("A cat sat on a mat", return_tensors="pt")["input_ids"]
    options = dict(max_new_tokens=max_new_tokens, num_beams=num_beams, do_sample=False)
    outputs = make_generate_calls(model, inputs, **options)
    ref_outputs = ref_model.generate(inputs, **options)
    assert torch.allclose(outputs, ref_outputs), "Beam search results are not identical to HF"


@pytest.mark.forked
def test_input_ids(tokenizer, model, ref_model, max_new_tokens=4):
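    # .generate() should accept keyword inputs (input_ids + attention_mask) and should
    # also work when continued inside an open inference session.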
    inputs = tokenizer("A cat sat on a mat", return_tensors="pt")
    assert inputs.keys() == {"input_ids", "attention_mask"}
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    ref_outputs = ref_model.generate(**inputs, max_new_tokens=max_new_tokens)
    assert torch.allclose(outputs, ref_outputs), "Outputs are not identical to HF"
    with model.inference_session(max_length=inputs["input_ids"].shape[1] + max_new_tokens):
        outputs = torch.cat(
            [
                model.generate(**inputs, max_new_tokens=2),
                model.generate(None, max_new_tokens=max_new_tokens - 2),
            ],
            dim=1,
        )
    assert torch.allclose(outputs, ref_outputs), "Multi-call outputs are not identical to HF"