enable the device_map parameter in huggingface pipeline (#12731)

### Enabling `device_map` in HuggingFacePipeline 

For multi-GPU settings with large models, the
[accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate)
library provides the `device_map` parameter to automatically distribute
the model across GPUs and disk.

The [Transformers
pipeline](3520e37e86/src/transformers/pipelines/__init__.py (L543))
lets users specify either `device` or `device_map`, and handles the case
(with a warning) when both are specified.
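
For illustration, a minimal sketch of what this looks like at the transformers level (the model id and prompt are placeholders, not part of this PR):

```python
from transformers import pipeline

# `device_map="auto"` hands placement to accelerate, which shards the model
# across the available GPUs (and offloads to disk if needed). Passing a
# `device` index alongside it is the conflicting case described above.
generator = pipeline("text-generation", model="gpt2", device_map="auto")

print(generator("Say foo:", max_new_tokens=10)[0]["generated_text"])
```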

However, LangChain's `HuggingFacePipeline` only supports specifying
`device` when calling transformers, which limits large-model and
multi-GPU use cases.
Additionally, the [default
value](8bd3ce59cd/libs/langchain/langchain/llms/huggingface_pipeline.py (L72))
of `device` is `-1`, which is incompatible with the transformers
pipeline when `device_map` is specified.

This PR adds `device_map` as a parameter and resolves the
incompatibility of `device = -1` when `device_map` is also specified.
An additional test has been added for this feature.
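
Based on the added integration test, usage with this change looks roughly like the sketch below (the `langchain.llms` import path matches the module touched by this PR):

```python
from langchain.llms import HuggingFacePipeline

# `device` keeps its default of -1; with this change it is treated as unset
# when `device_map` is provided, so the two no longer conflict.
llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    device_map="auto",
    pipeline_kwargs={"max_new_tokens": 10},
)

print(llm("Say foo:"))
```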

Additionally, some existing tests no longer pass because:
1. `max_new_tokens` has to be specified under `pipeline_kwargs`, not
`model_kwargs`.
2. The GPT2 tokenizer raises `ValueError: Pipeline with tokenizer
without pad_token cannot do batching`, since `tokenizer.pad_token`
is `None` ([related
issue](https://github.com/huggingface/transformers/issues/19853) on the
transformers repo); see the sketch below.
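
A minimal sketch of the two fixes in plain transformers terms (GPT2 is simply the model already used in these tests; this is an illustration, not the exact patch):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Fix for (2): GPT2 ships without a pad token, so batching fails unless one
# is set; this PR reuses the EOS token id as the pad token id.
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = model.config.eos_token_id

# Fix for (1): generation settings such as `max_new_tokens` are pipeline
# arguments, not model-constructor arguments, so they must not be passed
# through `model_kwargs`.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe("Say foo:", max_new_tokens=10)[0]["generated_text"])
```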

This PR fixes these tests as well.

Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com>

@@ -70,6 +70,7 @@ class HuggingFacePipeline(BaseLLM):
         model_id: str,
         task: str,
         device: Optional[int] = -1,
+        device_map: Optional[str] = None,
         model_kwargs: Optional[dict] = None,
         pipeline_kwargs: Optional[dict] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
@@ -108,6 +109,9 @@ class HuggingFacePipeline(BaseLLM):
                 f"Could not load the {task} model due to missing dependencies."
             ) from e
 
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token_id = model.config.eos_token_id
+
         if (
             getattr(model, "is_loaded_in_4bit", False)
             or getattr(model, "is_loaded_in_8bit", False)
@@ -129,7 +133,9 @@ class HuggingFacePipeline(BaseLLM):
                     f"Got device=={device}, "
                     f"device is required to be within [-1, {cuda_device_count})"
                 )
-            if device < 0 and cuda_device_count > 0:
+            if device_map is not None and device < 0:
+                device = None
+            if device is not None and device < 0 and cuda_device_count > 0:
                 logger.warning(
                     "Device has %d GPUs available. "
                     "Provide device={deviceId} to `from_model_id` to use available"
@@ -147,6 +153,7 @@ class HuggingFacePipeline(BaseLLM):
             model=model,
             tokenizer=tokenizer,
             device=device,
+            device_map=device_map,
             batch_size=batch_size,
             model_kwargs=_model_kwargs,
             **_pipeline_kwargs,

@@ -10,7 +10,7 @@ from tests.integration_tests.llms.utils import assert_llm_equality
 def test_huggingface_pipeline_text_generation() -> None:
     """Test valid call to HuggingFace text generation model."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     output = llm("Say foo:")
     assert isinstance(output, str)
@@ -25,6 +25,18 @@ def test_huggingface_pipeline_text2text_generation() -> None:
     assert isinstance(output, str)
 
 
+def test_huggingface_pipeline_device_map() -> None:
+    """Test pipelines specifying the device map parameter."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2",
+        task="text-generation",
+        device_map="auto",
+        pipeline_kwargs={"max_new_tokens": 10},
+    )
+    output = llm("Say foo:")
+    assert isinstance(output, str)
+
+
 def text_huggingface_pipeline_summarization() -> None:
     """Test valid call to HuggingFace summarization model."""
     llm = HuggingFacePipeline.from_model_id(
@@ -37,7 +49,7 @@ def text_huggingface_pipeline_summarization() -> None:
 def test_saving_loading_llm(tmp_path: Path) -> None:
     """Test saving/loading an HuggingFaceHub LLM."""
     llm = HuggingFacePipeline.from_model_id(
-        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
     )
     llm.save(file_path=tmp_path / "hf.yaml")
     loaded_llm = load_llm(tmp_path / "hf.yaml")
