From 21eeba075c05714f185e5541f25228f7b555f606 Mon Sep 17 00:00:00 2001
From: Praveen Venkateswaran
Date: Thu, 2 Nov 2023 17:29:06 -0400
Subject: [PATCH] enable the device_map parameter in huggingface pipeline
 (#12731)

### Enabling `device_map` in HuggingFacePipeline

For multi-GPU settings with large models, the
[accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate)
library provides a `device_map` parameter that automatically distributes the
model across GPUs / disk. The [Transformers
pipeline](https://github.com/huggingface/transformers/blob/3520e37e86913715959ff14fef76340010c8de57/src/transformers/pipelines/__init__.py#L543)
lets users specify either `device` or `device_map`, and handles the case (with
a warning) where both are specified.

However, LangChain's `HuggingFacePipeline` only supports specifying `device`
when calling transformers, which limits large-model and multi-GPU use cases.
Additionally, the [default
value](https://github.com/praveenv/langchain/blob/8bd3ce59cd4ef80db5eb52922bb31588596939e1/libs/langchain/langchain/llms/huggingface_pipeline.py#L72)
of `device` is `-1`, which is incompatible with the transformers pipeline when
`device_map` is specified.

This PR adds `device_map` as a parameter and resolves the incompatibility of
`device = -1` when `device_map` is also specified. A test has been added for
this feature.

Additionally, some existing tests no longer work, since:

1. `max_new_tokens` has to be specified under `pipeline_kwargs` rather than
   `model_kwargs`.
2. The GPT2 tokenizer raises `ValueError: Pipeline with tokenizer without
   pad_token cannot do batching`, since `tokenizer.pad_token` is `None`
   ([related issue](https://github.com/huggingface/transformers/issues/19853)
   on the transformers repo).

This PR fixes these tests as well.
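For reference, usage looks roughly like the sketch below. The model id and
generation settings are illustrative only; on a multi-GPU machine with a
genuinely large checkpoint, `device_map="auto"` lets accelerate place the
weights across the available GPUs / CPU / disk:

```python
from langchain.llms import HuggingFacePipeline

# Illustrative sketch: pass device_map instead of device and let accelerate
# decide weight placement; pipeline_kwargs are forwarded to the transformers
# pipeline call.
llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",  # stand-in; the feature targets much larger models
    task="text-generation",
    device_map="auto",
    pipeline_kwargs={"max_new_tokens": 10},
)

print(llm("Say foo:"))
```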
" "Provide device={deviceId} to `from_model_id` to use available" @@ -147,6 +153,7 @@ class HuggingFacePipeline(BaseLLM): model=model, tokenizer=tokenizer, device=device, + device_map=device_map, batch_size=batch_size, model_kwargs=_model_kwargs, **_pipeline_kwargs, diff --git a/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py b/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py old mode 100644 new mode 100755 index 46fbeb91b3..c1a1ea07f2 --- a/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py +++ b/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py @@ -10,7 +10,7 @@ from tests.integration_tests.llms.utils import assert_llm_equality def test_huggingface_pipeline_text_generation() -> None: """Test valid call to HuggingFace text generation model.""" llm = HuggingFacePipeline.from_model_id( - model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10} + model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10} ) output = llm("Say foo:") assert isinstance(output, str) @@ -25,6 +25,18 @@ def test_huggingface_pipeline_text2text_generation() -> None: assert isinstance(output, str) +def test_huggingface_pipeline_device_map() -> None: + """Test pipelines specifying the device map parameter.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + device_map="auto", + pipeline_kwargs={"max_new_tokens": 10}, + ) + output = llm("Say foo:") + assert isinstance(output, str) + + def text_huggingface_pipeline_summarization() -> None: """Test valid call to HuggingFace summarization model.""" llm = HuggingFacePipeline.from_model_id( @@ -37,7 +49,7 @@ def text_huggingface_pipeline_summarization() -> None: def test_saving_loading_llm(tmp_path: Path) -> None: """Test saving/loading an HuggingFaceHub LLM.""" llm = HuggingFacePipeline.from_model_id( - model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10} + model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10} ) llm.save(file_path=tmp_path / "hf.yaml") loaded_llm = load_llm(tmp_path / "hf.yaml")