From 21eeba075c05714f185e5541f25228f7b555f606 Mon Sep 17 00:00:00 2001
From: Praveen Venkateswaran
Date: Thu, 2 Nov 2023 17:29:06 -0400
Subject: [PATCH] enable the device_map parameter in huggingface pipeline
 (#12731)

### Enabling `device_map` in HuggingFacePipeline

For multi-GPU settings with large models, the
[accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate)
library provides a `device_map` parameter that automatically distributes the
model across GPUs / disk. The [Transformers
pipeline](https://github.com/huggingface/transformers/blob/3520e37e86913715959ff14fef76340010c8de57/src/transformers/pipelines/__init__.py#L543)
lets users specify either `device` or `device_map`, and handles the case (with
a warning) where both are specified.

However, LangChain's `HuggingFacePipeline` only supports specifying `device`
when calling transformers, which limits large-model and multi-GPU use cases.
Additionally, the [default
value](https://github.com/praveenv/langchain/blob/8bd3ce59cd4ef80db5eb52922bb31588596939e1/libs/langchain/langchain/llms/huggingface_pipeline.py#L72)
of `device` is `-1`, which is incompatible with the transformers pipeline when
`device_map` is specified.

This PR adds `device_map` as a parameter and resolves the incompatibility of
`device = -1` when `device_map` is also specified. A test has been added for
this feature.

Additionally, some existing tests no longer work, since:

1. `max_new_tokens` has to be specified under `pipeline_kwargs` rather than
   `model_kwargs`.
2. The GPT2 tokenizer raises `ValueError: Pipeline with tokenizer without
   pad_token cannot do batching`, since `tokenizer.pad_token` is `None`
   ([related issue](https://github.com/huggingface/transformers/issues/19853)
   on the transformers repo).

This PR fixes these tests as well.
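For reference, usage looks roughly like the sketch below. The model id and
generation settings are illustrative only; on a multi-GPU machine with a
genuinely large checkpoint, `device_map="auto"` lets accelerate place the
weights across the available GPUs / CPU / disk:

```python
from langchain.llms import HuggingFacePipeline

# Illustrative sketch: pass device_map instead of device and let accelerate
# decide weight placement; pipeline_kwargs are forwarded to the transformers
# pipeline call.
llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",  # stand-in; the feature targets much larger models
    task="text-generation",
    device_map="auto",
    pipeline_kwargs={"max_new_tokens": 10},
)

print(llm("Say foo:"))
```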
" "Provide device={deviceId} to `from_model_id` to use available" @@ -147,6 +153,7 @@ class HuggingFacePipeline(BaseLLM): model=model, tokenizer=tokenizer, device=device, + device_map=device_map, batch_size=batch_size, model_kwargs=_model_kwargs, **_pipeline_kwargs, diff --git a/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py b/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py old mode 100644 new mode 100755 index 46fbeb91b3..c1a1ea07f2 --- a/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py +++ b/libs/langchain/tests/integration_tests/llms/test_huggingface_pipeline.py @@ -10,7 +10,7 @@ from tests.integration_tests.llms.utils import assert_llm_equality def test_huggingface_pipeline_text_generation() -> None: """Test valid call to HuggingFace text generation model.""" llm = HuggingFacePipeline.from_model_id( - model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10} + model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10} ) output = llm("Say foo:") assert isinstance(output, str) @@ -25,6 +25,18 @@ def test_huggingface_pipeline_text2text_generation() -> None: assert isinstance(output, str) +def test_huggingface_pipeline_device_map() -> None: + """Test pipelines specifying the device map parameter.""" + llm = HuggingFacePipeline.from_model_id( + model_id="gpt2", + task="text-generation", + device_map="auto", + pipeline_kwargs={"max_new_tokens": 10}, + ) + output = llm("Say foo:") + assert isinstance(output, str) + + def text_huggingface_pipeline_summarization() -> None: """Test valid call to HuggingFace summarization model.""" llm = HuggingFacePipeline.from_model_id( @@ -37,7 +49,7 @@ def text_huggingface_pipeline_summarization() -> None: def test_saving_loading_llm(tmp_path: Path) -> None: """Test saving/loading an HuggingFaceHub LLM.""" llm = HuggingFacePipeline.from_model_id( - model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10} + model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10} ) llm.save(file_path=tmp_path / "hf.yaml") loaded_llm = load_llm(tmp_path / "hf.yaml")