langchain/libs/community/tests/integration_tests/llms/test_huggingface_pipeline.py

"""Test HuggingFace Pipeline wrapper."""

from pathlib import Path

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.llms.loading import load_llm
from tests.integration_tests.llms.utils import assert_llm_equality


def test_huggingface_pipeline_text_generation() -> None:
    """Test valid call to HuggingFace text generation model."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def test_huggingface_pipeline_text2text_generation() -> None:
    """Test valid call to HuggingFace text2text generation model."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="google/flan-t5-small", task="text2text-generation"
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def test_huggingface_pipeline_device_map() -> None:
    """Test pipelines specifying the device map parameter."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2",
        task="text-generation",
        device_map="auto",
        pipeline_kwargs={"max_new_tokens": 10},
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def text_huggingface_pipeline_summarization() -> None:
    """Test valid call to HuggingFace summarization model."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="facebook/bart-large-cnn", task="summarization"
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def test_saving_loading_llm(tmp_path: Path) -> None:
    """Test saving/loading an HuggingFaceHub LLM."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}
    )
    llm.save(file_path=tmp_path / "hf.yaml")
    loaded_llm = load_llm(tmp_path / "hf.yaml")
    assert_llm_equality(llm, loaded_llm)


def test_init_with_pipeline() -> None:
    """Test initialization with a HF pipeline."""
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    model_id = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    pipe = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    output = llm("Say foo:")
    assert isinstance(output, str)


def test_huggingface_pipeline_runtime_kwargs() -> None:
    """Test pipelines specifying the device map parameter."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2",
        task="text-generation",
    )
    prompt = "Say foo:"
    output = llm(prompt, pipeline_kwargs={"max_new_tokens": 2})
    assert len(output) < 10


ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}


def test_huggingface_pipeline_text_generation_ov() -> None:
    """Test valid call to HuggingFace text generation model with openvino."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="gpt2",
        task="text-generation",
        backend="openvino",
        model_kwargs={"device": "CPU", "ov_config": ov_config},
        pipeline_kwargs={"max_new_tokens": 64},
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def test_huggingface_pipeline_text2text_generation_ov() -> None:
    """Test valid call to HuggingFace text2text generation model with openvino."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="google/flan-t5-small",
        task="text2text-generation",
        backend="openvino",
        model_kwargs={"device": "CPU", "ov_config": ov_config},
        pipeline_kwargs={"max_new_tokens": 64},
    )
    output = llm("Say foo:")
    assert isinstance(output, str)


def text_huggingface_pipeline_summarization_ov() -> None:
    """Test valid call to HuggingFace summarization model with openvino."""
    llm = HuggingFacePipeline.from_model_id(
        model_id="facebook/bart-large-cnn",
        task="summarization",
        backend="openvino",
        model_kwargs={"device": "CPU", "ov_config": ov_config},
        pipeline_kwargs={"max_new_tokens": 64},
    )
    output = llm("Say foo:")
    assert isinstance(output, str)
Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`"""Test HuggingFace Pipeline wrapper."""`

			`from pathlib import Path`

community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline`
			`from langchain_community.llms.loading import load_llm`
Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`from tests.integration_tests.llms.utils import assert_llm_equality`


			`def test_huggingface_pipeline_text_generation() -> None:`
			`"""Test valid call to HuggingFace text generation model."""`
			`llm = HuggingFacePipeline.from_model_id(`
enable the device_map parameter in huggingface pipeline (#12731) ### Enabling `device_map` in HuggingFacePipeline For multi-gpu settings with large models, the [accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate) library provides the `device_map` parameter to automatically distribute the model across GPUs / disk. The [Transformers pipeline](https://github.com/huggingface/transformers/blob/3520e37e86913715959ff14fef76340010c8de57/src/transformers/pipelines/__init__.py#L543) enables users to specify `device` (or) `device_map`, and handles cases (with warnings) when both are specified. However, Langchain's HuggingFacePipeline only supports specifying `device` when calling transformers which limits large models and multi-gpu use-cases. Additionally, the [default value](https://github.com/praveenv/langchain/blob/8bd3ce59cd4ef80db5eb52922bb31588596939e1/libs/langchain/langchain/llms/huggingface_pipeline.py#L72) of `device` is initialized to `-1` , which is incompatible with the transformers pipeline when `device_map` is specified. This PR addresses the addition of `device_map` as a parameter , and solves the incompatibility of `device = -1` when `device_map` is also specified. An additional test has been added for this feature. Additionally, some existing tests no longer work since 1. `max_new_tokens` has to be specified under `pipeline_kwargs` and not `model_kwargs` 2. The GPT2 tokenizer raises a `ValueError: Pipeline with tokenizer without pad_token cannot do batching`, since the `tokenizer.pad_token` is `None` ([related issue](https://github.com/huggingface/transformers/issues/19853) on the transformers repo). This PR handles fixing these tests as well. Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com> 2023-11-02 21:29:06 +00:00			`model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}`
Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


Harrison/version 0040 (#366) 2022-12-17 15:53:22 +00:00			`def test_huggingface_pipeline_text2text_generation() -> None:`
			`"""Test valid call to HuggingFace text2text generation model."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="google/flan-t5-small", task="text2text-generation"`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


enable the device_map parameter in huggingface pipeline (#12731) ### Enabling `device_map` in HuggingFacePipeline For multi-gpu settings with large models, the [accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate) library provides the `device_map` parameter to automatically distribute the model across GPUs / disk. The [Transformers pipeline](https://github.com/huggingface/transformers/blob/3520e37e86913715959ff14fef76340010c8de57/src/transformers/pipelines/__init__.py#L543) enables users to specify `device` (or) `device_map`, and handles cases (with warnings) when both are specified. However, Langchain's HuggingFacePipeline only supports specifying `device` when calling transformers which limits large models and multi-gpu use-cases. Additionally, the [default value](https://github.com/praveenv/langchain/blob/8bd3ce59cd4ef80db5eb52922bb31588596939e1/libs/langchain/langchain/llms/huggingface_pipeline.py#L72) of `device` is initialized to `-1` , which is incompatible with the transformers pipeline when `device_map` is specified. This PR addresses the addition of `device_map` as a parameter , and solves the incompatibility of `device = -1` when `device_map` is also specified. An additional test has been added for this feature. Additionally, some existing tests no longer work since 1. `max_new_tokens` has to be specified under `pipeline_kwargs` and not `model_kwargs` 2. The GPT2 tokenizer raises a `ValueError: Pipeline with tokenizer without pad_token cannot do batching`, since the `tokenizer.pad_token` is `None` ([related issue](https://github.com/huggingface/transformers/issues/19853) on the transformers repo). This PR handles fixing these tests as well. Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com> 2023-11-02 21:29:06 +00:00			`def test_huggingface_pipeline_device_map() -> None:`
			`"""Test pipelines specifying the device map parameter."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="gpt2",`
			`task="text-generation",`
			`device_map="auto",`
			`pipeline_kwargs={"max_new_tokens": 10},`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


Add summarization task type for HuggingFace APIs (#4721) # Add summarization task type for HuggingFace APIs Add summarization task type for HuggingFace APIs. This task type is described by [HuggingFace inference API](https://huggingface.co/docs/api-inference/detailed_parameters#summarization-task) My project utilizes LangChain to connect multiple LLMs, including various HuggingFace models that support the summarization task. Integrating this task type is highly convenient and beneficial. Fixes #4720 2023-05-15 23:26:17 +00:00			`def text_huggingface_pipeline_summarization() -> None:`
			`"""Test valid call to HuggingFace summarization model."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="facebook/bart-large-cnn", task="summarization"`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`def test_saving_loading_llm(tmp_path: Path) -> None:`
			`"""Test saving/loading an HuggingFaceHub LLM."""`
			`llm = HuggingFacePipeline.from_model_id(`
enable the device_map parameter in huggingface pipeline (#12731) ### Enabling `device_map` in HuggingFacePipeline For multi-gpu settings with large models, the [accelerate](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#using--accelerate) library provides the `device_map` parameter to automatically distribute the model across GPUs / disk. The [Transformers pipeline](https://github.com/huggingface/transformers/blob/3520e37e86913715959ff14fef76340010c8de57/src/transformers/pipelines/__init__.py#L543) enables users to specify `device` (or) `device_map`, and handles cases (with warnings) when both are specified. However, Langchain's HuggingFacePipeline only supports specifying `device` when calling transformers which limits large models and multi-gpu use-cases. Additionally, the [default value](https://github.com/praveenv/langchain/blob/8bd3ce59cd4ef80db5eb52922bb31588596939e1/libs/langchain/langchain/llms/huggingface_pipeline.py#L72) of `device` is initialized to `-1` , which is incompatible with the transformers pipeline when `device_map` is specified. This PR addresses the addition of `device_map` as a parameter , and solves the incompatibility of `device = -1` when `device_map` is also specified. An additional test has been added for this feature. Additionally, some existing tests no longer work since 1. `max_new_tokens` has to be specified under `pipeline_kwargs` and not `model_kwargs` 2. The GPT2 tokenizer raises a `ValueError: Pipeline with tokenizer without pad_token cannot do batching`, since the `tokenizer.pad_token` is `None` ([related issue](https://github.com/huggingface/transformers/issues/19853) on the transformers repo). This PR handles fixing these tests as well. Co-authored-by: Praveen Venkateswaran <praveen.venkateswaran@ibm.com> 2023-11-02 21:29:06 +00:00			`model_id="gpt2", task="text-generation", pipeline_kwargs={"max_new_tokens": 10}`
Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`)`
			`llm.save(file_path=tmp_path / "hf.yaml")`
			`loaded_llm = load_llm(tmp_path / "hf.yaml")`
			`assert_llm_equality(llm, loaded_llm)`


			`def test_init_with_pipeline() -> None:`
			`"""Test initialization with a HF pipeline."""`
fix sched ci (more) (#9056) 2023-08-10 17:39:29 +00:00			`from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline`

Add HuggingFacePipeline LLM (#353) https://github.com/hwchase17/langchain/issues/354 Add support for running your own HF pipeline locally. This would allow you to get a lot more dynamic with what HF features and models you support since you wouldn't be beholden to what is hosted in HF hub. You could also do stuff with HF Optimum to quantize your models and stuff to get pretty fast inference even running on a laptop. 2022-12-17 15:00:04 +00:00			`model_id = "gpt2"`
			`tokenizer = AutoTokenizer.from_pretrained(model_id)`
			`model = AutoModelForCausalLM.from_pretrained(model_id)`
			`pipe = pipeline(`
			`"text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10`
			`)`
			`llm = HuggingFacePipeline(pipeline=pipe)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`
community: add runtime kwargs to HuggingFacePipeline (#17005) This PR enables changing the behaviour of huggingface pipeline between different calls. For example, before this PR there's no way of changing maximum generation length between different invocations of the chain. This is desirable in cases, such as when we want to scale the maximum output size depending on a dynamic prompt size. Usage example: ```python from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline model_id = "gpt2" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) hf = HuggingFacePipeline(pipeline=pipe) hf("Say foo:", pipeline_kwargs={"max_new_tokens": 42}) ``` --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-02-08 21:58:31 +00:00

			`def test_huggingface_pipeline_runtime_kwargs() -> None:`
			`"""Test pipelines specifying the device map parameter."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="gpt2",`
			`task="text-generation",`
			`)`
			`prompt = "Say foo:"`
			`output = llm(prompt, pipeline_kwargs={"max_new_tokens": 2})`
			`assert len(output) < 10`
community[minor]: Add openvino backend support (#11591) - Description: add openvino backend support by HuggingFace Optimum Intel, - Dependencies: “optimum[openvino]”, --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-03-01 18:04:24 +00:00

			`ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}`


			`def test_huggingface_pipeline_text_generation_ov() -> None:`
			`"""Test valid call to HuggingFace text generation model with openvino."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="gpt2",`
			`task="text-generation",`
			`backend="openvino",`
			`model_kwargs={"device": "CPU", "ov_config": ov_config},`
			`pipeline_kwargs={"max_new_tokens": 64},`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


			`def test_huggingface_pipeline_text2text_generation_ov() -> None:`
			`"""Test valid call to HuggingFace text2text generation model with openvino."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="google/flan-t5-small",`
			`task="text2text-generation",`
			`backend="openvino",`
			`model_kwargs={"device": "CPU", "ov_config": ov_config},`
			`pipeline_kwargs={"max_new_tokens": 64},`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`


			`def text_huggingface_pipeline_summarization_ov() -> None:`
			`"""Test valid call to HuggingFace summarization model with openvino."""`
			`llm = HuggingFacePipeline.from_model_id(`
			`model_id="facebook/bart-large-cnn",`
			`task="summarization",`
			`backend="openvino",`
			`model_kwargs={"device": "CPU", "ov_config": ov_config},`
			`pipeline_kwargs={"max_new_tokens": 64},`
			`)`
			`output = llm("Say foo:")`
			`assert isinstance(output, str)`