langchain/libs/community/langchain_community/embeddings/huggingface_hub.py

import json
from typing import Any, Dict, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator
from langchain_core.utils import get_from_dict_or_env

# Model used by HuggingFaceHubEmbeddings when neither ``model`` nor
# ``repo_id`` is supplied.
DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Tasks accepted by HuggingFaceHubEmbeddings.validate_environment; any other
# ``task`` value is rejected there with a ValueError.
VALID_TASKS = ("feature-extraction",)


class HuggingFaceHubEmbeddings(BaseModel, Embeddings):
    """HuggingFaceHub embedding models.

    To use, you should have the ``huggingface_hub`` python package installed, and the
    environment variable ``HUGGINGFACEHUB_API_TOKEN`` set with your API token, or pass
    it as a named parameter to the constructor.

    ``model`` and ``repo_id`` are kept in sync by the validator: ``model`` wins
    when both are given, and ``DEFAULT_MODEL`` is used when neither is given.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceHubEmbeddings
            model = "sentence-transformers/all-mpnet-base-v2"
            hf = HuggingFaceHubEmbeddings(
                model=model,
                task="feature-extraction",
                huggingfacehub_api_token="my-api-key",
            )
    """

    client: Any  #: :meta private:
    model: Optional[str] = None
    """Model name to use."""
    repo_id: Optional[str] = None
    """Huggingfacehub repository id, for backward compatibility."""
    task: Optional[str] = "feature-extraction"
    """Task to call the model with."""
    model_kwargs: Optional[dict] = None
    """Keyword arguments to pass to the model."""

    huggingfacehub_api_token: Optional[str] = None
    """API token; looked up under ``HUGGINGFACEHUB_API_TOKEN`` when not passed."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Resolves the API token, reconciles ``model`` / ``repo_id``, checks the
        requested task and stores a ready ``InferenceClient`` under
        ``values["client"]``.

        Args:
            values: Field values collected by pydantic for this model.

        Returns:
            The same mapping with ``model``, ``repo_id`` and ``client`` filled in.

        Raises:
            ImportError: If the ``huggingface_hub`` package cannot be imported.
            ValueError: If ``task`` is not one of ``VALID_TASKS``.
        """
        # Any error raised by the lookup helper (e.g. for a missing token)
        # propagates unchanged.
        huggingfacehub_api_token = get_from_dict_or_env(
            values, "huggingfacehub_api_token", "HUGGINGFACEHUB_API_TOKEN"
        )

        # Keep the try body to the import alone so that unrelated failures are
        # never reported as "package not installed".
        try:
            from huggingface_hub import InferenceClient
        except ImportError as e:
            raise ImportError(
                "Could not import huggingface_hub python package. "
                "Please install it with `pip install huggingface_hub`."
            ) from e

        # Use .get(): this validator also runs when a field failed its own
        # validation, in which case that field's key is absent from ``values``.
        task = values.get("task")
        if task not in VALID_TASKS:
            raise ValueError(
                f"Got invalid task {task}, "
                f"currently only {VALID_TASKS} are supported"
            )

        # ``model`` takes precedence over the legacy ``repo_id``; both end up
        # holding the same identifier.
        resolved_model = values.get("model") or values.get("repo_id") or DEFAULT_MODEL
        values["model"] = resolved_model
        values["repo_id"] = resolved_model

        values["client"] = InferenceClient(
            model=resolved_model,
            token=huggingfacehub_api_token,
        )
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call out to HuggingFaceHub's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without contacting the endpoint.
        """
        if not texts:
            # Nothing to embed, so skip the remote call entirely.
            return []
        # replace newlines, which can negatively affect performance.
        texts = [text.replace("\n", " ") for text in texts]
        _model_kwargs = self.model_kwargs or {}
        responses = self.client.post(
            json={"inputs": texts, "parameters": _model_kwargs, "task": self.task}
        )
        # The client returns raw bytes; decode and parse them as JSON.
        return json.loads(responses.decode())

    def embed_query(self, text: str) -> List[float]:
        """Call out to HuggingFaceHub's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # Embed as a single-item batch and unwrap the only result.
        return self.embed_documents([text])[0]
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`import json`
			`from typing import Any, Dict, List, Optional`

			`from langchain_core.embeddings import Embeddings`
			`from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator`
			`from langchain_core.utils import get_from_dict_or_env`

			`DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"`
			`VALID_TASKS = ("feature-extraction",)`


			`class HuggingFaceHubEmbeddings(BaseModel, Embeddings):`
			`"""HuggingFaceHub embedding models.`

			To use, you should have the ``huggingface_hub`` python package installed, and the
			environment variable ``HUGGINGFACEHUB_API_TOKEN`` set with your API token, or pass
			`it as a named parameter to the constructor.`

			`Example:`
			`.. code-block:: python`

			`from langchain_community.embeddings import HuggingFaceHubEmbeddings`
			`model = "sentence-transformers/all-mpnet-base-v2"`
			`hf = HuggingFaceHubEmbeddings(`
			`model=model,`
			`task="feature-extraction",`
			`huggingfacehub_api_token="my-api-key",`
			`)`
			`"""`

			`client: Any #: :meta private:`
			`model: Optional[str] = None`
			`"""Model name to use."""`
			`repo_id: Optional[str] = None`
			`"""Huggingfacehub repository id, for backward compatibility."""`
			`task: Optional[str] = "feature-extraction"`
			`"""Task to call the model with."""`
			`model_kwargs: Optional[dict] = None`
			`"""Keyword arguments to pass to the model."""`

			`huggingfacehub_api_token: Optional[str] = None`

			`class Config:`
			`"""Configuration for this pydantic object."""`

			`extra = Extra.forbid`

			`@root_validator()`
			`def validate_environment(cls, values: Dict) -> Dict:`
			`"""Validate that api key and python package exists in environment."""`
			`huggingfacehub_api_token = get_from_dict_or_env(`
			`values, "huggingfacehub_api_token", "HUGGINGFACEHUB_API_TOKEN"`
			`)`
			`try:`
			`from huggingface_hub import InferenceClient`

			`if values["model"]:`
			`values["repo_id"] = values["model"]`
			`elif values["repo_id"]:`
			`values["model"] = values["repo_id"]`
			`else:`
			`values["model"] = DEFAULT_MODEL`
			`values["repo_id"] = DEFAULT_MODEL`

			`client = InferenceClient(`
			`model=values["model"],`
			`token=huggingfacehub_api_token,`
			`)`
			`if values["task"] not in VALID_TASKS:`
			`raise ValueError(`
			`f"Got invalid task {values['task']}, "`
			`f"currently only {VALID_TASKS} are supported"`
			`)`
			`values["client"] = client`
			`except ImportError:`
			`raise ImportError(`
			`"Could not import huggingface_hub python package. "`
			"Please install it with `pip install huggingface_hub`."
			`)`
			`return values`

			`def embed_documents(self, texts: List[str]) -> List[List[float]]:`
			`"""Call out to HuggingFaceHub's embedding endpoint for embedding search docs.`

			`Args:`
			`texts: The list of texts to embed.`

			`Returns:`
			`List of embeddings, one for each text.`
			`"""`
			`# replace newlines, which can negatively affect performance.`
			`texts = [text.replace("\n", " ") for text in texts]`
			`_model_kwargs = self.model_kwargs or {}`
			`responses = self.client.post(`
			`json={"inputs": texts, "parameters": _model_kwargs, "task": self.task}`
			`)`
			`return json.loads(responses.decode())`

			`def embed_query(self, text: str) -> List[float]:`
			`"""Call out to HuggingFaceHub's embedding endpoint for embedding query text.`

			`Args:`
			`text: The text to embed.`

			`Returns:`
			`Embeddings for the text.`
			`"""`
			`response = self.embed_documents([text])[0]`
			`return response`