Harrison/tf embeddings (#817)

Co-authored-by: Ryohei Kuroki <10434946+yakigac@users.noreply.github.com>
2023-01-31 00:00:08 -08:00 · 2023-01-31 00:00:08 -08:00 · 7b4882a2f4
commit 7b4882a2f4
parent 5d4b6e4d4e
6 changed files with 1017 additions and 298 deletions
--- a/docs/modules/utils/combine_docs_examples/embeddings.ipynb
+++ b/docs/modules/utils/combine_docs_examples/embeddings.ipynb
@ -77,7 +77,6 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "42f76e43",
   "metadata": {},
@ -138,7 +137,6 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ed47bb62",
   "metadata": {},
@ -196,11 +194,79 @@
   "source": [
    "doc_result = embeddings.embed_documents([text])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fff4734f",
   "metadata": {},
   "source": [
    "## TensorflowHub\n",
    "Let's load the TensorflowHub Embedding class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f822104b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings import TensorflowHubEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bac84e46",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-01-30 23:53:01.652176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-01-30 23:53:34.362802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "embeddings = TensorflowHubEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4790d770",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"This is a test document.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f556dcdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "query_result = embeddings.embed_query(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90f0db94",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "cohere",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -214,7 +280,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@ -6,6 +6,7 @@ from langchain.embeddings.cohere import CohereEmbeddings
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings
 logger = logging.getLogger(__name__)
@ -14,6 +15,7 @@ __all__ = [
    "HuggingFaceEmbeddings",
    "CohereEmbeddings",
    "HuggingFaceHubEmbeddings",
    "TensorflowHubEmbeddings",
 ]
--- a/langchain/embeddings/tensorflow_hub.py
+++ b/langchain/embeddings/tensorflow_hub.py
@ -0,0 +1,70 @@
 """Wrapper around TensorflowHub embedding models."""
 from typing import Any, List
 from pydantic import BaseModel, Extra
 from langchain.embeddings.base import Embeddings
 # Model URL used by TensorflowHubEmbeddings when no ``model_url`` is supplied.
 DEFAULT_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
 class TensorflowHubEmbeddings(BaseModel, Embeddings):
    """Wrapper around tensorflow_hub embedding models.

    To use, you should have the ``tensorflow_text`` python package installed.

    Example:
        .. code-block:: python

            from langchain.embeddings import TensorflowHubEmbeddings
            url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
            tf = TensorflowHubEmbeddings(model_url=url)
    """

    embed: Any  #: :meta private:
    model_url: str = DEFAULT_MODEL_URL
    """URL of the TensorflowHub model to load."""

    def __init__(self, **kwargs: Any):
        """Validate the fields, then load the model found at ``model_url``.

        Args:
            **kwargs: Field values forwarded to the pydantic constructor.

        Raises:
            ValueError: If ``tensorflow_hub`` or ``tensorflow_text`` cannot
                be imported.
        """
        super().__init__(**kwargs)
        try:
            import tensorflow_hub

            # Not referenced directly; presumably imported so the ops the
            # model relies on are registered before loading -- confirm.
            import tensorflow_text  # noqa

            self.embed = tensorflow_hub.load(self.model_url)
        except ImportError as e:
            # The two sentences used to be joined without a space.
            raise ValueError(
                "Could not import some python packages. Please install them."
            ) from e

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a TensorflowHub embedding model.

        Newlines in each text are replaced with spaces before embedding.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without calling the model.
        """
        if not texts:
            return []
        cleaned = [text.replace("\n", " ") for text in texts]
        return self.embed(cleaned).numpy().tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a TensorflowHub embedding model.

        The newline-stripped text is sent as a one-element batch, the same
        input form ``embed_documents`` uses, and the single row is returned.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        cleaned = text.replace("\n", " ")
        return self.embed([cleaned]).numpy()[0].tolist()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -35,6 +35,7 @@ google-api-python-client = {version = "2.70.0", optional = true}
 wolframalpha = {version = "5.0.0", optional = true}
 qdrant-client = {version = "^0.11.7", optional = true}
 dataclasses-json = "^0.5.7"
 tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"}
 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@ -81,7 +82,7 @@ playwright = "^1.28.0"
 [tool.poetry.extras]
 llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text"]
 [tool.isort]
 profile = "black"
--- a/tests/integration_tests/embeddings/test_tensorflow_hub.py
+++ b/tests/integration_tests/embeddings/test_tensorflow_hub.py
@ -0,0 +1,19 @@
 """Test TensorflowHub embeddings."""
 from langchain.embeddings import TensorflowHubEmbeddings
 def test_tensorflowhub_embedding_documents() -> None:
    """Check that embedding one document gives one vector of length 512."""
    embedder = TensorflowHubEmbeddings()
    vectors = embedder.embed_documents(["foo bar"])
    assert len(vectors) == 1
    assert len(vectors[0]) == 512
 def test_tensorflowhub_embedding_query() -> None:
    """Check that embedding a query string gives a vector of length 512."""
    embedder = TensorflowHubEmbeddings()
    vector = embedder.embed_query("foo bar")
    assert len(vector) == 512