Harrison/tf embeddings (#817)

Co-authored-by: Ryohei Kuroki <10434946+yakigac@users.noreply.github.com>
2 years ago · 7b4882a2f4
parent 5d4b6e4d4e
commit 7b4882a2f4
6 changed files with 1020 additions and 301 deletions
--- a/docs/modules/utils/combine_docs_examples/embeddings.ipynb
+++ b/docs/modules/utils/combine_docs_examples/embeddings.ipynb
@@ -77,7 +77,6 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "42f76e43",
   "metadata": {},
@@ -138,7 +137,6 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "ed47bb62",
   "metadata": {},
@@ -196,11 +194,79 @@
   "source": [
    "doc_result = embeddings.embed_documents([text])"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fff4734f",
+   "metadata": {},
+   "source": [
+    "## TensorflowHub\n",
+    "Let's load the TensorflowHub Embedding class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f822104b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import TensorflowHubEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bac84e46",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-01-30 23:53:01.652176: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-01-30 23:53:34.362802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings = TensorflowHubEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4790d770",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This is a test document.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f556dcdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_result = embeddings.embed_query(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90f0db94",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "cohere",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -214,7 +280,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@@ -6,6 +6,7 @@ from langchain.embeddings.cohere import CohereEmbeddings
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings

 logger = logging.getLogger(__name__)

@@ -14,6 +15,7 @@ __all__ = [
    "HuggingFaceEmbeddings",
    "CohereEmbeddings",
    "HuggingFaceHubEmbeddings",
+    "TensorflowHubEmbeddings",
 ]


--- a/langchain/embeddings/tensorflow_hub.py
+++ b/langchain/embeddings/tensorflow_hub.py
@@ -0,0 +1,70 @@
+"""Wrapper around TensorflowHub embedding models."""
+from typing import Any, List
+
+from pydantic import BaseModel, Extra
+
+from langchain.embeddings.base import Embeddings
+
+DEFAULT_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+
+
+class TensorflowHubEmbeddings(BaseModel, Embeddings):
+    """Wrapper around tensorflow_hub embedding models.
+
+    To use, you should have the ``tensorflow_hub`` and ``tensorflow_text``
+    python packages installed.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import TensorflowHubEmbeddings
+            url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
+            tf = TensorflowHubEmbeddings(model_url=url)
+    """
+
+    embed: Any  #: :meta private:
+    model_url: str = DEFAULT_MODEL_URL
+    """URL of the TensorflowHub model to load."""
+
+    def __init__(self, **kwargs: Any):
+        """Validate the fields, then load the model from ``model_url``."""
+        super().__init__(**kwargs)
+        try:
+            import tensorflow_hub
+            import tensorflow_text  # noqa
+        except ImportError as e:
+            raise ValueError(
+                "Could not import tensorflow_hub or tensorflow_text python "
+                "packages. Please install them."
+            ) from e
+        # Outside the try: a failed load is not a missing-package error.
+        self.embed = tensorflow_hub.load(self.model_url)
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Compute doc embeddings using a TensorflowHub embedding model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        texts = [text.replace("\n", " ") for text in texts]
+        return self.embed(texts).numpy().tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Compute query embeddings using a TensorflowHub embedding model.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        # Pass a one-element batch, as embed_documents does; take its only row.
+        return self.embed([text.replace("\n", " ")]).numpy()[0].tolist()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ google-api-python-client = {version = "2.70.0", optional = true}
 wolframalpha = {version = "5.0.0", optional = true}
 qdrant-client = {version = "^0.11.7", optional = true}
 dataclasses-json = "^0.5.7"
+tensorflow-text = {version = "^2.11.0", optional = true, python = "^3.10, <3.12"}

 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@@ -81,7 +82,7 @@ playwright = "^1.28.0"

 [tool.poetry.extras]
 llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text"]

 [tool.isort]
 profile = "black"
--- a/tests/integration_tests/embeddings/test_tensorflow_hub.py
+++ b/tests/integration_tests/embeddings/test_tensorflow_hub.py
@@ -0,0 +1,19 @@
+"""Test TensorflowHub embeddings."""
+from langchain.embeddings import TensorflowHubEmbeddings
+
+
+def test_tensorflowhub_embedding_documents() -> None:
+    """Embed a single document with a default TensorflowHubEmbeddings."""
+    embedder = TensorflowHubEmbeddings()
+    vectors = embedder.embed_documents(["foo bar"])
+    # Expect one embedding holding 512 values.
+    assert len(vectors) == 1
+    assert len(vectors[0]) == 512
+
+
+def test_tensorflowhub_embedding_query() -> None:
+    """Embed a single query with a default TensorflowHubEmbeddings."""
+    embedder = TensorflowHubEmbeddings()
+    vector = embedder.embed_query("foo bar")
+    # Expect the embedding to hold 512 values.
+    assert len(vector) == 512