Integrate NLP Cloud embeddings endpoint (#7931)

Add embeddings for [NLPCloud](https://docs.nlpcloud.com/#embeddings). --------- Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev>
2024-11-06 03:20:49 +00:00 · 2023-07-20 00:27:34 +02:00 · 2023-07-20 00:27:34 +02:00 · 3adab5e5be
commit 3adab5e5be
parent 854a2be0ca
3 changed files with 179 additions and 0 deletions
--- a/docs/extras/modules/data_connection/text_embedding/integrations/nlp_cloud.ipynb
+++ b/docs/extras/modules/data_connection/text_embedding/integrations/nlp_cloud.ipynb
@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6802946f",
+   "metadata": {},
+   "source": [
+    "# NLP Cloud\n",
+    "\n",
+    "NLP Cloud is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. \n",
+    "\n",
+    "The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers several models:\n",
+    "\n",
+    "* `paraphrase-multilingual-mpnet-base-v2`: Paraphrase Multilingual MPNet Base V2 is a very fast model based on Sentence Transformers that is perfectly suited for embeddings extraction in more than 50 languages (see the full list in the NLP Cloud documentation).\n",
+    "\n",
+    "* `gpt-j`: GPT-J returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower.\n",
+    "\n",
+    "* `dolphin`: Dolphin returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower. It natively understands the following languages: Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, French, German, Hungarian, Italian, Japanese, Polish, Portuguese, Romanian, Russian, Serbian, Slovenian, Spanish, Swedish, and Ukrainian."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "490d7923",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install nlpcloud"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6a39ed4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import NLPCloudEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c105d8cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"NLPCLOUD_API_KEY\"] = \"xxx\"\n",
+    "nlpcloud_embd = NLPCloudEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "cca84023",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This is a test document.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "26868d0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_result = nlpcloud_embd.embed_query(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0c171c2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc_result = nlpcloud_embd.embed_documents([text])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@ -27,6 +27,7 @@ from langchain.embeddings.minimax import MiniMaxEmbeddings
 from langchain.embeddings.mlflow_gateway import MlflowAIGatewayEmbeddings
 from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
 from langchain.embeddings.mosaicml import MosaicMLInstructorEmbeddings
+from langchain.embeddings.nlpcloud import NLPCloudEmbeddings
 from langchain.embeddings.octoai_embeddings import OctoAIEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
@ -73,6 +74,7 @@ __all__ = [
    "EmbaasEmbeddings",
    "OctoAIEmbeddings",
    "SpacyEmbeddings",
+    "NLPCloudEmbeddings",
    "GPT4AllEmbeddings",
 ]

--- a/langchain/embeddings/nlpcloud.py
+++ b/langchain/embeddings/nlpcloud.py
@ -0,0 +1,71 @@
+"""Wrapper around NLP Cloud embedding models."""
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, root_validator
+
+from langchain.embeddings.base import Embeddings
+from langchain.utils import get_from_dict_or_env
+
+
+class NLPCloudEmbeddings(BaseModel, Embeddings):
+    """Wrapper around NLP Cloud embedding models.
+
+    To use, you should have the nlpcloud python package installed
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import NLPCloudEmbeddings
+
+            embeddings = NLPCloudEmbeddings()
+    """
+
+    model_name: str  # Define model_name as a class attribute
+    client: Any  #: :meta private:
+
+    def __init__(
+        self, model_name: str = "paraphrase-multilingual-mpnet-base-v2", **kwargs: Any
+    ) -> None:
+        super().__init__(model_name=model_name, **kwargs)
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that api key and python package exists in environment."""
+        nlpcloud_api_key = get_from_dict_or_env(
+            values, "nlpcloud_api_key", "NLPCLOUD_API_KEY"
+        )
+        try:
+            import nlpcloud
+
+            values["client"] = nlpcloud.Client(
+                values["model_name"], nlpcloud_api_key, gpu=False, lang="en"
+            )
+        except ImportError:
+            raise ImportError(
+                "Could not import nlpcloud python package. "
+                "Please install it with `pip install nlpcloud`."
+            )
+        return values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents using NLP Cloud.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+
+        return self.client.embeddings(texts)["embeddings"]
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a query using NLP Cloud.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        return self.client.embeddings([text])["embeddings"][0]