Integrate NLP Cloud embeddings endpoint (#7931)

Add embeddings for [NLPCloud](https://docs.nlpcloud.com/#embeddings).

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Lance Martin <lance@langchain.dev>
This commit is contained in:
Julien Salinas 2023-07-20 00:27:34 +02:00 committed by GitHub
parent 854a2be0ca
commit 3adab5e5be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 179 additions and 0 deletions

View File

@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6802946f",
"metadata": {},
"source": [
"# NLP Cloud\n",
"\n",
"NLP Cloud is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. \n",
"\n",
"The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers several models:\n",
"\n",
"* `paraphrase-multilingual-mpnet-base-v2`: Paraphrase Multilingual MPNet Base V2 is a very fast model based on Sentence Transformers that is perfectly suited for embeddings extraction in more than 50 languages (see the embeddings documentation linked above for the full list).\n",
"\n",
"* `gpt-j`: GPT-J returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower.\n",
"\n",
"* `dolphin`: Dolphin returns advanced embeddings. It might return better results than Sentence Transformers based models (see above) but it is also much slower. It natively understands the following languages: Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, French, German, Hungarian, Italian, Japanese, Polish, Portuguese, Romanian, Russian, Serbian, Slovenian, Spanish, Swedish, and Ukrainian."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "490d7923",
"metadata": {},
"outputs": [],
"source": [
"! pip install nlpcloud"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6a39ed4b",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import NLPCloudEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c105d8cd",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"NLPCLOUD_API_KEY\"] = \"xxx\"\n",
"nlpcloud_embd = NLPCloudEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cca84023",
"metadata": {},
"outputs": [],
"source": [
"text = \"This is a test document.\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "26868d0f",
"metadata": {},
"outputs": [],
"source": [
"query_result = nlpcloud_embd.embed_query(text)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0c171c2f",
"metadata": {},
"outputs": [],
"source": [
"doc_result = nlpcloud_embd.embed_documents([text])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -27,6 +27,7 @@ from langchain.embeddings.minimax import MiniMaxEmbeddings
from langchain.embeddings.mlflow_gateway import MlflowAIGatewayEmbeddings
from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
from langchain.embeddings.mosaicml import MosaicMLInstructorEmbeddings
from langchain.embeddings.nlpcloud import NLPCloudEmbeddings
from langchain.embeddings.octoai_embeddings import OctoAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
@ -73,6 +74,7 @@ __all__ = [
"EmbaasEmbeddings",
"OctoAIEmbeddings",
"SpacyEmbeddings",
"NLPCloudEmbeddings",
"GPT4AllEmbeddings",
]

View File

@ -0,0 +1,71 @@
"""Wrapper around NLP Cloud embedding models."""
from typing import Any, Dict, List
from pydantic import BaseModel, root_validator
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_dict_or_env
class NLPCloudEmbeddings(BaseModel, Embeddings):
    """Wrapper around NLP Cloud embedding models.

    To use, you should have the ``nlpcloud`` python package installed. The
    API key is resolved with ``get_from_dict_or_env`` under the key
    ``nlpcloud_api_key`` / the environment variable ``NLPCLOUD_API_KEY``.

    Example:
        .. code-block:: python

            from langchain.embeddings import NLPCloudEmbeddings

            embeddings = NLPCloudEmbeddings()
    """

    model_name: str  # Name of the NLP Cloud model; always supplied by __init__
    gpu: bool = False  # Forwarded as ``gpu=`` to ``nlpcloud.Client``
    client: Any  #: :meta private:

    def __init__(
        self,
        model_name: str = "paraphrase-multilingual-mpnet-base-v2",
        *,
        gpu: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create the wrapper.

        Args:
            model_name: NLP Cloud model used to compute the embeddings.
            gpu: Value passed as ``gpu=`` to ``nlpcloud.Client``. Defaults to
                ``False``, which is what was previously hard-coded.
            **kwargs: Forwarded unchanged to the pydantic ``BaseModel``
                constructor.
        """
        super().__init__(model_name=model_name, gpu=gpu, **kwargs)

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Stores a configured ``nlpcloud.Client`` under ``values["client"]``.

        Raises:
            ImportError: If the ``nlpcloud`` package cannot be imported.
        """
        # NOTE(review): ``nlpcloud_api_key`` is not a declared field, so a
        # keyword passed to the constructor may never show up in ``values``
        # here (pydantic drops unknown fields by default) -- confirm whether
        # only the environment variable is meant to be supported.
        nlpcloud_api_key = get_from_dict_or_env(
            values, "nlpcloud_api_key", "NLPCLOUD_API_KEY"
        )

        # Keep the try body to the import alone so that errors raised while
        # building the client are not mistaken for a missing package.
        try:
            import nlpcloud
        except ImportError as exc:
            raise ImportError(
                "Could not import nlpcloud python package. "
                "Please install it with `pip install nlpcloud`."
            ) from exc

        values["client"] = nlpcloud.Client(
            values["model_name"],
            nlpcloud_api_key,
            # ``gpu`` is absent from ``values`` if its own validation failed;
            # fall back to the historical default rather than raise KeyError.
            gpu=values.get("gpu", False),
            lang="en",
        )
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using NLP Cloud.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without calling the client.
        """
        if not texts:
            # Nothing to embed: skip the remote call entirely.
            return []
        return self.client.embeddings(texts)["embeddings"]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using NLP Cloud.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # The client is called with a one-element batch; unwrap its only row.
        return self.client.embeddings([text])["embeddings"][0]