From a7227ee01b55aa44d51474d36272539c02e2bf45 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 11 Jun 2023 13:35:14 -0700 Subject: [PATCH] Harrison/embaas (#6010) Co-authored-by: Julius Lipp <43986145+juliuslipp@users.noreply.github.com> --- .../text_embedding/examples/embaas.ipynb | 159 ++++++++++++++++++ langchain/embeddings/__init__.py | 2 + langchain/embeddings/embaas.py | 140 +++++++++++++++ .../embeddings/test_embaas.py | 58 +++++++ 4 files changed, 359 insertions(+) create mode 100644 docs/modules/models/text_embedding/examples/embaas.ipynb create mode 100644 langchain/embeddings/embaas.py create mode 100644 tests/integration_tests/embeddings/test_embaas.py diff --git a/docs/modules/models/text_embedding/examples/embaas.ipynb b/docs/modules/models/text_embedding/examples/embaas.ipynb new file mode 100644 index 00000000..cb5132e8 --- /dev/null +++ b/docs/modules/models/text_embedding/examples/embaas.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "[embaas](https://embaas.io) is a fully managed NLP API service that offers features like embedding generation, document text extraction, document to embeddings and more. You can choose a [variety of pre-trained models](https://embaas.io/docs/models/embeddings).\n", + "\n", + "In this tutorial, we will show you how to use the embaas Embeddings API to generate embeddings for a given text.\n", + "\n", + "### Prerequisites\n", + "Create your free embaas account at [https://embaas.io/register](https://embaas.io/register) and generate an [API key](https://embaas.io/dashboard/api-keys)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Set API key\n", + "embaas_api_key = \"YOUR_API_KEY\"\n", + "# or set environment variable\n", + "os.environ[\"EMBAAS_API_KEY\"] = \"YOUR_API_KEY\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.embeddings import EmbaasEmbeddings" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "embeddings = EmbaasEmbeddings()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Create embeddings for a single document\n", + "doc_text = \"This is a test document.\"\n", + "doc_text_embedding = embeddings.embed_query(doc_text)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-10T11:17:55.938517Z", + "end_time": "2023-06-10T11:17:55.940265Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Print created embedding\n", + "print(doc_text_embedding)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [], + "source": [ + "# Create embeddings for multiple documents\n", + "doc_texts = [\"This is a test document.\", \"This is another test document.\"]\n", + "doc_texts_embeddings = embeddings.embed_documents(doc_texts)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-10T11:19:25.235320Z", + "end_time": "2023-06-10T11:19:25.237161Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Print created embeddings\n", + "for i, doc_text_embedding in enumerate(doc_texts_embeddings):\n", + " print(f\"Embedding for document {i + 1}: {doc_text_embedding}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "# Using a different model and/or custom instruction\n", + "embeddings = EmbaasEmbeddings(model=\"instructor-large\", instruction=\"Represent the Wikipedia document for retrieval\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-10T11:22:26.138357Z", + "end_time": "2023-06-10T11:22:26.139769Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "For more detailed information about the embaas Embeddings API, please refer to [the official embaas API documentation](https://embaas.io/api-reference)." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py index dafc3e64..a54ea9aa 100644 --- a/langchain/embeddings/__init__.py +++ b/langchain/embeddings/__init__.py @@ -10,6 +10,7 @@ from langchain.embeddings.bedrock import BedrockEmbeddings from langchain.embeddings.cohere import CohereEmbeddings from langchain.embeddings.deepinfra import DeepInfraEmbeddings from langchain.embeddings.elasticsearch import ElasticsearchEmbeddings +from langchain.embeddings.embaas import EmbaasEmbeddings from langchain.embeddings.fake import FakeEmbeddings from langchain.embeddings.google_palm import GooglePalmEmbeddings from langchain.embeddings.huggingface import ( @@ -60,6 +61,7 @@ __all__ = [ "VertexAIEmbeddings", "BedrockEmbeddings", "DeepInfraEmbeddings", + "EmbaasEmbeddings", ] diff --git a/langchain/embeddings/embaas.py b/langchain/embeddings/embaas.py new file mode 100644 index 00000000..8a9134f7 --- /dev/null +++ b/langchain/embeddings/embaas.py @@ -0,0 +1,140 @@ +"""Wrapper around embaas embeddings API.""" +from typing import Any, Dict, List, Mapping, Optional + +import requests +from pydantic import BaseModel, Extra, root_validator +from typing_extensions import NotRequired, TypedDict + +from langchain.embeddings.base import Embeddings +from langchain.utils import get_from_dict_or_env + +# Currently supported maximum batch size for embedding requests +MAX_BATCH_SIZE = 256 +EMBAAS_API_URL = "https://api.embaas.io/v1/embeddings/" + + +class EmbaasEmbeddingsPayload(TypedDict): + """Payload for the embaas embeddings API.""" + + model: str + texts: List[str] + instruction: NotRequired[str] + + +class EmbaasEmbeddings(BaseModel, Embeddings): + """Wrapper around embaas's embedding service. + + To use, you should have the + environment variable ``EMBAAS_API_KEY`` set with your API key, or pass + it as a named parameter to the constructor. + + Example: + .. code-block:: python + + # Initialise with default model and instruction + from langchain.llms import EmbaasEmbeddings + emb = EmbaasEmbeddings() + + # Initialise with custom model and instruction + from langchain.llms import EmbaasEmbeddings + emb_model = "instructor-large" + emb_inst = "Represent the Wikipedia document for retrieval" + emb = EmbaasEmbeddings( + model=emb_model, + instruction=emb_inst, + embaas_api_key="your-api-key" + ) + """ + + model: str = "e5-large-v2" + """The model used for embeddings.""" + instruction: Optional[str] = None + """Instruction used for domain-specific embeddings.""" + api_url: str = EMBAAS_API_URL + """The URL for the embaas embeddings API.""" + embaas_api_key: Optional[str] = None + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key and python package exists in environment.""" + embaas_api_key = get_from_dict_or_env( + values, "embaas_api_key", "EMBAAS_API_KEY" + ) + values["embaas_api_key"] = embaas_api_key + return values + + @property + def _identifying_params(self) -> Mapping[str, Any]: + """Get the identifying params.""" + return {"model": self.model, "instruction": self.instruction} + + def _generate_payload(self, texts: List[str]) -> EmbaasEmbeddingsPayload: + """Generates payload for the API request.""" + payload = EmbaasEmbeddingsPayload(texts=texts, model=self.model) + if self.instruction: + payload["instruction"] = self.instruction + return payload + + def _handle_request(self, payload: EmbaasEmbeddingsPayload) -> List[List[float]]: + """Sends a request to the Embaas API and handles the response.""" + headers = { + "Authorization": f"Bearer {self.embaas_api_key}", + "Content-Type": "application/json", + } + + response = requests.post(self.api_url, headers=headers, json=payload) + response.raise_for_status() + + parsed_response = response.json() + embeddings = [item["embedding"] for item in parsed_response["data"]] + + return embeddings + + def _generate_embeddings(self, texts: List[str]) -> List[List[float]]: + """Generate embeddings using the Embaas API.""" + payload = self._generate_payload(texts) + try: + return self._handle_request(payload) + except requests.exceptions.RequestException as e: + if e.response is None or not e.response.text: + raise ValueError(f"Error raised by embaas embeddings API: {e}") + + parsed_response = e.response.json() + if "message" in parsed_response: + raise ValueError( + "Validation Error raised by embaas embeddings API:" + f"{parsed_response['message']}" + ) + raise + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Get embeddings for a list of texts. + + Args: + texts: The list of texts to get embeddings for. + + Returns: + List of embeddings, one for each text. + """ + batches = [ + texts[i : i + MAX_BATCH_SIZE] for i in range(0, len(texts), MAX_BATCH_SIZE) + ] + embeddings = [self._generate_embeddings(batch) for batch in batches] + # flatten the list of lists into a single list + return [embedding for batch in embeddings for embedding in batch] + + def embed_query(self, text: str) -> List[float]: + """Get embeddings for a single text. + + Args: + text: The text to get embeddings for. + + Returns: + List of embeddings. + """ + return self.embed_documents([text])[0] diff --git a/tests/integration_tests/embeddings/test_embaas.py b/tests/integration_tests/embeddings/test_embaas.py new file mode 100644 index 00000000..8a13f4d9 --- /dev/null +++ b/tests/integration_tests/embeddings/test_embaas.py @@ -0,0 +1,58 @@ +"""Test embaas embeddings.""" +import responses + +from langchain.embeddings.embaas import EMBAAS_API_URL, EmbaasEmbeddings + + +def test_embaas_embed_documents() -> None: + """Test embaas embeddings with multiple texts.""" + texts = ["foo bar", "bar foo", "foo"] + embedding = EmbaasEmbeddings() + output = embedding.embed_documents(texts) + assert len(output) == 3 + assert len(output[0]) == 1024 + assert len(output[1]) == 1024 + assert len(output[2]) == 1024 + + +def test_embaas_embed_query() -> None: + """Test embaas embeddings with multiple texts.""" + text = "foo" + embeddings = EmbaasEmbeddings() + output = embeddings.embed_query(text) + assert len(output) == 1024 + + +def test_embaas_embed_query_instruction() -> None: + """Test embaas embeddings with a different instruction.""" + text = "Test" + instruction = "query" + embeddings = EmbaasEmbeddings(instruction=instruction) + output = embeddings.embed_query(text) + assert len(output) == 1024 + + +def test_embaas_embed_query_model() -> None: + """Test embaas embeddings with a different model.""" + text = "Test" + model = "instructor-large" + instruction = "Represent the query for retrieval" + embeddings = EmbaasEmbeddings(model=model, instruction=instruction) + output = embeddings.embed_query(text) + assert len(output) == 768 + + +@responses.activate +def test_embaas_embed_documents_response() -> None: + """Test embaas embeddings with multiple texts.""" + responses.add( + responses.POST, + EMBAAS_API_URL, + json={"data": [{"embedding": [0.0] * 1024}]}, + status=200, + ) + + text = "asd" + embeddings = EmbaasEmbeddings() + output = embeddings.embed_query(text) + assert len(output) == 1024