From a5bf8c9b9d991df179131f4ee50382a052e65e49 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 28 Mar 2023 15:18:03 -0700 Subject: [PATCH] Harrison/aleph alpha embeddings (#2117) Co-authored-by: Piotr Mazurek Co-authored-by: PiotrMazurek --- .../modules/indexes/examples/embeddings.ipynb | 905 ++++++++++++++++++ langchain/embeddings/__init__.py | 6 + langchain/embeddings/aleph_alpha.py | 218 +++++ 3 files changed, 1129 insertions(+) create mode 100644 docs/modules/indexes/examples/embeddings.ipynb create mode 100644 langchain/embeddings/aleph_alpha.py diff --git a/docs/modules/indexes/examples/embeddings.ipynb b/docs/modules/indexes/examples/embeddings.ipynb new file mode 100644 index 00000000..3f19830e --- /dev/null +++ b/docs/modules/indexes/examples/embeddings.ipynb @@ -0,0 +1,905 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "249b4058", + "metadata": {}, + "source": [ + "# Embeddings\n", + "\n", + "This notebook goes over how to use the Embedding class in LangChain.\n", + "\n", + "The Embedding class is a class designed for interfacing with embeddings. There are lots of Embedding providers (OpenAI, Cohere, Hugging Face, etc) - this class is designed to provide a standard interface for all of them.\n", + "\n", + "Embeddings create a vector representation of a piece of text. This is useful because it means we can think about text in the vector space, and do things like semantic search where we look for pieces of text that are most similar in the vector space.\n", + "\n", + "The base Embedding class in LangChain exposes two methods: `embed_documents` and `embed_query`. The largest difference is that these two methods have different interfaces: one works over multiple documents, while the other works over a single document. Besides this, another reason for having these as two separate methods is that some embedding providers have different embedding methods for documents (to be searched over) vs queries (the search query itself)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "278b6c63", + "metadata": {}, + "source": [ + "## OpenAI\n", + "\n", + "Let's load the OpenAI Embedding class." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0be1af71", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import OpenAIEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2c66e5da", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "01370375", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test document.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bfb6142c", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0356c3b7", + "metadata": {}, + "outputs": [], + "source": [ + "doc_result = embeddings.embed_documents([text])" + ] + }, + { + "cell_type": "markdown", + "id": "bb61bbeb", + "metadata": {}, + "source": [ + "Let's load the OpenAI Embedding class with first generation models (e.g. text-search-ada-doc-001/text-search-ada-query-001). 
Note: These are not recommended models - see [here](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b072cc", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a56b70f5", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings(model_name=\"ada\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14aefb64", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test document.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c39ed33", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3221db6", + "metadata": {}, + "outputs": [], + "source": [ + "doc_result = embeddings.embed_documents([text])" + ] + }, + { + "cell_type": "markdown", + "id": "c3852491", + "metadata": {}, + "source": [ + "## AzureOpenAI\n", + "\n", + "Let's load the OpenAI Embedding class with environment variables set to indicate to use Azure endpoints." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b40f827", + "metadata": {}, + "outputs": [], + "source": [ + "# set the environment variables needed for openai package to know to reach out to azure\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_TYPE\"] = \"azure\"\n", + "os.environ[\"OPENAI_API_BASE\"] = \"https://'],\n", + "# ssh_creds={'ssh_user': '...', 'ssh_private_key':''},\n", + "# name='my-cluster')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1230f7df", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = SelfHostedHuggingFaceEmbeddings(hardware=gpu)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2684e928", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test document.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dc5e606", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "markdown", + "id": "cef9cc54", + "metadata": {}, + "source": [ + "And similarly for SelfHostedHuggingFaceInstructEmbeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81a17ca3", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = SelfHostedHuggingFaceInstructEmbeddings(hardware=gpu)" + ] + }, + { + "cell_type": "markdown", + "id": "5a33d1c8", + "metadata": {}, + "source": [ + "Now let's load an embedding model with a custom load function:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c4af5679", + "metadata": {}, + "outputs": [], + "source": [ + "def get_pipeline():\n", + " from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " pipeline,\n", + " ) # Must be inside the function in notebooks\n", + "\n", + " model_id = \"facebook/bart-base\"\n", + " tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + " model = 
AutoModelForCausalLM.from_pretrained(model_id)\n", + " return pipeline(\"feature-extraction\", model=model, tokenizer=tokenizer)\n", + "\n", + "\n", + "def inference_fn(pipeline, prompt):\n", + " # Return last hidden state of the model\n", + " if isinstance(prompt, list):\n", + " return [emb[0][-1] for emb in pipeline(prompt)]\n", + " return pipeline(prompt)[0][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8654334b", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = SelfHostedEmbeddings(\n", + " model_load_fn=get_pipeline,\n", + " hardware=gpu,\n", + " model_reqs=[\"./\", \"torch\", \"transformers\"],\n", + " inference_fn=inference_fn,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc1bfd0f", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "markdown", + "id": "f9c02c78", + "metadata": {}, + "source": [ + "## Fake Embeddings\n", + "\n", + "LangChain also provides a fake embedding class. You can use this to test your pipelines." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2ffc2e4b", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import FakeEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "80777571", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = FakeEmbeddings(size=1352)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3ec9d8f0", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(\"foo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3b9ae9e1", + "metadata": {}, + "outputs": [], + "source": [ + "doc_results = embeddings.embed_documents([\"foo\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1f83f273", + "metadata": {}, + "source": [ + "## SageMaker Endpoint Embeddings\n", + "\n", + "Let's load the SageMaker Endpoints Embeddings class. The class can be used if you host, e.g. your own Hugging Face model on SageMaker.\n", + "\n", + "For instructions on how to do this, please see [here](https://www.philschmid.de/custom-inference-huggingface-sagemaker)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88d366bd", + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install langchain boto3" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1e9b926a", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "from langchain.embeddings import SagemakerEndpointEmbeddings\n", + "from langchain.llms.sagemaker_endpoint import ContentHandlerBase\n", + "import json\n", + "\n", + "\n", + "class ContentHandler(ContentHandlerBase):\n", + " content_type = \"application/json\"\n", + " accepts = \"application/json\"\n", + "\n", + " def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:\n", + " input_str = json.dumps({\"inputs\": prompt, **model_kwargs})\n", + " return input_str.encode('utf-8')\n", + " \n", + " def 
transform_output(self, output: bytes) -> str:\n", + " response_json = json.loads(output.read().decode(\"utf-8\"))\n", + " return response_json[\"embeddings\"]\n", + "\n", + "content_handler = ContentHandler()\n", + "\n", + "\n", + "embeddings = SagemakerEndpointEmbeddings(\n", + " # endpoint_name=\"endpoint-name\", \n", + " # credentials_profile_name=\"credentials-profile-name\", \n", + " endpoint_name=\"huggingface-pytorch-inference-2023-03-21-16-14-03-834\", \n", + " region_name=\"us-east-1\", \n", + " content_handler=content_handler\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe9797b8", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(\"foo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "76f1b752", + "metadata": {}, + "outputs": [], + "source": [ + "doc_results = embeddings.embed_documents([\"foo\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fff99b21", + "metadata": {}, + "outputs": [], + "source": [ + "doc_results" + ] + }, + { + "cell_type": "markdown", + "id": "eb1c0ea9", + "metadata": {}, + "source": [ + "## Aleph Alpha\n", + "\n", + "There are two possible ways to use Aleph Alpha's semantic embeddings. If you have texts with a dissimilar structure (e.g. a Document and a Query) you would want to use asymmetric embeddings. Conversely, for texts with comparable structures, symmetric embeddings are the suggested approach." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9ecc84f9", + "metadata": {}, + "source": [ + "### Asymmetric" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8a920a89", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import AlephAlphaAsymmetricSemanticEmbedding" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f2d04da3", + "metadata": {}, + "outputs": [], + "source": [ + "document = \"This is a content of the document\"\n", + "query = \"What is the contnt of the document?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ecde96", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = AlephAlphaAsymmetricSemanticEmbedding()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90e68411", + "metadata": {}, + "outputs": [], + "source": [ + "doc_result = embeddings.embed_documents([document])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55903233", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(query)" + ] + }, + { + "cell_type": "markdown", + "id": "b8c00aab", + "metadata": {}, + "source": [ + "### Symmetric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eabb763a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0ad799f7", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test text\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af86dc10", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = AlephAlphaSymmetricSemanticEmbedding()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d292536f", + "metadata": {}, + "outputs": [], + "source": [ + "doc_result = embeddings.embed_documents([text])" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "c704a7cf", + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33492471", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "7377c2ccc78bc62c2683122d48c8cd1fb85a53850a1b1fc29736ed39852c9885" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py index 20e3e296..c1b33d7a 100644 --- a/langchain/embeddings/__init__.py +++ b/langchain/embeddings/__init__.py @@ -2,6 +2,10 @@ import logging from typing import Any +from langchain.embeddings.aleph_alpha import ( + AlephAlphaAsymmetricSemanticEmbedding, + AlephAlphaSymmetricSemanticEmbedding, +) from langchain.embeddings.cohere import CohereEmbeddings from langchain.embeddings.fake import FakeEmbeddings from langchain.embeddings.huggingface import ( @@ -34,6 +38,8 @@ __all__ = [ "SelfHostedHuggingFaceEmbeddings", "SelfHostedHuggingFaceInstructEmbeddings", "FakeEmbeddings", + "AlephAlphaAsymmetricSemanticEmbedding", + "AlephAlphaSymmetricSemanticEmbedding", ] diff --git a/langchain/embeddings/aleph_alpha.py b/langchain/embeddings/aleph_alpha.py new file mode 100644 index 00000000..6ae76bdb --- /dev/null +++ b/langchain/embeddings/aleph_alpha.py @@ -0,0 +1,218 @@ +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, root_validator + +from langchain.embeddings.base import Embeddings +from langchain.utils import get_from_dict_or_env + + +class 
AlephAlphaAsymmetricSemanticEmbedding(BaseModel, Embeddings): + """ + Wrapper for Aleph Alpha's Asymmetric Embeddings. + AA provides you with an endpoint to embed a document and a query. + The models were optimized to make the embeddings of a document and + the query about the document as similar to each other + as possible. + To learn more, check out: + https://docs.aleph-alpha.com/docs/tasks/semantic_embed/ + + Example: + .. code-block:: python + + from langchain.embeddings import AlephAlphaAsymmetricSemanticEmbedding + + embeddings = AlephAlphaAsymmetricSemanticEmbedding() + + document = "This is a content of the document" + query = "What is the content of the document?" + + doc_result = embeddings.embed_documents([document]) + query_result = embeddings.embed_query(query) + + """ + + client: Any #: :meta private: + + model: Optional[str] = "luminous-base" + """Model name to use.""" + hosting: Optional[str] = "https://api.aleph-alpha.com" + """Optional parameter that specifies which datacenters may process the request.""" + normalize: Optional[bool] = True + """Should returned embeddings be normalized""" + compress_to_size: Optional[int] = 128 + """Should the returned embeddings come back as an original 5120-dim vector, + or should it be compressed to 128-dim.""" + contextual_control_threshold: Optional[int] = None + """Attention control parameters only apply to those tokens that have + explicitly been set in the request.""" + control_log_additive: Optional[bool] = True + """Apply controls on prompt items by adding the log(control_factor) + to attention scores.""" + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that 
api key and python package exist in environment.""" + aleph_alpha_api_key = get_from_dict_or_env( + values, "aleph_alpha_api_key", "ALEPH_ALPHA_API_KEY" + ) + try: + from aleph_alpha_client import ( + Client, + ) + except ImportError: + raise ValueError( + "Could not import aleph_alpha_client python package. " + "Please it install it with `pip install aleph_alpha_client`." + ) + values["client"] = Client(token=aleph_alpha_api_key) + return values + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Call out to Aleph Alpha's asymmetric Document endpoint. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + try: + from aleph_alpha_client import ( + Prompt, + SemanticEmbeddingRequest, + SemanticRepresentation, + ) + except ImportError: + raise ValueError( + "Could not import aleph_alpha_client python package. " + "Please it install it with `pip install aleph_alpha_client`." + ) + document_embeddings = [] + + for text in texts: + document_params = { + "prompt": Prompt.from_text(text), + "representation": SemanticRepresentation.Document, + "compress_to_size": self.compress_to_size, + "normalize": self.normalize, + "contextual_control_threshold": self.contextual_control_threshold, + "control_log_additive": self.control_log_additive, + } + + document_request = SemanticEmbeddingRequest(**document_params) + document_response = self.client.semantic_embed( + request=document_request, model=self.model + ) + + document_embeddings.append(document_response.embedding) + + return document_embeddings + + def embed_query(self, text: str) -> List[float]: + """Call out to Aleph Alpha's asymmetric, query embedding endpoint + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + try: + from aleph_alpha_client import ( + Prompt, + SemanticEmbeddingRequest, + SemanticRepresentation, + ) + except ImportError: + raise ValueError( + "Could not import aleph_alpha_client python package. 
" + "Please it install it with `pip install aleph_alpha_client`." + ) + symmetric_params = { + "prompt": Prompt.from_text(text), + "representation": SemanticRepresentation.Query, + "compress_to_size": self.compress_to_size, + "normalize": self.normalize, + "contextual_control_threshold": self.contextual_control_threshold, + "control_log_additive": self.control_log_additive, + } + + symmetric_request = SemanticEmbeddingRequest(**symmetric_params) + symmetric_response = self.client.semantic_embed( + request=symmetric_request, model=self.model + ) + + return symmetric_response.embedding + + +class AlephAlphaSymmetricSemanticEmbedding(AlephAlphaAsymmetricSemanticEmbedding): + """The symmetric version of the Aleph Alpha's semantic embeddings. + + The main difference is that here, both the documents and + queries are embedded with a SemanticRepresentation.Symmetric. + Example: + .. code-block:: python + from langchain.embeddings import AlephAlphaSymmetricSemanticEmbedding + + embeddings = AlephAlphaSymmetricSemanticEmbedding() + text = "This is a test text" + + doc_result = embeddings.embed_documents([text]) + query_result = embeddings.embed_query(text) + """ + + def _embed(self, text: str) -> List[float]: + try: + from aleph_alpha_client import ( + Prompt, + SemanticEmbeddingRequest, + SemanticRepresentation, + ) + except ImportError: + raise ValueError( + "Could not import aleph_alpha_client python package. " + "Please it install it with `pip install aleph_alpha_client`." 
+ ) + query_params = { + "prompt": Prompt.from_text(text), + "representation": SemanticRepresentation.Symmetric, + "compress_to_size": self.compress_to_size, + "normalize": self.normalize, + "contextual_control_threshold": self.contextual_control_threshold, + "control_log_additive": self.control_log_additive, + } + + query_request = SemanticEmbeddingRequest(**query_params) + query_response = self.client.semantic_embed( + request=query_request, model=self.model + ) + + return query_response.embedding + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Call out to Aleph Alpha's symmetric embedding endpoint for each text. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + document_embeddings = [] + + for text in texts: + document_embeddings.append(self._embed(text)) + return document_embeddings + + def embed_query(self, text: str) -> List[float]: + """Call out to Aleph Alpha's symmetric embedding endpoint. + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + return self._embed(text)