From 46e181aa8bd3d116cb9a1f3c2488d44bd84b60a3 Mon Sep 17 00:00:00 2001 From: Jeff Vestal <53237856+jeffvestal@users.noreply.github.com> Date: Tue, 30 May 2023 19:26:30 -0500 Subject: [PATCH] Allow ElasticsearchEmbeddings to create a connection with ES Client object (#5321) This PR adds a new method `from_es_connection` to the `ElasticsearchEmbeddings` class allowing users to use Elasticsearch clusters outside of Elastic Cloud. Users can create an Elasticsearch Client object and pass that to the new function. The returned object is identical to the one returned by calling `from_credentials` ``` # Create Elasticsearch connection es_connection = Elasticsearch( hosts=['https://es_cluster_url:port'], basic_auth=('user', 'password') ) # Instantiate ElasticsearchEmbeddings using es_connection embeddings = ElasticsearchEmbeddings.from_es_connection( model_id, es_connection, ) ``` I also added examples to the elasticsearch jupyter notebook Fixes # https://github.com/hwchase17/langchain/issues/5239 --------- Co-authored-by: Dev 2049 --- .../examples/elasticsearch.ipynb | 374 ++++++++++++------ langchain/embeddings/elasticsearch.py | 63 +++ 2 files changed, 314 insertions(+), 123 deletions(-) diff --git a/docs/modules/models/text_embedding/examples/elasticsearch.ipynb b/docs/modules/models/text_embedding/examples/elasticsearch.ipynb index a9aa7988..3a9b6b7d 100644 --- a/docs/modules/models/text_embedding/examples/elasticsearch.ipynb +++ b/docs/modules/models/text_embedding/examples/elasticsearch.ipynb @@ -1,124 +1,252 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "code", - "source": [ - "!pip -q install elasticsearch langchain" - ], - "metadata": { - "id": "6dJxqebov4eU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import 
elasticsearch\n", - "from langchain.embeddings.elasticsearch import ElasticsearchEmbeddings" - ], - "metadata": { - "id": "RV7C3DUmv4aq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Define the model ID\n", - "model_id = 'your_model_id'" - ], - "metadata": { - "id": "MrT3jplJvp09" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Instantiate ElasticsearchEmbeddings using credentials\n", - "embeddings = ElasticsearchEmbeddings.from_credentials(\n", - " model_id,\n", - " es_cloud_id='your_cloud_id', \n", - " es_user='your_user', \n", - " es_password='your_password'\n", - ")\n" - ], - "metadata": { - "id": "svtdnC-dvpxR" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Create embeddings for multiple documents\n", - "documents = [\n", - " 'This is an example document.', \n", - " 'Another example document to generate embeddings for.'\n", - "]\n", - "document_embeddings = embeddings.embed_documents(documents)\n" - ], - "metadata": { - "id": "7DXZAK7Kvpth" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Print document embeddings\n", - "for i, embedding in enumerate(document_embeddings):\n", - " print(f\"Embedding for document {i+1}: {embedding}\")\n" - ], - "metadata": { - "id": "K8ra75W_vpqy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Create an embedding for a single query\n", - "query = 'This is a single query.'\n", - "query_embedding = embeddings.embed_query(query)\n" - ], - "metadata": { - "id": "V4Q5kQo9vpna" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Print query embedding\n", - "print(f\"Embedding for query: {query_embedding}\")\n" - ], - "metadata": { - "id": "O0oQDzGKvpkz" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of 
file + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "1eZl1oaVUNeC" + }, + "source": [ + "# Elasticsearch\n", + "Walkthrough of how to generate embeddings using a hosted embedding model in Elasticsearch\n", + "\n", + "The easiest way to instantiate the `ElasticsearchEmbeddings` class is either\n", + "- using the `from_credentials` constructor if you are using Elastic Cloud\n", + "- or using the `from_es_connection` constructor with any Elasticsearch cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6dJxqebov4eU" + }, + "outputs": [], + "source": [ + "!pip -q install elasticsearch langchain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RV7C3DUmv4aq" + }, + "outputs": [], + "source": [ + "import elasticsearch\n", + "from langchain.embeddings.elasticsearch import ElasticsearchEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MrT3jplJvp09" + }, + "outputs": [], + "source": [ + "# Define the model ID\n", + "model_id = 'your_model_id'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j5F-nwLVS_Zu" + }, + "source": [ + "## Testing with `from_credentials`\n", + "This requires an Elastic Cloud `cloud_id`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "svtdnC-dvpxR" + }, + "outputs": [], + "source": [ + "# Instantiate ElasticsearchEmbeddings using credentials\n", + "embeddings = ElasticsearchEmbeddings.from_credentials(\n", + " model_id,\n", + " es_cloud_id='your_cloud_id', \n", + " es_user='your_user', \n", + " es_password='your_password'\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7DXZAK7Kvpth" + }, + "outputs": [], + "source": [ + "# Create embeddings for multiple documents\n", + "documents = [\n", + " 'This is an example document.', \n", + " 'Another example document to generate embeddings for.'\n", + 
"]\n", + "document_embeddings = embeddings.embed_documents(documents)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K8ra75W_vpqy" + }, + "outputs": [], + "source": [ + "# Print document embeddings\n", + "for i, embedding in enumerate(document_embeddings):\n", + " print(f\"Embedding for document {i+1}: {embedding}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V4Q5kQo9vpna" + }, + "outputs": [], + "source": [ + "# Create an embedding for a single query\n", + "query = 'This is a single query.'\n", + "query_embedding = embeddings.embed_query(query)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "O0oQDzGKvpkz" + }, + "outputs": [], + "source": [ + "# Print query embedding\n", + "print(f\"Embedding for query: {query_embedding}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rHN03yV6TJ5q" + }, + "source": [ + "## Testing with Existing Elasticsearch client connection\n", + "This can be used with any Elasticsearch deployment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GMQcJDwBTJFm" + }, + "outputs": [], + "source": [ + "# Create Elasticsearch connection\n", + "es_connection = elasticsearch.Elasticsearch(\n", + " hosts=['https://es_cluster_url:port'], \n", + " basic_auth=('user', 'password')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WTYIU4u3TJO1" + }, + "outputs": [], + "source": [ + "# Instantiate ElasticsearchEmbeddings using es_connection\n", + "embeddings = ElasticsearchEmbeddings.from_es_connection(\n", + " model_id,\n", + " es_connection,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4gdAUHwoTJO3" + }, + "outputs": [], + "source": [ + "# Create embeddings for multiple documents\n", + "documents = [\n", + " 'This is an example document.', \n", + " 'Another example document to 
generate embeddings for.'\n", + "]\n", + "document_embeddings = embeddings.embed_documents(documents)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RC_-tov6TJO3" + }, + "outputs": [], + "source": [ + "# Print document embeddings\n", + "for i, embedding in enumerate(document_embeddings):\n", + " print(f\"Embedding for document {i+1}: {embedding}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6GEnHBqETJO3" + }, + "outputs": [], + "source": [ + "# Create an embedding for a single query\n", + "query = 'This is a single query.'\n", + "query_embedding = embeddings.embed_query(query)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-kyUQAXDTJO4" + }, + "outputs": [], + "source": [ + "# Print query embedding\n", + "print(f\"Embedding for query: {query_embedding}\")\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/langchain/embeddings/elasticsearch.py b/langchain/embeddings/elasticsearch.py index 78d7dec0..9d3b1192 100644 --- a/langchain/embeddings/elasticsearch.py +++ b/langchain/embeddings/elasticsearch.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, List, Optional from langchain.utils import get_from_env if TYPE_CHECKING: + from elasticsearch import Elasticsearch from elasticsearch.client import MlClient from langchain.embeddings.base import Embeddings @@ -110,6 +111,68 @@ class ElasticsearchEmbeddings(Embeddings): client = MlClient(es_connection) return cls(client, model_id, 
input_field=input_field) + @classmethod + def from_es_connection( + cls, + model_id: str, + es_connection: Elasticsearch, + input_field: str = "text_field", + ) -> ElasticsearchEmbeddings: + """ + Instantiate embeddings from an existing Elasticsearch connection. + + This method provides a way to create an instance of the ElasticsearchEmbeddings + class using an existing Elasticsearch connection. The connection object is used + to create an MlClient, which is then used to initialize the + ElasticsearchEmbeddings instance. + + Args: + model_id (str): The model_id of the model deployed in the Elasticsearch cluster. + es_connection (elasticsearch.Elasticsearch): An existing Elasticsearch + connection object. input_field (str, optional): The name of the key for the + input text field in the document. Defaults to 'text_field'. + + Returns: + ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class. + + Example Usage: + from elasticsearch import Elasticsearch + from langchain.embeddings import ElasticsearchEmbeddings + + # Define the model ID and input field name (if different from default) + model_id = "your_model_id" + # Optional, only if different from 'text_field' + input_field = "your_input_field" + + # Create Elasticsearch connection + es_connection = Elasticsearch( + hosts=["http://localhost:9200"], basic_auth=("user", "password") + ) + + # Instantiate ElasticsearchEmbeddings using the existing connection + embeddings = ElasticsearchEmbeddings.from_es_connection( + model_id, + es_connection, + input_field=input_field, + ) + + documents = [ + "This is an example document.", + "Another example document to generate embeddings for.", + ] + embeddings.embed_documents(documents) + """ + # Importing MlClient from elasticsearch.client within the method to + # avoid unnecessary import if the method is not used + from elasticsearch.client import MlClient + + # Create an MlClient from the given Elasticsearch connection + client = MlClient(es_connection) + + 
# Return a new instance of the ElasticsearchEmbeddings class with + # the MlClient, model_id, and input_field + return cls(client, model_id, input_field=input_field) + def _embedding_func(self, texts: List[str]) -> List[List[float]]: """ Generate embeddings for the given texts using the Elasticsearch model.