mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
0b542a9706
This PR introduces a new module, `elasticsearch_embeddings.py`, which provides a wrapper around Elasticsearch embedding models. The new ElasticsearchEmbeddings class allows users to generate embeddings for documents and query texts using a [model deployed in an Elasticsearch cluster](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-model-ref.html#ml-nlp-model-ref-text-embedding). ### Main features: 1. The ElasticsearchEmbeddings class is initialized with an Elasticsearch connection object and a model_id, providing an interface to interact with the Elasticsearch ML client through [infer_trained_model](https://elasticsearch-py.readthedocs.io/en/v8.7.0/api.html?highlight=trained%20model%20infer#elasticsearch.client.MlClient.infer_trained_model). 2. The `embed_documents()` method generates embeddings for a list of documents, and the `embed_query()` method generates an embedding for a single query text. 3. The class supports custom input text field names in case the deployed model expects a different field name than the default `text_field`. 4. The implementation is compatible with any model deployed in Elasticsearch that generates embeddings as output. ### Benefits: 1. Simplifies the process of generating embeddings using Elasticsearch models. 2. Provides a clean and intuitive interface to interact with the Elasticsearch ML client. 3. Allows users to easily integrate Elasticsearch-generated embeddings. Related issue: https://github.com/hwchase17/langchain/issues/3400 --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
137 lines
3.4 KiB
Plaintext
137 lines
3.4 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"!pip install elasticsearch langchain"
|
|
],
|
|
"metadata": {
|
|
"id": "OOiBBjc0Kd-6"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"%env ES_CLOUDID=<cloud id from cloud.elastic.co>\n",
|
|
"%env ES_USER=<user>\n",
|
|
"%env ES_PASS=<password>\n",
|
|
"\n",
|
|
"es_cloudid = os.environ.get(\"ES_CLOUDID\")\n",
|
|
"es_user = os.environ.get(\"ES_USER\")\n",
|
|
"es_pass = os.environ.get(\"ES_PASS\")"
|
|
],
|
|
"metadata": {
|
|
"id": "Wr8unljAKdCh"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Connect to Elasticsearch\n",
|
|
"es_connection = Elasticsearch(cloud_id=es_cloudid, basic_auth=(es_user, es_pass))"
|
|
],
|
|
"metadata": {
|
|
"id": "YIDsrBqTKs85"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Define the model ID and input field name (if different from default)\n",
|
|
"model_id = \"your_model_id\"\n",
|
|
"input_field = \"your_input_field\" # Optional, only if different from 'text_field'"
|
|
],
|
|
"metadata": {
|
|
"id": "sfFhnFHOKvbM"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Initialize the ElasticsearchEmbeddings instance\n",
|
|
"embeddings_generator = ElasticsearchEmbeddings(es_connection, model_id, input_field)"
|
|
],
|
|
"metadata": {
|
|
"id": "V-pCgqLCKvYs"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Generate embeddings for a list of documents\n",
|
|
"documents = [\n",
|
|
" \"This is an example document.\",\n",
|
|
" \"Another example document to generate embeddings for.\",\n",
|
|
" ]\n",
|
|
"document_embeddings = embeddings_generator.embed_documents(documents)"
|
|
],
|
|
"metadata": {
|
|
"id": "lJg2iRDWKvV_"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Print the generated document embeddings\n",
|
|
"for i, doc_embedding in enumerate(document_embeddings):\n",
|
|
" print(f\"Embedding for document {i + 1}: {doc_embedding}\")"
|
|
],
|
|
"metadata": {
|
|
"id": "R3sYQlh3KvTQ"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Generate an embedding for a single query text\n",
|
|
"query_text = \"What is the meaning of life?\"\n",
|
|
"query_embedding = embeddings_generator.embed_query(query_text)"
|
|
],
|
|
"metadata": {
|
|
"id": "n0un5Vc0KvQd"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"# Print the generated query embedding\n",
|
|
"print(f\"Embedding for query: {query_embedding}\")"
|
|
],
|
|
"metadata": {
|
|
"id": "PANph6pmKvLD"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
}
|
|
]
|
|
} |