mirror of https://github.com/hwchase17/langchain
Elasticsearch Store Improvements (#8636)
Todo: - [x] Connection options (cloud, localhost url, es_connection) support - [x] Logging support - [x] Customisable field support - [x] Distance Similarity support - [x] Metadata support - [x] Metadata Filter support - [x] Retrieval Strategies - [x] Approx - [x] Approx with Hybrid - [x] Exact - [x] Custom - [x] ELSER (excluding hybrid as we are working on RRF support) - [x] integration tests - [x] Documentation 👋 this is a contribution to improve Elasticsearch integration with Langchain. Its based loosely on the changes that are in master but with some notable changes: ## Package name & design improvements The import name is now `ElasticsearchStore`, to aid discoverability of the VectorStore. ```py ## Before from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch, ElasticKnnSearch ## Now from langchain.vectorstores.elasticsearch import ElasticsearchStore ``` ## Retrieval Strategy support Before we had a number of classes, depending on the strategy you wanted. `ElasticKnnSearch` for approx, `ElasticVectorSearch` for exact / brute force. With `ElasticsearchStore` we have retrieval strategies: ### Approx Example Default strategy for the vast majority of developers who use Elasticsearch will be inferring the embeddings from outside of Elasticsearch. Uses KNN functionality of _search. ```py texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), es_url="http://localhost:9200", index_name="sample-index" ) output = docsearch.similarity_search("foo", k=1) ``` ### Approx, with hybrid Developers who want to search, using both the embedding and the text bm25 match. Its simple to enable. 
```py texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), es_url="http://localhost:9200", index_name="sample-index", strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True) ) output = docsearch.similarity_search("foo", k=1) ``` ### Approx, with `query_model_id` Developers who want to infer within Elasticsearch, using the model loaded in the ml node. This relies on the developer to setup the pipeline and index if they wish to embed the text in Elasticsearch. Example of this in the test. ```py texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), es_url="http://localhost:9200", index_name="sample-index", strategy=ElasticsearchStore.ApproxRetrievalStrategy( query_model_id="sentence-transformers__all-minilm-l6-v2" ), ) output = docsearch.similarity_search("foo", k=1) ``` ### I want to provide my own custom Elasticsearch Query You might want to have more control over the query, to perform multi-phase retrieval such as LTR, linearly boosting on document parameters like recently updated or geo-distance. You can do this with `custom_query_fn` ```py def my_custom_query(query_body: dict, query: str) -> dict: return {"query": {"match": {"text": {"query": "bar"}}}} texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), **elasticsearch_connection, index_name=index_name ) docsearch.similarity_search("foo", k=1, custom_query=my_custom_query) ``` ### Exact Example Developers who have a small dataset in Elasticsearch, dont want the cost of indexing the dims vs tradeoff on cost at query time. Uses script_score. 
```py texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), es_url="http://localhost:9200", index_name="sample-index", strategy=ElasticsearchStore.ExactRetrievalStrategy(), ) output = docsearch.similarity_search("foo", k=1) ``` ### ELSER Example Elastic provides its own sparse vector model called ELSER. With these changes, it's really easy to use. The vector store creates a pipeline and index that's set up for ELSER. All the developer needs to do is configure, ingest, and query via LangChain tooling. ```py texts = ["foo", "bar", "baz"] docsearch = ElasticsearchStore.from_texts( texts, FakeEmbeddings(), es_url="http://localhost:9200", index_name="sample-index", strategy=ElasticsearchStore.SparseVectorStrategy(), ) output = docsearch.similarity_search("foo", k=1) ``` ## Architecture In future, we can introduce new strategies, allowing us to avoid breaking backwards compatibility (bwc) as we evolve the index / query strategy. ## Credit On release, could you credit @elastic and @phoey1 please? Thank you! --------- Co-authored-by: Bagatur <baskaryan@gmail.com>pull/9242/head
parent
71d5b7c9bf
commit
eac4ddb4bb
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,160 @@
|
|||||||
|
"""Test ElasticSearch functionality."""
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from typing import Generator, List, Union
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.embeddings import OpenAIEmbeddings
|
||||||
|
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
|
||||||
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
"""
|
||||||
|
cd tests/integration_tests/vectorstores/docker-compose
|
||||||
|
docker-compose -f elasticsearch.yml up
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TestElasticsearch:
    """Integration tests for ElasticVectorSearch.

    Requires a running local Elasticsearch instance, e.g.:

        cd tests/integration_tests/vectorstores/docker-compose
        docker-compose -f elasticsearch.yml up

    The OpenAI-embedding tests additionally need OPENAI_API_KEY to be set.
    """

    @classmethod
    def setup_class(cls) -> None:
        """Fail fast when the OpenAI API key required by some tests is unset."""
        # NOTE(review): only the OpenAIEmbeddings-based tests need this key;
        # the FakeEmbeddings-based tests would run fine without it.
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")

    @pytest.fixture(scope="class", autouse=True)
    def elasticsearch_url(self) -> Generator[str, None, None]:
        """Yield the Elasticsearch url, then clear test indexes on teardown."""
        from elasticsearch import Elasticsearch

        url = "http://localhost:9200"
        yield url

        # Teardown: remove indexes created during the tests so state does not
        # leak between test classes/runs.
        es = Elasticsearch(hosts=url)
        index_names = es.indices.get(index="_all").keys()
        for index_name in index_names:
            # Hidden/system indexes (dot-prefixed, e.g. ".security") must not
            # be deleted; attempting to do so raises on recent ES versions.
            if not index_name.startswith("."):
                es.indices.delete(index=index_name)

    def test_similarity_search_without_metadata(self, elasticsearch_url: str) -> None:
        """Test end to end construction and search without metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticVectorSearch.from_texts(
            texts, FakeEmbeddings(), elasticsearch_url=elasticsearch_url
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    @pytest.mark.skip(
        reason="Docker build has no ssl certs. Enable this test when testing with ssl."
    )
    def test_similarity_search_with_ssl_verify(self, elasticsearch_url: str) -> None:
        """Test end to end construction and search with ssl verify."""
        ssl_verify = {
            "verify_certs": True,
            "basic_auth": ("ES_USER", "ES_PASSWORD"),
            "ca_certs": "ES_CA_CERTS_PATH",
        }
        texts = ["foo", "bar", "baz"]
        # NOTE(review): the url is hard-coded here instead of using the
        # elasticsearch_url fixture argument — an https endpoint will be
        # needed when this skipped test is enabled; confirm then.
        docsearch = ElasticVectorSearch.from_texts(
            texts,
            FakeEmbeddings(),
            elasticsearch_url="http://localhost:9200",
            ssl_verify=ssl_verify,
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    def test_similarity_search_with_metadata(self, elasticsearch_url: str) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticVectorSearch.from_texts(
            texts,
            FakeEmbeddings(),
            metadatas=metadatas,
            elasticsearch_url=elasticsearch_url,
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo", metadata={"page": 0})]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_default_index_from_documents(
        self,
        documents: List[Document],
        embedding_openai: OpenAIEmbeddings,
        elasticsearch_url: str,
    ) -> None:
        """This test checks the construction of a default
        ElasticSearch index using the 'from_documents'."""
        # `documents` and `embedding_openai` are fixtures defined outside
        # this file (presumably in conftest.py) — TODO confirm.
        elastic_vector_search = ElasticVectorSearch.from_documents(
            documents=documents,
            embedding=embedding_openai,
            elasticsearch_url=elasticsearch_url,
        )

        search_result = elastic_vector_search.similarity_search("sharks")

        assert len(search_result) != 0

    @pytest.mark.vcr(ignore_localhost=True)
    def test_custom_index_from_documents(
        self,
        documents: List[Document],
        embedding_openai: OpenAIEmbeddings,
        elasticsearch_url: str,
    ) -> None:
        """This test checks the construction of a custom
        ElasticSearch index using the 'from_documents'."""
        from elasticsearch import Elasticsearch

        # Unique index name per test run to avoid collisions.
        index_name = f"custom_index_{uuid.uuid4().hex}"
        elastic_vector_search = ElasticVectorSearch.from_documents(
            documents=documents,
            embedding=embedding_openai,
            elasticsearch_url=elasticsearch_url,
            index_name=index_name,
        )
        # Verify the named index was actually created server-side.
        es = Elasticsearch(hosts=elasticsearch_url)
        index_names = es.indices.get(index="_all").keys()
        assert index_name in index_names

        search_result = elastic_vector_search.similarity_search("sharks")

        assert len(search_result) != 0

    @pytest.mark.vcr(ignore_localhost=True)
    def test_custom_index_add_documents(
        self,
        documents: List[Document],
        embedding_openai: OpenAIEmbeddings,
        elasticsearch_url: str,
    ) -> None:
        """This test checks the construction of a custom
        ElasticSearch index using the 'add_documents'."""
        from elasticsearch import Elasticsearch

        index_name = f"custom_index_{uuid.uuid4().hex}"
        elastic_vector_search = ElasticVectorSearch(
            embedding=embedding_openai,
            elasticsearch_url=elasticsearch_url,
            index_name=index_name,
        )
        es = Elasticsearch(hosts=elasticsearch_url)
        elastic_vector_search.add_documents(documents)

        index_names = es.indices.get(index="_all").keys()
        assert index_name in index_names

        search_result = elastic_vector_search.similarity_search("sharks")

        assert len(search_result) != 0

    @pytest.mark.skip(reason="TODO: implement this test")
    def test_custom_index_add_documents_to_exists_store(self) -> None:
        """Placeholder for adding documents to a pre-existing store.

        Marked skipped so the empty body cannot silently report success.
        """
        # TODO: implement it
        pass
|
Loading…
Reference in New Issue