# langchain/libs/community/tests/integration_tests/vectorstores/test_elasticsearch.py

"""Test ElasticSearch functionality."""
import logging
import os
import re
import uuid
from typing import Any, Dict, Generator, List, Union
import pytest
from langchain_core.documents import Document
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
FakeEmbeddings,
)
logging.basicConfig(level=logging.DEBUG)
"""
cd tests/integration_tests/vectorstores/docker-compose
docker-compose -f elasticsearch.yml up
By default runs against local docker instance of Elasticsearch.
To run against Elastic Cloud, set the following environment variables:
- ES_CLOUD_ID
- ES_USERNAME
- ES_PASSWORD
Some of the tests require the following models to be deployed in the ML Node:
- elser (can be downloaded and deployed through Kibana and trained models UI)
- sentence-transformers__all-minilm-l6-v2 (can be deployed
through API, loaded via eland)
These tests that require the models to be deployed are skipped by default.
Enable them by adding the model name to the modelsDeployed list below.
"""
modelsDeployed: List[str] = [
    # "elser",
    # "sentence-transformers__all-minilm-l6-v2",
]


class TestElasticsearch:
    @classmethod
    def setup_class(cls) -> None:
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")
@pytest.fixture(scope="class", autouse=True)
def elasticsearch_connection(self) -> Union[dict, Generator[dict, None, None]]:
# Running this integration test with Elastic Cloud
# Required for in-stack inference testing (ELSER + model_id)
from elasticsearch import Elasticsearch
es_url = os.environ.get("ES_URL", "http://localhost:9200")
cloud_id = os.environ.get("ES_CLOUD_ID")
es_username = os.environ.get("ES_USERNAME", "elastic")
es_password = os.environ.get("ES_PASSWORD", "changeme")
if cloud_id:
es = Elasticsearch(
cloud_id=cloud_id,
basic_auth=(es_username, es_password),
)
yield {
"es_cloud_id": cloud_id,
"es_user": es_username,
"es_password": es_password,
}
else:
# Running this integration test with local docker instance
es = Elasticsearch(hosts=es_url)
yield {"es_url": es_url}
# Clear all indexes
index_names = es.indices.get(index="_all").keys()
for index_name in index_names:
if index_name.startswith("test_"):
es.indices.delete(index=index_name)
es.indices.refresh(index="_all")
# clear all test pipelines
try:
response = es.ingest.get_pipeline(id="test_*,*_sparse_embedding")
for pipeline_id, _ in response.items():
try:
es.ingest.delete_pipeline(id=pipeline_id)
print(f"Deleted pipeline: {pipeline_id}")
except Exception as e:
print(f"Pipeline error: {e}")
except Exception:
pass
@pytest.fixture(scope="function")
def es_client(self) -> Any:
# Running this integration test with Elastic Cloud
# Required for in-stack inference testing (ELSER + model_id)
from elastic_transport import Transport
from elasticsearch import Elasticsearch
class CustomTransport(Transport):
requests = []
def perform_request(self, *args, **kwargs): # type: ignore
self.requests.append(kwargs)
return super().perform_request(*args, **kwargs)
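
        # CustomTransport captures each request's kwargs in `requests`, so tests
        # can later assert on headers (e.g. User-Agent) and request counts.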

        es_url = os.environ.get("ES_URL", "http://localhost:9200")
        cloud_id = os.environ.get("ES_CLOUD_ID")
        es_username = os.environ.get("ES_USERNAME", "elastic")
        es_password = os.environ.get("ES_PASSWORD", "changeme")

        if cloud_id:
            es = Elasticsearch(
                cloud_id=cloud_id,
                basic_auth=(es_username, es_password),
                transport_class=CustomTransport,
            )
            return es
        else:
            # Running this integration test with local docker instance
            es = Elasticsearch(hosts=es_url, transport_class=CustomTransport)
            return es
@pytest.fixture(scope="function")
def index_name(self) -> str:
"""Return the index name."""
return f"test_{uuid.uuid4().hex}"

    def test_similarity_search_without_metadata(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search without metadata."""

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == {
                "knn": {
                    "field": "vector",
                    "filter": [],
                    "k": 1,
                    "num_candidates": 50,
                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
                }
            }
            return query_body

        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
        )
        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
        assert output == [Document(page_content="foo")]

    async def test_similarity_search_without_metadata_async(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search without metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
        )
        output = await docsearch.asimilarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    def test_add_embeddings(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """
        Test add_embeddings, which accepts pre-built embeddings instead of
        using inference for the texts.
        This allows you to separate the embedding text from the page_content
        for better proximity between the user's question and the embedded text.
        For example, your embedding text can be a question, whereas page_content
        is the answer.
        """
        embeddings = ConsistentFakeEmbeddings()
        text_input = ["foo1", "foo2", "foo3"]
        metadatas = [{"page": i} for i in range(len(text_input))]

        # In a real use case, embedding_input could be a question for each text
        embedding_input = ["foo2", "foo3", "foo1"]
        embedding_vectors = embeddings.embed_documents(embedding_input)

        docsearch = ElasticsearchStore._create_cls_from_kwargs(
            embeddings,
            **elasticsearch_connection,
            index_name=index_name,
        )
        docsearch.add_embeddings(list(zip(text_input, embedding_vectors)), metadatas)
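
        # ConsistentFakeEmbeddings maps equal strings to equal vectors, so the
        # query "foo1" matches the vector built from embedding_input[2] ("foo1"),
        # which was stored alongside page_content "foo3" (page 2).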
output = docsearch.similarity_search("foo1", k=1)
assert output == [Document(page_content="foo3", metadata={"page": 2})]

    def test_similarity_search_with_metadata(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            ConsistentFakeEmbeddings(),
            metadatas=metadatas,
            **elasticsearch_connection,
            index_name=index_name,
        )

        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo", metadata={"page": 0})]

        output = docsearch.similarity_search("bar", k=1)
        assert output == [Document(page_content="bar", metadata={"page": 1})]

    def test_similarity_search_with_filter(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "foo", "foo"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            metadatas=metadatas,
            **elasticsearch_connection,
            index_name=index_name,
        )

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == {
                "knn": {
                    "field": "vector",
                    "filter": [{"term": {"metadata.page": "1"}}],
                    "k": 3,
                    "num_candidates": 50,
                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
                }
            }
            return query_body

        output = docsearch.similarity_search(
            query="foo",
            k=3,
            filter=[{"term": {"metadata.page": "1"}}],
            custom_query=assert_query,
        )
        assert output == [Document(page_content="foo", metadata={"page": 1})]

    def test_similarity_search_with_doc_builder(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        texts = ["foo", "foo", "foo"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            metadatas=metadatas,
            **elasticsearch_connection,
            index_name=index_name,
        )

        def custom_document_builder(_: Dict) -> Document:
            return Document(
                page_content="Mock content!",
                metadata={
                    "page_number": -1,
                    "original_filename": "Mock filename!",
                },
            )

        output = docsearch.similarity_search(
            query="foo", k=1, doc_builder=custom_document_builder
        )
        assert output[0].page_content == "Mock content!"
        assert output[0].metadata["page_number"] == -1
        assert output[0].metadata["original_filename"] == "Mock filename!"

    def test_similarity_search_exact_search(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.ExactRetrievalStrategy(),
        )
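
        # ExactRetrievalStrategy brute-forces scoring with a script_score query;
        # the "+ 1.0" shifts cosine similarity into [0, 2], since Elasticsearch
        # rejects negative scores.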
        expected_query = {
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",  # noqa: E501
                        "params": {
                            "query_vector": [
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                1.0,
                                0.0,
                            ]
                        },
                    },
                }
            }
        }

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == expected_query
            return query_body

        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
        assert output == [Document(page_content="foo")]

    def test_similarity_search_exact_search_with_filter(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            metadatas=metadatas,
            strategy=ElasticsearchStore.ExactRetrievalStrategy(),
        )

        def assert_query(query_body: dict, query: str) -> dict:
            expected_query = {
                "query": {
                    "script_score": {
                        "query": {"bool": {"filter": [{"term": {"metadata.page": 0}}]}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",  # noqa: E501
                            "params": {
                                "query_vector": [
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    0.0,
                                ]
                            },
                        },
                    }
                }
            }
            assert query_body == expected_query
            return query_body

        output = docsearch.similarity_search(
            "foo",
            k=1,
            custom_query=assert_query,
            filter=[{"term": {"metadata.page": 0}}],
        )
        assert output == [Document(page_content="foo", metadata={"page": 0})]

    def test_similarity_search_exact_search_distance_dot_product(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.ExactRetrievalStrategy(),
            distance_strategy="DOT_PRODUCT",
        )
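
        # Dot-product scores are unbounded, so the Painless script below maps
        # them into (0, 1) with a sigmoid before Elasticsearch ranks the hits.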

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == {
                "query": {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": """
            double value = dotProduct(params.query_vector, 'vector');
            return sigmoid(1, Math.E, -value);
            """,
                            "params": {
                                "query_vector": [
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    1.0,
                                    0.0,
                                ]
                            },
                        },
                    }
                }
            }
            return query_body

        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
        assert output == [Document(page_content="foo")]

    def test_similarity_search_exact_search_unknown_distance_strategy(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with unknown distance strategy."""
        with pytest.raises(KeyError):
            texts = ["foo", "bar", "baz"]
            ElasticsearchStore.from_texts(
                texts,
                FakeEmbeddings(),
                **elasticsearch_connection,
                index_name=index_name,
                strategy=ElasticsearchStore.ExactRetrievalStrategy(),
                distance_strategy="NOT_A_STRATEGY",
            )

    def test_max_marginal_relevance_search(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test max marginal relevance search."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.ExactRetrievalStrategy(),
        )

        mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=3)
        sim_output = docsearch.similarity_search(texts[0], k=3)
        assert mmr_output == sim_output

        mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=2, fetch_k=3)
        assert len(mmr_output) == 2
        assert mmr_output[0].page_content == texts[0]
        assert mmr_output[1].page_content == texts[1]

        mmr_output = docsearch.max_marginal_relevance_search(
            texts[0],
            k=2,
            fetch_k=3,
            lambda_mult=0.1,  # more diversity
        )
        assert len(mmr_output) == 2
        assert mmr_output[0].page_content == texts[0]
        assert mmr_output[1].page_content == texts[2]

        # if fetch_k < k, then the output will be less than k
        mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=2)
        assert len(mmr_output) == 2

    def test_similarity_search_approx_with_hybrid_search(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True),
        )
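
        # The hybrid strategy sends one request combining approximate kNN with a
        # BM25 "match" query; "rank": {"rrf": {}} asks Elasticsearch to merge the
        # two result lists via Reciprocal Rank Fusion.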

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == {
                "knn": {
                    "field": "vector",
                    "filter": [],
                    "k": 1,
                    "num_candidates": 50,
                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
                },
                "query": {
                    "bool": {
                        "filter": [],
                        "must": [{"match": {"text": {"query": "foo"}}}],
                    }
                },
                "rank": {"rrf": {}},
            }
            return query_body

        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
        assert output == [Document(page_content="foo")]

    def test_similarity_search_approx_with_hybrid_search_rrf(
        self, es_client: Any, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test end to end construction and rrf hybrid search with metadata."""
        from functools import partial
        from typing import Optional

        # 1. check query_body is okay
        rrf_test_cases: List[Optional[Union[dict, bool]]] = [
            True,
            False,
            {"rank_constant": 1, "window_size": 5},
        ]
        for rrf_test_case in rrf_test_cases:
            texts = ["foo", "bar", "baz"]
            docsearch = ElasticsearchStore.from_texts(
                texts,
                FakeEmbeddings(),
                **elasticsearch_connection,
                index_name=index_name,
                strategy=ElasticsearchStore.ApproxRetrievalStrategy(
                    hybrid=True, rrf=rrf_test_case
                ),
            )

            def assert_query(
                query_body: dict,
                query: str,
                rrf: Optional[Union[dict, bool]] = True,
            ) -> dict:
                cmp_query_body = {
                    "knn": {
                        "field": "vector",
                        "filter": [],
                        "k": 3,
                        "num_candidates": 50,
                        "query_vector": [
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            1.0,
                            0.0,
                        ],
                    },
                    "query": {
                        "bool": {
                            "filter": [],
                            "must": [{"match": {"text": {"query": "foo"}}}],
                        }
                    },
                }
                if isinstance(rrf, dict):
                    cmp_query_body["rank"] = {"rrf": rrf}
                elif isinstance(rrf, bool) and rrf is True:
                    cmp_query_body["rank"] = {"rrf": {}}

                assert query_body == cmp_query_body
                return query_body

            ## without fetch_k parameter
            output = docsearch.similarity_search(
                "foo", k=3, custom_query=partial(assert_query, rrf=rrf_test_case)
            )

        # 2. check query result is okay
        es_output = es_client.search(
            index=index_name,
            query={
                "bool": {
                    "filter": [],
                    "must": [{"match": {"text": {"query": "foo"}}}],
                }
            },
            knn={
                "field": "vector",
                "filter": [],
                "k": 3,
                "num_candidates": 50,
                "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
            },
            size=3,
            rank={"rrf": {"rank_constant": 1, "window_size": 5}},
        )

        assert [o.page_content for o in output] == [
            e["_source"]["text"] for e in es_output["hits"]["hits"]
        ]

        # 3. check rrf default option is okay
        docsearch = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True),
        )

        ## with fetch_k parameter
        output = docsearch.similarity_search(
            "foo", k=3, fetch_k=50, custom_query=assert_query
        )

    def test_similarity_search_approx_with_custom_query_fn(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test that the custom query function is called
        with the query string and query body."""

        def my_custom_query(query_body: dict, query: str) -> dict:
            assert query == "foo"
            assert query_body == {
                "knn": {
                    "field": "vector",
                    "filter": [],
                    "k": 1,
                    "num_candidates": 50,
                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
                }
            }
            return {"query": {"match": {"text": {"query": "bar"}}}}

        # The custom query replaces the kNN body entirely, so the search for
        # "foo" actually executes a match query for "bar".
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts, FakeEmbeddings(), **elasticsearch_connection, index_name=index_name
        )
        output = docsearch.similarity_search("foo", k=1, custom_query=my_custom_query)
        assert output == [Document(page_content="bar")]

    @pytest.mark.skipif(
        "sentence-transformers__all-minilm-l6-v2" not in modelsDeployed,
        reason="Sentence Transformers model not deployed in ML Node, skipping test",
    )
    def test_similarity_search_with_approx_infer_instack(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """test end to end with approx retrieval strategy and inference in-stack"""
        docsearch = ElasticsearchStore(
            index_name=index_name,
            strategy=ElasticsearchStore.ApproxRetrievalStrategy(
                query_model_id="sentence-transformers__all-minilm-l6-v2"
            ),
            query_field="text_field",
            vector_query_field="vector_query_field.predicted_value",
            **elasticsearch_connection,
        )

        # setting up the pipeline for inference
        docsearch.client.ingest.put_pipeline(
            id="test_pipeline",
            processors=[
                {
                    "inference": {
                        "model_id": "sentence-transformers__all-minilm-l6-v2",
                        "field_map": {"query_field": "text_field"},
                        "target_field": "vector_query_field",
                    }
                }
            ],
        )

        # creating a new index with the pipeline,
        # not relying on langchain to create the index
        docsearch.client.indices.create(
            index=index_name,
            mappings={
                "properties": {
                    "text_field": {"type": "text"},
                    "vector_query_field": {
                        "properties": {
                            "predicted_value": {
                                "type": "dense_vector",
                                "dims": 384,
                                "index": True,
                                "similarity": "l2_norm",
                            }
                        }
                    },
                }
            },
            settings={"index": {"default_pipeline": "test_pipeline"}},
        )

        # adding documents to the index
        texts = ["foo", "bar", "baz"]
        for i, text in enumerate(texts):
            docsearch.client.create(
                index=index_name,
                id=str(i),
                document={"text_field": text, "metadata": {}},
            )
        docsearch.client.indices.refresh(index=index_name)
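
        # With query_model_id set, the store sends a query_vector_builder so that
        # Elasticsearch embeds the query text in-stack rather than client-side.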

        def assert_query(query_body: dict, query: str) -> dict:
            assert query_body == {
                "knn": {
                    "filter": [],
                    "field": "vector_query_field.predicted_value",
                    "k": 1,
                    "num_candidates": 50,
                    "query_vector_builder": {
                        "text_embedding": {
                            "model_id": "sentence-transformers__all-minilm-l6-v2",
                            "model_text": "foo",
                        }
                    },
                }
            }
            return query_body

        output = docsearch.similarity_search("foo", k=1, custom_query=assert_query)
        assert output == [Document(page_content="foo")]

        output = docsearch.similarity_search("bar", k=1)
        assert output == [Document(page_content="bar")]

    @pytest.mark.skipif(
        "elser" not in modelsDeployed,
        reason="ELSER not deployed in ML Node, skipping test",
    )
    def test_similarity_search_with_sparse_infer_instack(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """test end to end with sparse retrieval strategy and inference in-stack"""
        texts = ["foo", "bar", "baz"]
        docsearch = ElasticsearchStore.from_texts(
            texts,
            **elasticsearch_connection,
            index_name=index_name,
            strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    def test_elasticsearch_with_relevance_score(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test to make sure the relevance score is scaled to 0-1."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": str(i)} for i in range(len(texts))]
        embeddings = FakeEmbeddings()

        docsearch = ElasticsearchStore.from_texts(
            index_name=index_name,
            texts=texts,
            embedding=embeddings,
            metadatas=metadatas,
            **elasticsearch_connection,
        )

        embedded_query = embeddings.embed_query("foo")
        output = docsearch.similarity_search_by_vector_with_relevance_scores(
            embedding=embedded_query, k=1
        )
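        # FakeEmbeddings is deterministic here: the query vector matches the
        # stored vector for "foo", so the normalized relevance score is exactly 1.0.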
        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]

    def test_elasticsearch_with_relevance_threshold(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test to make sure the relevance threshold is respected."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": str(i)} for i in range(len(texts))]
        embeddings = FakeEmbeddings()

        docsearch = ElasticsearchStore.from_texts(
            index_name=index_name,
            texts=texts,
            embedding=embeddings,
            metadatas=metadatas,
            **elasticsearch_connection,
        )

        # Find a good threshold for testing
        query_string = "foo"
        embedded_query = embeddings.embed_query(query_string)
        top3 = docsearch.similarity_search_by_vector_with_relevance_scores(
            embedding=embedded_query, k=3
        )
        similarity_of_second_ranked = top3[1][1]
        assert len(top3) == 3
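
        # Using the second-ranked score as the cutoff should keep exactly the
        # top two documents, since scores are returned in descending order.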
        # Test threshold
        retriever = docsearch.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": similarity_of_second_ranked},
        )
        output = retriever.get_relevant_documents(query=query_string)

        assert output == [
            top3[0][0],
            top3[1][0],
            # third ranked is out
        ]

    def test_elasticsearch_delete_ids(
        self, elasticsearch_connection: dict, index_name: str
    ) -> None:
        """Test delete methods from vector store."""
        texts = ["foo", "bar", "baz", "gni"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = ElasticsearchStore(
            embedding=ConsistentFakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
        )

        ids = docsearch.add_texts(texts, metadatas)
        output = docsearch.similarity_search("foo", k=10)
        assert len(output) == 4

        docsearch.delete(ids[1:3])
        output = docsearch.similarity_search("foo", k=10)
        assert len(output) == 2

        docsearch.delete(["not-existing"])
        output = docsearch.similarity_search("foo", k=10)
        assert len(output) == 2

        docsearch.delete([ids[0]])
        output = docsearch.similarity_search("foo", k=10)
        assert len(output) == 1

        docsearch.delete([ids[3]])
        output = docsearch.similarity_search("gni", k=10)
        assert len(output) == 0

    def test_elasticsearch_indexing_exception_error(
        self,
        elasticsearch_connection: dict,
        index_name: str,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Test bulk exception logging is giving better hints."""
        from elasticsearch.helpers import BulkIndexError

        docsearch = ElasticsearchStore(
            embedding=ConsistentFakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
        )

        docsearch.client.indices.create(
            index=index_name,
            mappings={"properties": {}},
            settings={"index": {"default_pipeline": "not-existing-pipeline"}},
        )
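
        # Pointing the index at a missing default pipeline forces a bulk failure;
        # the store should log the first error reason as a debugging hint.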
texts = ["foo"]
with pytest.raises(BulkIndexError):
docsearch.add_texts(texts)
error_reason = "pipeline with id [not-existing-pipeline] does not exist"
log_message = f"First error reason: {error_reason}"
assert log_message in caplog.text

    def test_elasticsearch_with_user_agent(
        self, es_client: Any, index_name: str
    ) -> None:
        """Test to make sure the user-agent is set correctly."""
        texts = ["foo", "bob", "baz"]
        ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            es_connection=es_client,
            index_name=index_name,
        )

        user_agent = es_client.transport.requests[0]["headers"]["User-Agent"]
        pattern = r"^langchain-py-vs/\d+\.\d+\.\d+$"
        match = re.match(pattern, user_agent)

        assert (
            match is not None
        ), f"The string '{user_agent}' does not match the expected pattern."

    def test_elasticsearch_with_internal_user_agent(
        self, elasticsearch_connection: Dict, index_name: str
    ) -> None:
        """Test to make sure the user-agent is set correctly."""
        texts = ["foo"]
        store = ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            **elasticsearch_connection,
            index_name=index_name,
        )

        user_agent = store.client._headers["User-Agent"]
        pattern = r"^langchain-py-vs/\d+\.\d+\.\d+$"
        match = re.match(pattern, user_agent)

        assert (
            match is not None
        ), f"The string '{user_agent}' does not match the expected pattern."

    def test_bulk_args(self, es_client: Any, index_name: str) -> None:
        """Test that bulk_kwargs are passed through to the bulk helper."""
        texts = ["foo", "bob", "baz"]
        ElasticsearchStore.from_texts(
            texts,
            FakeEmbeddings(),
            es_connection=es_client,
            index_name=index_name,
            bulk_kwargs={"chunk_size": 1},
        )

        # 1 for index exist, 1 for index create, 3 for index docs
        assert len(es_client.transport.requests) == 5  # type: ignore