fix: elasticsearch (#2402)

- Create a new docker-compose file to start an Elasticsearch instance
for integration tests.
- Add new tests to `test_elasticsearch.py` to verify Elasticsearch
functionality.
- Include an optional group `test_integration` in the `pyproject.toml`
file. This group contains the dependencies for integration tests and
can be installed with `poetry install --with test_integration`. Any new
dependencies should be added by running
`poetry add some_new_deps --group test_integration`.

Note:
The new tests run in live mode, which involves end-to-end testing against the
OpenAI API. In the future, adding `pytest-vcr` to record and replay all
API requests would be a nice addition to the testing process. More info:
https://pytest-vcr.readthedocs.io/en/latest/
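
As a reference, a minimal sketch of how that could look (hedged: `@pytest.mark.vcr` and the `vcr_config` fixture are pytest-vcr's documented API, but the test body is illustrative):

```python
import pytest


@pytest.fixture(scope="module")
def vcr_config():
    # Keep the real OpenAI API key out of recorded cassettes.
    return {"filter_headers": ["authorization"]}


@pytest.mark.vcr()
def test_openai_embeddings_recorded() -> None:
    # First run (e.g. with --record-mode=once) hits the live OpenAI API
    # and records a cassette; later runs replay the recorded responses.
    ...
```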

Fixes https://github.com/hwchase17/langchain/issues/2386

.gitignore

@@ -141,3 +141,4 @@ wandb/
# asdf tool versions
.tool-versions
/.ruff_cache/

langchain/vectorstores/elastic_vector_search.py
@@ -241,7 +241,7 @@ class ElasticVectorSearch(VectorStore, ABC):
             raise ValueError(
                 "Your elasticsearch client string is misformatted. " f"Got error: {e} "
             )
-        index_name = uuid.uuid4().hex
+        index_name = kwargs.get("index_name", uuid.uuid4().hex)
         embeddings = embedding.embed_documents(texts)
         dim = len(embeddings[0])
         mapping = _default_text_mapping(dim)
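
In practice this means callers can pin the index name instead of always receiving a random hex one; a minimal usage sketch (the texts and URL are illustrative):

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch

# index_name is forwarded via **kwargs; when omitted it still
# falls back to uuid.uuid4().hex as before.
docsearch = ElasticVectorSearch.from_texts(
    ["foo", "bar", "baz"],
    OpenAIEmbeddings(),
    elasticsearch_url="http://localhost:9200",
    index_name="custom_index",
)
```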

poetry.lock

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -1144,7 +1144,6 @@ files = [
{file = "debugpy-1.6.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b5d1b13d7c7bf5d7cf700e33c0b8ddb7baf030fcf502f76fc061ddd9405d16c"},
{file = "debugpy-1.6.6-cp38-cp38-win32.whl", hash = "sha256:70ab53918fd907a3ade01909b3ed783287ede362c80c75f41e79596d5ccacd32"},
{file = "debugpy-1.6.6-cp38-cp38-win_amd64.whl", hash = "sha256:c05349890804d846eca32ce0623ab66c06f8800db881af7a876dc073ac1c2225"},
{file = "debugpy-1.6.6-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:11a0f3a106f69901e4a9a5683ce943a7a5605696024134b522aa1bfda25b5fec"},
{file = "debugpy-1.6.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a771739902b1ae22a120dbbb6bd91b2cae6696c0e318b5007c5348519a4211c6"},
{file = "debugpy-1.6.6-cp39-cp39-win32.whl", hash = "sha256:549ae0cb2d34fc09d1675f9b01942499751d174381b6082279cf19cdb3c47cbe"},
{file = "debugpy-1.6.6-cp39-cp39-win_amd64.whl", hash = "sha256:de4a045fbf388e120bb6ec66501458d3134f4729faed26ff95de52a754abddb1"},
@@ -4153,14 +4152,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
 
 [[package]]
 name = "openai"
-version = "0.27.3"
+version = "0.27.4"
 description = "Python client library for the OpenAI API"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-0.27.3-py3-none-any.whl", hash = "sha256:d5fca76f541f123a43d27baa5987a8a2949ae2b758180bdebd29b52f67d5ac4c"},
-    {file = "openai-0.27.3.tar.gz", hash = "sha256:0941a7322dc1ddbf15ed76702bb88d4f0c7586c3536433906dbd24cf6f2398d9"},
+    {file = "openai-0.27.4-py3-none-any.whl", hash = "sha256:3b82c867d531e1fd2003d9de2131e1c4bfd4c70b1a3149e0543a555b30807b70"},
+    {file = "openai-0.27.4.tar.gz", hash = "sha256:9f9d27d26e62c6068f516c0729449954b5ef6994be1a6cbfe7dbefbc84423a04"},
 ]
 
 [package.dependencies]
@@ -6813,7 +6812,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -7552,7 +7551,7 @@ name = "tqdm"
 version = "4.65.0"
 description = "Fast, Extensible Progress Meter"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.7"
 files = [
     {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"},
@@ -8473,13 +8472,13 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker
 testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
 
 [extras]
-all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm"]
+all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "boto3", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
+llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "d8c864a82150029b9f7cd66dd8bf9b77c985ace08475d9fbca4d9d11712c53d6"
+content-hash = "bd1c3cfb286c9e27e189bad22cfa272223234a38fec4f6c7220fe181d133aa78"

pyproject.toml
@@ -82,6 +82,13 @@ freezegun = "^1.2.2"
 responses = "^0.22.0"
 pytest-asyncio = "^0.20.3"
 
+[tool.poetry.group.test_integration]
+optional = true
+
+[tool.poetry.group.test_integration.dependencies]
+openai = "^0.27.4"
+elasticsearch = {extras = ["async"], version = "^8.6.2"}
+
 [tool.poetry.group.lint.dependencies]
 ruff = "^0.0.249"
 types-toml = "^0.10.8.1"
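
As a quick sanity check that the group above is declared as optional, the file can be read back with the standard-library `tomllib`; a minimal sketch (Python 3.11+; the third-party `tomli` package has the same API on older interpreters):

```python
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

group = pyproject["tool"]["poetry"]["group"]["test_integration"]
assert group["optional"] is True
print(sorted(group["dependencies"]))  # ['elasticsearch', 'openai']
```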

tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml
@@ -0,0 +1,30 @@
+version: "3"
+
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - xpack.security.http.ssl.enabled=false
+      - ELASTIC_PASSWORD=password
+    ports:
+      - "9200:9200"
+    healthcheck:
+      test: [ "CMD-SHELL", "curl --silent --fail http://localhost:9200/_cluster/health || exit 1" ]
+      interval: 1s
+      retries: 360
+
+  kibana:
+    image: docker.elastic.co/kibana/kibana:8.7.0
+    environment:
+      - ELASTICSEARCH_URL=http://elasticsearch:9200
+      - ELASTICSEARCH_USERNAME=kibana_system
+      - ELASTICSEARCH_PASSWORD=password
+      - KIBANA_PASSWORD=password
+    ports:
+      - "5601:5601"
+    healthcheck:
+      test: [ "CMD-SHELL", "curl --silent --fail http://localhost:5601/login || exit 1" ]
+      interval: 10s
+      retries: 60
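
Note that the healthcheck drives container status (and any `depends_on` consumers inside the compose file); a test session started from the host can still race the node's startup. A small readiness-wait helper is one option; a sketch using the official Python client (the helper name and timeouts are my own):

```python
import time

from elasticsearch import Elasticsearch


def wait_for_elasticsearch(
    url: str = "http://localhost:9200", timeout: float = 60.0
) -> Elasticsearch:
    """Poll the node until it answers pings or the timeout expires."""
    es = Elasticsearch(hosts=url)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if es.ping():  # returns False rather than raising on most failures
            return es
        time.sleep(1.0)
    raise TimeoutError(f"Elasticsearch at {url} did not become ready")
```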

tests/integration_tests/vectorstores/fixtures/sharks.txt
@@ -0,0 +1,7 @@
+Sharks are a group of elasmobranch fish characterized by a cartilaginous skeleton, five to seven gill slits on the sides of the head, and pectoral fins that are not fused to the head. Modern sharks are classified within the clade Selachimorpha (or Selachii) and are the sister group to the Batoidea (rays and kin). Some sources extend the term "shark" as an informal category including extinct members of Chondrichthyes (cartilaginous fish) with a shark-like morphology, such as hybodonts and xenacanths. Shark-like chondrichthyans such as Cladoselache and Doliodus first appeared in the Devonian Period (419-359 Ma), though some fossilized chondrichthyan-like scales are as old as the Late Ordovician (458-444 Ma). The oldest modern sharks (selachians) are known from the Early Jurassic, about 200 Ma.
+
+Sharks range in size from the small dwarf lanternshark (Etmopterus perryi), a deep sea species that is only 17 centimetres (6.7 in) in length, to the whale shark (Rhincodon typus), the largest fish in the world, which reaches approximately 12 metres (40 ft) in length. They are found in all seas and are common to depths up to 2,000 metres (6,600 ft). They generally do not live in freshwater, although there are a few known exceptions, such as the bull shark and the river shark, which can be found in both seawater and freshwater.[3] Sharks have a covering of dermal denticles that protects their skin from damage and parasites in addition to improving their fluid dynamics. They have numerous sets of replaceable teeth.
+
+Several species are apex predators, which are organisms that are at the top of their food chain. Select examples include the tiger shark, blue shark, great white shark, mako shark, thresher shark, and hammerhead shark.
+
+Sharks are caught by humans for shark meat or shark fin soup. Many shark populations are threatened by human activities. Since 1970, shark populations have been reduced by 71%, mostly from overfishing.

tests/integration_tests/vectorstores/test_elasticsearch.py
@@ -1,29 +1,137 @@
 """Test ElasticSearch functionality."""
+import logging
+import os
+from typing import Generator, List, Union
+
+import pytest
+from elasticsearch import Elasticsearch
+
 from langchain.docstore.document import Document
+from langchain.document_loaders import TextLoader
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
+logging.basicConfig(level=logging.DEBUG)
+
+"""
+cd tests/integration_tests/vectorstores/docker-compose
+docker-compose -f elasticsearch.yml up
+"""
+
+
+class TestElasticsearch:
+    @pytest.fixture(scope="class", autouse=True)
+    def elasticsearch_url(self) -> Union[str, Generator[str, None, None]]:
+        """Return the elasticsearch url."""
+        url = "http://localhost:9200"
+        yield url
+
+        es = Elasticsearch(hosts=url)
+        # Clear all indexes
+        index_names = es.indices.get(index="_all").keys()
+        for index_name in index_names:
+            # print(index_name)
+            es.indices.delete(index=index_name)
+
+    @pytest.fixture(scope="class", autouse=True)
+    def openai_api_key(self) -> Union[str, Generator[str, None, None]]:
+        """Return the OpenAI API key."""
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+        if not openai_api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is not set")
+        yield openai_api_key
+
+    @pytest.fixture(scope="class")
+    def documents(self) -> Generator[List[Document], None, None]:
+        """Return a generator that yields a list of documents."""
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        documents = TextLoader(
+            os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt")
+        ).load()
+        yield text_splitter.split_documents(documents)
+
+    def test_similarity_search_without_metadata(self, elasticsearch_url: str) -> None:
+        """Test end to end construction and search without metadata."""
+        texts = ["foo", "bar", "baz"]
+        docsearch = ElasticVectorSearch.from_texts(
+            texts, FakeEmbeddings(), elasticsearch_url=elasticsearch_url
+        )
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo")]
+
+    def test_similarity_search_with_metadata(self, elasticsearch_url: str) -> None:
+        """Test end to end construction and search with metadata."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticVectorSearch.from_texts(
+            texts,
+            FakeEmbeddings(),
+            metadatas=metadatas,
+            elasticsearch_url=elasticsearch_url,
+        )
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={"page": 0})]
+
+    def test_default_index_from_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a default
+        ElasticSearch index using the 'from_documents'."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        elastic_vector_search = ElasticVectorSearch.from_documents(
+            documents=documents,
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+        )
+        search_result = elastic_vector_search.similarity_search("sharks")
+        print(search_result)
+        assert len(search_result) != 0
+
+    def test_custom_index_from_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a custom
+        ElasticSearch index using the 'from_documents'."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        elastic_vector_search = ElasticVectorSearch.from_documents(
+            documents=documents,
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+            index_name="custom_index",
+        )
+        es = Elasticsearch(hosts=elasticsearch_url)
+        index_names = es.indices.get(index="_all").keys()
+        assert "custom_index" in index_names
+        search_result = elastic_vector_search.similarity_search("sharks")
+        print(search_result)
+        assert len(search_result) != 0
+
+    def test_custom_index_add_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a custom
+        ElasticSearch index using the 'add_documents'."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        elastic_vector_search = ElasticVectorSearch(
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+            index_name="custom_index",
+        )
+        es = Elasticsearch(hosts=elasticsearch_url)
+        index_names = es.indices.get(index="_all").keys()
+        assert "custom_index" in index_names
+        elastic_vector_search.add_documents(documents)
+        search_result = elastic_vector_search.similarity_search("sharks")
+        print(search_result)
-
-
-def test_elasticsearch() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "bar", "baz"]
-    docsearch = ElasticVectorSearch.from_texts(
-        texts, FakeEmbeddings(), elasticsearch_url="http://localhost:9200"
-    )
-    output = docsearch.similarity_search("foo", k=1)
-    assert output == [Document(page_content="foo")]
-
-
-def test_elasticsearch_with_metadatas() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "bar", "baz"]
-    metadatas = [{"page": i} for i in range(len(texts))]
-    docsearch = ElasticVectorSearch.from_texts(
-        texts,
-        FakeEmbeddings(),
-        metadatas=metadatas,
-        elasticsearch_url="http://localhost:9200",
-    )
-    output = docsearch.similarity_search("foo", k=1)
-    assert output == [Document(page_content="foo", metadata={"page": 0})]
+        assert len(search_result) != 0
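
The similarity-search tests above stay offline by using `FakeEmbeddings` from the shared test helpers. For readers without the repo at hand, a minimal sketch of what such a stub can look like (an illustration of the idea, not the repository's exact implementation), assuming langchain's `Embeddings` interface:

```python
from typing import List

from langchain.embeddings.base import Embeddings


class FakeEmbeddings(Embeddings):
    """Deterministic, network-free embeddings for tests."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Give each text a distinct constant vector so nearest-neighbour
        # results are stable across runs.
        return [[1.0] * 9 + [float(i)] for i in range(len(texts))]

    def embed_query(self, text: str) -> List[float]:
        # Match the vector of the first document ("foo" in the tests above).
        return [1.0] * 9 + [0.0]
```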
