From b410dc76aaa833482a43b52db257e635d56b61e9 Mon Sep 17 00:00:00 2001
From: sergerdn <64213648+sergerdn@users.noreply.github.com>
Date: Wed, 5 Apr 2023 13:51:32 +0000
Subject: [PATCH] fix: elasticsearch (#2402)

- Create a new docker-compose file to start an Elasticsearch instance for
  integration tests.
- Add new tests to `test_elasticsearch.py` to verify Elasticsearch
  functionality.
- Include an optional group `test_integration` in the `pyproject.toml` file.
  This group contains the dependencies for integration tests and can be
  installed with `poetry install --with test_integration`. Any new dependency
  should be added by running `poetry add some_new_deps --group test_integration`.

Note: the new tests run in live mode, i.e. they exercise the OpenAI API end to
end. In the future, adding `pytest-vcr` to record and replay all API requests
would be a nice improvement to the testing process. More info:
https://pytest-vcr.readthedocs.io/en/latest/

Fixes https://github.com/hwchase17/langchain/issues/2386
---
 .gitignore                                    |   1 +
 .../vectorstores/elastic_vector_search.py     |   2 +-
 poetry.lock                                   |  21 ++-
 pyproject.toml                                |   7 +
 .../docker-compose/elasticsearch.yml          |  30 ++++
 .../vectorstores/fixtures/sharks.txt          |   7 +
 .../vectorstores/test_elasticsearch.py        | 152 +++++++++++++++---
 7 files changed, 186 insertions(+), 34 deletions(-)
 create mode 100644 tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml
 create mode 100644 tests/integration_tests/vectorstores/fixtures/sharks.txt

diff --git a/.gitignore b/.gitignore
index 354e08c2..0d7c9ce6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ wandb/
 
 # asdf tool versions
 .tool-versions
+/.ruff_cache/
diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py
index e808e80f..e0eb7b15 100644
--- a/langchain/vectorstores/elastic_vector_search.py
+++ b/langchain/vectorstores/elastic_vector_search.py
@@ -241,7 +241,7 @@ class ElasticVectorSearch(VectorStore, ABC):
             raise ValueError(
                 "Your elasticsearch client string is misformatted. " f"Got error: {e} "
             )
-        index_name = uuid.uuid4().hex
+        index_name = kwargs.get("index_name", uuid.uuid4().hex)
         embeddings = embedding.embed_documents(texts)
         dim = len(embeddings[0])
         mapping = _default_text_mapping(dim)
diff --git a/poetry.lock b/poetry.lock
index eb5c572c..c6f07a71 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -1144,7 +1144,6 @@ files = [
     {file = "debugpy-1.6.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b5d1b13d7c7bf5d7cf700e33c0b8ddb7baf030fcf502f76fc061ddd9405d16c"},
     {file = "debugpy-1.6.6-cp38-cp38-win32.whl", hash = "sha256:70ab53918fd907a3ade01909b3ed783287ede362c80c75f41e79596d5ccacd32"},
     {file = "debugpy-1.6.6-cp38-cp38-win_amd64.whl", hash = "sha256:c05349890804d846eca32ce0623ab66c06f8800db881af7a876dc073ac1c2225"},
-    {file = "debugpy-1.6.6-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:11a0f3a106f69901e4a9a5683ce943a7a5605696024134b522aa1bfda25b5fec"},
     {file = "debugpy-1.6.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a771739902b1ae22a120dbbb6bd91b2cae6696c0e318b5007c5348519a4211c6"},
     {file = "debugpy-1.6.6-cp39-cp39-win32.whl", hash = "sha256:549ae0cb2d34fc09d1675f9b01942499751d174381b6082279cf19cdb3c47cbe"},
     {file = "debugpy-1.6.6-cp39-cp39-win_amd64.whl", hash = "sha256:de4a045fbf388e120bb6ec66501458d3134f4729faed26ff95de52a754abddb1"},
@@ -4153,14 +4152,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
 
 [[package]]
 name = "openai"
-version = "0.27.3"
+version = "0.27.4"
 description = "Python client library for the OpenAI API"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.7.1"
 files = [
-    {file = "openai-0.27.3-py3-none-any.whl", hash = "sha256:d5fca76f541f123a43d27baa5987a8a2949ae2b758180bdebd29b52f67d5ac4c"},
-    {file = "openai-0.27.3.tar.gz", hash = "sha256:0941a7322dc1ddbf15ed76702bb88d4f0c7586c3536433906dbd24cf6f2398d9"},
+    {file = "openai-0.27.4-py3-none-any.whl", hash = "sha256:3b82c867d531e1fd2003d9de2131e1c4bfd4c70b1a3149e0543a555b30807b70"},
+    {file = "openai-0.27.4.tar.gz", hash = "sha256:9f9d27d26e62c6068f516c0729449954b5ef6994be1a6cbfe7dbefbc84423a04"},
 ]
 
 [package.dependencies]
@@ -6813,7 +6812,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -7552,7 +7551,7 @@ name = "tqdm"
 version = "4.65.0"
 description = "Fast, Extensible Progress Meter"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.7"
 files = [
     {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"},
@@ -8473,13 +8472,13 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker
 testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
 
 [extras]
-all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm"]
+all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "boto3", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
+llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "d8c864a82150029b9f7cd66dd8bf9b77c985ace08475d9fbca4d9d11712c53d6"
+content-hash = "bd1c3cfb286c9e27e189bad22cfa272223234a38fec4f6c7220fe181d133aa78"
diff --git a/pyproject.toml b/pyproject.toml
index 3db5082d..2b269d7c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,6 +82,13 @@ freezegun = "^1.2.2"
 responses = "^0.22.0"
 pytest-asyncio = "^0.20.3"
 
+[tool.poetry.group.test_integration]
+optional = true
+
+[tool.poetry.group.test_integration.dependencies]
+openai = "^0.27.4"
+elasticsearch = {extras = ["async"], version = "^8.6.2"}
+
 [tool.poetry.group.lint.dependencies]
 ruff = "^0.0.249"
 types-toml = "^0.10.8.1"
diff --git a/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml b/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml
new file mode 100644
index 00000000..609cf4e9
--- /dev/null
+++ b/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml
@@ -0,0 +1,30 @@
+version: "3"
+
+services:
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
+    environment:
+      - discovery.type=single-node
+      - xpack.security.enabled=false
+      - xpack.security.http.ssl.enabled=false
+      - ELASTIC_PASSWORD=password
+    ports:
+      - "9200:9200"
+    healthcheck:
+      test: [ "CMD-SHELL", "curl --silent --fail http://localhost:9200/_cluster/health || exit 1" ]
+      interval: 1s
+      retries: 360
+
+  kibana:
+    image: docker.elastic.co/kibana/kibana:8.7.0
+    environment:
+      - ELASTICSEARCH_URL=http://elasticsearch:9200
+      - ELASTICSEARCH_USERNAME=kibana_system
+      - ELASTICSEARCH_PASSWORD=password
+      - KIBANA_PASSWORD=password
+    ports:
+      - "5601:5601"
+    healthcheck:
+      test: [ "CMD-SHELL", "curl --silent --fail http://localhost:5601/login || exit 1" ]
+      interval: 10s
+      retries: 60
diff --git a/tests/integration_tests/vectorstores/fixtures/sharks.txt b/tests/integration_tests/vectorstores/fixtures/sharks.txt
new file mode 100644
index 00000000..b2aeb8f2
--- /dev/null
+++ b/tests/integration_tests/vectorstores/fixtures/sharks.txt
@@ -0,0 +1,7 @@
+Sharks are a group of elasmobranch fish characterized by a cartilaginous skeleton, five to seven gill slits on the sides of the head, and pectoral fins that are not fused to the head. Modern sharks are classified within the clade Selachimorpha (or Selachii) and are the sister group to the Batoidea (rays and kin). Some sources extend the term "shark" as an informal category including extinct members of Chondrichthyes (cartilaginous fish) with a shark-like morphology, such as hybodonts and xenacanths. Shark-like chondrichthyans such as Cladoselache and Doliodus first appeared in the Devonian Period (419-359 Ma), though some fossilized chondrichthyan-like scales are as old as the Late Ordovician (458-444 Ma). The oldest modern sharks (selachians) are known from the Early Jurassic, about 200 Ma.
+
+Sharks range in size from the small dwarf lanternshark (Etmopterus perryi), a deep sea species that is only 17 centimetres (6.7 in) in length, to the whale shark (Rhincodon typus), the largest fish in the world, which reaches approximately 12 metres (40 ft) in length. They are found in all seas and are common to depths up to 2,000 metres (6,600 ft). They generally do not live in freshwater, although there are a few known exceptions, such as the bull shark and the river shark, which can be found in both seawater and freshwater.[3] Sharks have a covering of dermal denticles that protects their skin from damage and parasites in addition to improving their fluid dynamics. They have numerous sets of replaceable teeth.
+
+Several species are apex predators, which are organisms that are at the top of their food chain. Select examples include the tiger shark, blue shark, great white shark, mako shark, thresher shark, and hammerhead shark.
+
+Sharks are caught by humans for shark meat or shark fin soup. Many shark populations are threatened by human activities. Since 1970, shark populations have been reduced by 71%, mostly from overfishing.
\ No newline at end of file
diff --git a/tests/integration_tests/vectorstores/test_elasticsearch.py b/tests/integration_tests/vectorstores/test_elasticsearch.py
index 075fab4a..c5222565 100644
--- a/tests/integration_tests/vectorstores/test_elasticsearch.py
+++ b/tests/integration_tests/vectorstores/test_elasticsearch.py
@@ -1,29 +1,137 @@
 """Test ElasticSearch functionality."""
+import logging
+import os
+from typing import Generator, List
+
+import pytest
+from elasticsearch import Elasticsearch
 from langchain.docstore.document import Document
+from langchain.document_loaders import TextLoader
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
+logging.basicConfig(level=logging.DEBUG)
+
+# To run these tests, start a local Elasticsearch instance first:
+#
+#   cd tests/integration_tests/vectorstores/docker-compose
+#   docker-compose -f elasticsearch.yml up
+
+
+class TestElasticsearch:
+    @pytest.fixture(scope="class", autouse=True)
+    def elasticsearch_url(self) -> Generator[str, None, None]:
+        """Yield the Elasticsearch URL; clear all indexes on teardown."""
+        url = "http://localhost:9200"
+        yield url
+        es = Elasticsearch(hosts=url)
+
+        # Clear all indexes created during the tests
+        index_names = es.indices.get(index="_all").keys()
+        for index_name in index_names:
+            es.indices.delete(index=index_name)
+
+    @pytest.fixture(scope="class", autouse=True)
+    def openai_api_key(self) -> Generator[str, None, None]:
+        """Yield the OpenAI API key."""
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+        if not openai_api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is not set")
+
+        yield openai_api_key
+
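+    # Both fixtures above are class-scoped and autouse: a single Elasticsearch
+    # URL and a single API-key check are shared by every test in this class,
+    # and the index cleanup runs once, after the last test has finished.
+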
+    @pytest.fixture(scope="class")
+    def documents(self) -> Generator[List[Document], None, None]:
+        """Return a generator that yields a list of documents."""
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+
+        documents = TextLoader(
+            os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt")
+        ).load()
+        yield text_splitter.split_documents(documents)
+
+    def test_similarity_search_without_metadata(self, elasticsearch_url: str) -> None:
+        """Test end to end construction and search without metadata."""
+        texts = ["foo", "bar", "baz"]
+        docsearch = ElasticVectorSearch.from_texts(
+            texts, FakeEmbeddings(), elasticsearch_url=elasticsearch_url
+        )
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo")]
+
+    def test_similarity_search_with_metadata(self, elasticsearch_url: str) -> None:
+        """Test end to end construction and search with metadata."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = ElasticVectorSearch.from_texts(
+            texts,
+            FakeEmbeddings(),
+            metadatas=metadatas,
+            elasticsearch_url=elasticsearch_url,
+        )
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={"page": 0})]
+
+    def test_default_index_from_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a default
+        ElasticSearch index using the 'from_documents' method."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+
+        elastic_vector_search = ElasticVectorSearch.from_documents(
+            documents=documents,
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+        )
+
+        search_result = elastic_vector_search.similarity_search("sharks")
+
+        print(search_result)
+        assert len(search_result) != 0
+
+    def test_custom_index_from_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a custom
+        ElasticSearch index using the 'from_documents' method."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        elastic_vector_search = ElasticVectorSearch.from_documents(
+            documents=documents,
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+            index_name="custom_index",
+        )
+        es = Elasticsearch(hosts=elasticsearch_url)
+        index_names = es.indices.get(index="_all").keys()
+        assert "custom_index" in index_names
+
+        search_result = elastic_vector_search.similarity_search("sharks")
+        print(search_result)
+
+        assert len(search_result) != 0
+
+    def test_custom_index_add_documents(
+        self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
+    ) -> None:
+        """This test checks the construction of a custom
+        ElasticSearch index using the 'add_documents' method."""
+        embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
+        elastic_vector_search = ElasticVectorSearch(
+            embedding=embedding,
+            elasticsearch_url=elasticsearch_url,
+            index_name="custom_index",
+        )
+        es = Elasticsearch(hosts=elasticsearch_url)
+        index_names = es.indices.get(index="_all").keys()
+        assert "custom_index" in index_names
+
+        elastic_vector_search.add_documents(documents)
+        search_result = elastic_vector_search.similarity_search("sharks")
+        print(search_result)
 
-def test_elasticsearch() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "bar", "baz"]
-    docsearch = ElasticVectorSearch.from_texts(
-        texts, FakeEmbeddings(), elasticsearch_url="http://localhost:9200"
-    )
-    output = docsearch.similarity_search("foo", k=1)
-    assert output == [Document(page_content="foo")]
-
-
-def test_elasticsearch_with_metadatas() -> None:
-    """Test end to end construction and search."""
-    texts = ["foo", "bar", "baz"]
-    metadatas = [{"page": i} for i in range(len(texts))]
-    docsearch = ElasticVectorSearch.from_texts(
-        texts,
-        FakeEmbeddings(),
-        metadatas=metadatas,
-        elasticsearch_url="http://localhost:9200",
-    )
-    output = docsearch.similarity_search("foo", k=1)
-    assert output == [Document(page_content="foo", metadata={"page": 0})]
+        assert len(search_result) != 0
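
Reviewer note (not part of the patch): the one-line change in
`elastic_vector_search.py` makes `ElasticVectorSearch.from_texts` honor an
optional `index_name` kwarg instead of always generating a random
`uuid4().hex` name, which is what the new `custom_index` tests exercise. A
minimal sketch of the resulting behavior, assuming the Elasticsearch
container from the compose file above is running on localhost:9200
("my_index" is an arbitrary example name):

    from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
    from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

    # Build a store against the local test cluster. Before this patch, the
    # index name was always a random uuid4 hex and could not be chosen.
    docsearch = ElasticVectorSearch.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        elasticsearch_url="http://localhost:9200",
        index_name="my_index",  # optional; omit to keep the old random name
    )
    print(docsearch.similarity_search("foo", k=1))

Since `kwargs.get("index_name", uuid.uuid4().hex)` falls back to the old
default, existing callers that never pass `index_name` are unaffected.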