forked from Archives/langchain
444ca3f669
Hi there: I previously implemented the AnalyticDB VectorStore using two tables to store documents. Using a single table seems to be a better approach, so this commit improves the AnalyticDB VectorStore implementation without affecting user behavior: **1. Streamline the `post_init` behavior by creating a single table with vector indexing. 2. Update the `add_texts` API for document insertion. 3. Optimize `similarity_search_with_score_by_vector` to retrieve results directly from the table. 4. Implement `_similarity_search_with_relevance_scores`. 5. Add an `embedding_dimension` parameter to support embedding functions with different dimensions.** Users can continue using the API as before. The test cases added previously are sufficient to cover this commit.
129 lines
4.6 KiB
Python
129 lines
4.6 KiB
Python
"""Test PGVector functionality."""
|
|
import os
|
|
from typing import List
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.vectorstores.analyticdb import AnalyticDB
|
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
|
|
|
# Connection string used by every test in this module. Each component is read
# from a PG_* environment variable and falls back to the default shown here.
CONNECTION_STRING = AnalyticDB.connection_string_from_db_params(
    driver=os.getenv("PG_DRIVER", "psycopg2cffi"),
    host=os.getenv("PG_HOST", "localhost"),
    port=int(os.getenv("PG_PORT", "5432")),
    database=os.getenv("PG_DATABASE", "postgres"),
    user=os.getenv("PG_USER", "postgres"),
    password=os.getenv("PG_PASSWORD", "postgres"),
)


# Number of components in each vector produced by the fake embeddings below.
ADA_TOKEN_COUNT = 1536
|
|
|
|
|
|
class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
    """Deterministic fake embeddings with ``ADA_TOKEN_COUNT`` components.

    Every vector consists of ``ADA_TOKEN_COUNT - 1`` ones followed by a single
    distinguishing value in the last position.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one vector per text; the last component is the text's index."""
        ones = [1.0] * (ADA_TOKEN_COUNT - 1)
        vectors = []
        for position, _ in enumerate(texts):
            # Only the position matters; the text content itself is ignored.
            vectors.append(ones + [float(position)])
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Return a vector identical to the one produced for the first document."""
        ones = [1.0] * (ADA_TOKEN_COUNT - 1)
        return ones + [0.0]
|
|
|
|
|
|
def test_analyticdb() -> None:
    """Build a store from three texts and check the top hit for a query.

    The fake query embedding equals the embedding of the first text, so the
    single result is expected to be ``"foo"``.
    """
    store = AnalyticDB.from_texts(
        texts=["foo", "bar", "baz"],
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
|
|
|
|
|
|
def test_analyticdb_with_metadatas() -> None:
    """Check that the top hit carries the metadata it was inserted with.

    Each text is stored with a ``{"page": "<index>"}`` metadata dict; the
    single result is expected to be ``"foo"`` with page ``"0"``.
    """
    corpus = ["foo", "bar", "baz"]
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = AnalyticDB.from_texts(
        texts=corpus,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=page_metadata,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
|
|
|
|
|
|
def test_analyticdb_with_metadatas_with_scores() -> None:
    """Check the document, metadata and score returned for the top hit.

    The query embedding is identical to the first document's embedding, so the
    expected result is ``"foo"`` (page ``"0"``) with a score of ``0.0``.
    """
    corpus = ["foo", "bar", "baz"]
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = AnalyticDB.from_texts(
        texts=corpus,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=page_metadata,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    scored_hits = store.similarity_search_with_score("foo", k=1)
    best_match = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(best_match, 0.0)]
|
|
|
|
|
|
def test_analyticdb_with_filter_match() -> None:
    """Search with a metadata filter that selects the closest document.

    Filtering on ``{"page": "0"}`` keeps the first document, whose embedding
    equals the query embedding, so ``"foo"`` is expected with score ``0.0``.
    """
    corpus = ["foo", "bar", "baz"]
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = AnalyticDB.from_texts(
        texts=corpus,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=page_metadata,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    scored_hits = store.similarity_search_with_score(
        "foo", k=1, filter={"page": "0"}
    )
    best_match = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(best_match, 0.0)]
|
|
|
|
|
|
def test_analyticdb_with_filter_distant_match() -> None:
    """Search with a metadata filter that selects a non-nearest document.

    Filtering on ``{"page": "2"}`` keeps only the third document, so ``"baz"``
    is expected even though it is not the closest vector to the query.
    """
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = AnalyticDB.from_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})
    # The query and "baz" vectors differ only in the last component (0.0 vs 2.0);
    # NOTE(review): 4.0 presumably is the squared Euclidean distance — confirm
    # against the AnalyticDB distance metric.
    assert output == [(Document(page_content="baz", metadata={"page": "2"}), 4.0)]
|
|
|
|
|
|
def test_analyticdb_with_filter_no_match() -> None:
    """Search with a metadata filter that matches no stored document.

    The stored pages are ``"0"``, ``"1"`` and ``"2"``; filtering on page
    ``"5"`` is expected to yield an empty result list.
    """
    corpus = ["foo", "bar", "baz"]
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = AnalyticDB.from_texts(
        texts=corpus,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=page_metadata,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    scored_hits = store.similarity_search_with_score(
        "foo", k=1, filter={"page": "5"}
    )
    assert scored_hits == []
|