mirror of https://github.com/hwchase17/langchain
community[minor]: Add VDMS vectorstore (#19551)
- **Description:** Add support for Intel Lab's [Visual Data Management System (VDMS)](https://github.com/IntelLabs/vdms) as a vector store - **Dependencies:** `vdms` library which requires protobuf = "4.24.2". There is a conflict with dashvector in `langchain` package but conflict is resolved in `community`. - **Contribution maintainer:** [@cwlacewe](https://github.com/cwlacewe) - **Added tests:** libs/community/tests/integration_tests/vectorstores/test_vdms.py - **Added docs:** docs/docs/integrations/vectorstores/vdms.ipynb - **Added cookbook:** cookbook/multi_modal_RAG_vdms.ipynb --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>pull/19696/head
parent
b7b62e29fb
commit
a31f692f4e
File diff suppressed because one or more lines are too long
@ -0,0 +1,62 @@
|
|||||||
|
# VDMS
|
||||||
|
|
||||||
|
> [VDMS](https://github.com/IntelLabs/vdms/blob/master/README.md) is a storage solution for efficient access
|
||||||
|
> of big-"visual"-data that aims to achieve cloud scale by searching for relevant visual data via visual metadata
|
||||||
|
> stored as a graph and enabling machine friendly enhancements to visual data for faster access.
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
### Install Client
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install vdms
|
||||||
|
```
|
||||||
|
|
||||||
|
### Install Database
|
||||||
|
|
||||||
|
There are two ways to get started with VDMS:
|
||||||
|
|
||||||
|
#### Install VDMS on your local machine via docker
|
||||||
|
```bash
|
||||||
|
docker run -d -p 55555:55555 intellabs/vdms:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Install VDMS directly on your local machine
|
||||||
|
Please see [installation instructions](https://github.com/IntelLabs/vdms/blob/master/INSTALL.md).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## VectorStore
|
||||||
|
|
||||||
|
The vector store is a simple wrapper around VDMS. It provides a simple interface to store and retrieve data.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain_community.document_loaders import TextLoader
|
||||||
|
from langchain.text_splitter import CharacterTextSplitter
|
||||||
|
|
||||||
|
loader = TextLoader("./state_of_the_union.txt")
|
||||||
|
documents = loader.load()
|
||||||
|
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
|
||||||
|
docs = text_splitter.split_documents(documents)
|
||||||
|
|
||||||
|
from langchain_community.vectorstores import VDMS
|
||||||
|
from langchain_community.vectorstores.vdms import VDMS_Client
|
||||||
|
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
|
||||||
|
|
||||||
|
client = VDMS_Client("localhost", 55555)
|
||||||
|
vectorstore = VDMS.from_documents(
|
||||||
|
docs,
|
||||||
|
client=client,
|
||||||
|
collection_name="langchain-demo",
|
||||||
|
embedding=HuggingFaceEmbeddings(),
|
||||||
|
engine="FaissFlat",
|
||||||
|
distance_strategy="L2",
|
||||||
|
)
|
||||||
|
|
||||||
|
query = "What did the president say about Ketanji Brown Jackson"
|
||||||
|
results = vectorstore.similarity_search(query)
|
||||||
|
```
|
||||||
|
|
||||||
|
For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms).
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,365 @@
|
|||||||
|
"""Test VDMS functionality."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
from langchain_community.vectorstores import VDMS
|
||||||
|
from langchain_community.vectorstores.vdms import VDMS_Client, embedding2bytes
|
||||||
|
from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||||
|
ConsistentFakeEmbeddings,
|
||||||
|
FakeEmbeddings,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import vdms
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
# The connection string matches the default settings in the docker-compose file
|
||||||
|
# located in the root of the repository: [root]/docker/docker-compose.yml
|
||||||
|
# To spin up a detached VDMS server:
|
||||||
|
# cd [root]/docker
|
||||||
|
# docker compose up -d vdms
|
||||||
|
@pytest.fixture
def vdms_client() -> vdms.vdms:
    """Build the VDMS client shared by the integration tests.

    The host is taken from ``VDMS_DBHOST`` (default ``localhost``) and the
    port from ``VDMS_DBPORT`` (default ``6025``).
    """
    db_host = os.getenv("VDMS_DBHOST", "localhost")
    db_port = int(os.getenv("VDMS_DBPORT", 6025))
    return VDMS_Client(host=db_host, port=db_port)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_init_from_client(vdms_client: vdms.vdms) -> None:
    """Check that a VDMS store can be constructed from an existing client."""
    fake_embeddings = FakeEmbeddings()
    _ = VDMS(client=vdms_client, embedding_function=fake_embeddings)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas(vdms_client: vdms.vdms) -> None:
    """Build a store from texts plus metadata and run a similarity search."""
    corpus = ["foo", "bar", "baz"]
    doc_ids = [
        f"test_from_texts_with_metadatas_{idx}" for idx, _ in enumerate(corpus)
    ]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        metadatas=page_metadata,
        collection_name="test_from_texts_with_metadatas",
        client=vdms_client,
    )

    # The best hit for "foo" must be the first text, carrying its page and id.
    expected = [
        Document(page_content="foo", metadata={"page": "1", "id": doc_ids[0]})
    ]
    assert store.similarity_search("foo", k=1) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas_with_scores(vdms_client: vdms.vdms) -> None:
    """Build a store from texts plus metadata and run a scored search."""
    corpus = ["foo", "bar", "baz"]
    doc_ids = [
        f"test_from_texts_with_metadatas_with_scores_{idx}"
        for idx, _ in enumerate(corpus)
    ]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        metadatas=page_metadata,
        collection_name="test_from_texts_with_metadatas_with_scores",
        client=vdms_client,
    )

    # The query text is in the corpus, so the top hit is expected at score 0.0.
    best_match = Document(
        page_content="foo", metadata={"page": "1", "id": doc_ids[0]}
    )
    assert store.similarity_search_with_score("foo", k=1) == [(best_match, 0.0)]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas_with_scores_using_vector(
    vdms_client: vdms.vdms,
) -> None:
    """Test end to end construction and relevance-scored search.

    The store is built from three texts with ``page`` metadata and queried via
    ``_similarity_search_with_relevance_scores``; the single best hit for
    ``"foo"`` is expected to be the first text with a score of ``0.0``.

    NOTE(review): despite "using_vector" in the name, the query is passed as
    text rather than as an embedding vector -- confirm this is intended.
    """
    collection_name = "test_from_texts_with_metadatas_with_scores_using_vector"
    embedding_function = FakeEmbeddings()
    texts = ["foo", "bar", "baz"]
    # Prefix the ids with this test's own name (as the sibling tests do) so
    # they cannot clash with the ids written by test_from_texts_with_metadatas.
    ids = [f"{collection_name}_{i}" for i, _ in enumerate(texts)]
    metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)]
    docsearch = VDMS.from_texts(
        texts=texts,
        ids=ids,
        embedding=embedding_function,
        metadatas=metadatas,
        collection_name=collection_name,
        client=vdms_client,
    )
    output = docsearch._similarity_search_with_relevance_scores("foo", k=1)
    assert output == [
        (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_search_filter(vdms_client: vdms.vdms) -> None:
    """Build a store and search it with a metadata filter on ``first_letter``."""
    corpus = ["far", "bar", "baz"]
    doc_ids = [f"test_search_filter_{idx}" for idx, _ in enumerate(corpus)]
    # Each text is tagged with its own first character.
    letter_metadata = [{"first_letter": word[0]} for word in corpus]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        metadatas=letter_metadata,
        collection_name="test_search_filter",
        client=vdms_client,
    )

    def expected_doc(position: int) -> Document:
        word = corpus[position]
        return Document(
            page_content=word,
            metadata={"first_letter": word[0], "id": doc_ids[position]},
        )

    only_f = store.similarity_search(
        "far", k=1, filter={"first_letter": ["==", "f"]}
    )
    assert only_f == [expected_doc(0)]

    only_b = store.similarity_search(
        "far", k=2, filter={"first_letter": ["==", "b"]}
    )
    assert only_b == [expected_doc(1), expected_doc(2)]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_search_filter_with_scores(vdms_client: vdms.vdms) -> None:
    """Build a store and run scored searches filtered on ``first_letter``."""
    corpus = ["far", "bar", "baz"]
    doc_ids = [
        f"test_search_filter_with_scores_{idx}" for idx, _ in enumerate(corpus)
    ]
    # Each text is tagged with its own first character.
    letter_metadata = [{"first_letter": word[0]} for word in corpus]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        metadatas=letter_metadata,
        collection_name="test_search_filter_with_scores",
        client=vdms_client,
    )

    def expected_doc(position: int) -> Document:
        word = corpus[position]
        return Document(
            page_content=word,
            metadata={"first_letter": word[0], "id": doc_ids[position]},
        )

    only_f = store.similarity_search_with_score(
        "far", k=1, filter={"first_letter": ["==", "f"]}
    )
    assert only_f == [(expected_doc(0), 0.0)]

    only_b = store.similarity_search_with_score(
        "far", k=2, filter={"first_letter": ["==", "b"]}
    )
    assert only_b == [(expected_doc(1), 1.0), (expected_doc(2), 4.0)]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_mmr(vdms_client: vdms.vdms) -> None:
    """Build a store and run a max-marginal-relevance search by query text."""
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_mmr_{idx}" for idx, _ in enumerate(corpus)]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        collection_name="test_mmr",
        client=vdms_client,
    )

    # No metadata was supplied, so only the id is expected in the result.
    expected = [Document(page_content="foo", metadata={"id": doc_ids[0]})]
    assert store.max_marginal_relevance_search("foo", k=1) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_mmr_by_vector(vdms_client: vdms.vdms) -> None:
    """Build a store and run a max-marginal-relevance search by query vector."""
    embedder = FakeEmbeddings()
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_mmr_by_vector_{idx}" for idx, _ in enumerate(corpus)]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=embedder,
        collection_name="test_mmr_by_vector",
        client=vdms_client,
    )

    # Embed the query ourselves and search with the resulting vector.
    query_vector = embedder.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(query_vector, k=1)

    assert hits == [Document(page_content="foo", metadata={"id": doc_ids[0]})]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_with_include_parameter(vdms_client: vdms.vdms) -> None:
    """Check how the ``include`` argument of ``get`` affects its result.

    The second element returned by ``get`` must be non-empty when
    ``include=["embeddings"]`` is passed and empty when ``include`` is omitted.
    """
    collection_name = "test_with_include_parameter"
    store = VDMS.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        collection_name=collection_name,
        client=vdms_client,
    )

    _, with_embeddings = store.get(collection_name, include=["embeddings"])
    assert with_embeddings != []

    _, without_include = store.get(collection_name)
    assert without_include == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_update_document(vdms_client: vdms.vdms) -> None:
    """Exercise ``VDMS.update_document`` and verify content and embedding change."""
    collection_name = "test_update_document"
    doc_id = "doc1"
    updated_text = "updated foo"

    # Consistent embeddings let the stored bytes be compared with a recomputed value.
    embedder = ConsistentFakeEmbeddings()

    # Seed the collection with the original document.
    store = VDMS.from_documents(
        client=vdms_client,
        collection_name=collection_name,
        documents=[Document(page_content="foo", metadata={"page": "1"})],
        embedding=embedder,
        ids=[doc_id],
    )

    def fetch_embeddings():
        """Look the document up by id; fresh argument objects on every call."""
        _, embeddings = store.get(
            collection_name,
            constraints={"id": ["==", doc_id]},
            include=["metadata", "embeddings"],
        )
        return embeddings

    embedding_before = fetch_embeddings()

    # Replace the content under the same id, keeping the metadata.
    store.update_document(
        collection_name,
        document_id=doc_id,
        document=Document(page_content=updated_text, metadata={"page": "1"}),
    )

    # Searching for the new content must return the updated document.
    hits = store.similarity_search(updated_text, k=1)
    assert hits == [
        Document(page_content=updated_text, metadata={"page": "1", "id": doc_id})
    ]

    # The stored embedding must match the new content and differ from the old one.
    embedding_after = fetch_embeddings()
    expected_bytes = embedding2bytes(embedder.embed_documents([updated_text])[0])
    assert embedding_after[0] == expected_bytes
    assert embedding_after != embedding_before
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_with_relevance_score(vdms_client: vdms.vdms) -> None:
    """Check that relevance scores come back scaled to the 0-1 range."""
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_relevance_scores_{idx}" for idx, _ in enumerate(corpus)]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]

    store = VDMS.from_texts(
        texts=corpus,
        ids=doc_ids,
        embedding=FakeEmbeddings(),
        metadatas=page_metadata,
        collection_name="test_with_relevance_score",
        client=vdms_client,
    )

    # Expected score for each text, in corpus order.
    expected_scores = [0.0, 0.25, 1.0]
    expected = [
        (Document(page_content=word, metadata={"page": meta["page"], "id": uid}), score)
        for word, meta, uid, score in zip(
            corpus, page_metadata, doc_ids, expected_scores
        )
    ]
    assert store.similarity_search_with_relevance_scores("foo", k=3) == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_add_documents_no_metadata(vdms_client: vdms.vdms) -> None:
    """Check that a document without metadata can be added to a store."""
    store = VDMS(
        collection_name="test_add_documents_no_metadata",
        embedding_function=FakeEmbeddings(),
        client=vdms_client,
    )
    bare_document = Document(page_content="foo")
    store.add_documents([bare_document])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("vdms")
def test_add_documents_mixed_metadata(vdms_client: vdms.vdms) -> None:
    """Add documents with and without metadata and check both are searchable."""
    store = VDMS(
        collection_name="test_add_documents_mixed_metadata",
        embedding_function=FakeEmbeddings(),
        client=vdms_client,
    )

    plain_doc = Document(page_content="foo")
    tagged_doc = Document(page_content="bar", metadata={"baz": 1})
    documents = [plain_doc, tagged_doc]
    wanted_ids = ["10", "11"]

    # The ids passed in must be returned unchanged.
    assert store.add_documents(documents, ids=wanted_ids) == wanted_ids

    found = store.similarity_search("foo bar", k=2)

    # The expected results carry the id in their metadata, so add it locally.
    plain_doc.metadata = {"id": wanted_ids[0]}
    tagged_doc.metadata["id"] = wanted_ids[1]

    def by_content(doc: Document) -> str:
        return doc.page_content

    # Compare independent of the order in which the search returned the hits.
    assert sorted(found, key=by_content) == sorted(documents, key=by_content)
|
Loading…
Reference in New Issue