diff --git a/docs/docs/integrations/providers/vlite.mdx b/docs/docs/integrations/providers/vlite.mdx new file mode 100644 index 0000000000..6599dec720 --- /dev/null +++ b/docs/docs/integrations/providers/vlite.mdx @@ -0,0 +1,31 @@ +# vlite + +This page covers how to use [vlite](https://github.com/sdan/vlite) within LangChain. vlite is a simple and fast vector database for storing and retrieving embeddings. + +## Installation and Setup + +To install vlite, run the following command: + +```bash +pip install vlite +``` + +For PDF OCR support, install the `vlite[ocr]` extra: + +```bash +pip install vlite[ocr] +``` + +## VectorStore + +vlite provides a wrapper around its vector database, allowing you to use it as a vectorstore for semantic search and example selection. + +To import the vlite vectorstore: + +```python +from langchain_community.vectorstores import VLite +``` + +### Usage + +For a more detailed walkthrough of the vlite wrapper, see [this notebook](/docs/integrations/vectorstores/vlite). \ No newline at end of file diff --git a/docs/docs/integrations/vectorstores/vlite.ipynb b/docs/docs/integrations/vectorstores/vlite.ipynb new file mode 100644 index 0000000000..46a2f46a44 --- /dev/null +++ b/docs/docs/integrations/vectorstores/vlite.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# vlite\n", + "\n", + "VLite is a simple and blazing fast vector database that allows you to store and retrieve data semantically using embeddings. 
Made with numpy, vlite is a lightweight batteries-included database to implement RAG, similarity search, and embeddings into your projects.\n", + "\n", + "## Installation\n", + "\n", + "To use VLite in LangChain, you need to install the `vlite` package:\n", + "\n", + "```bash\n", + "!pip install vlite\n", + "```\n", + "\n", + "## Importing VLite\n", + "\n", + "```python\n", + "from langchain_community.vectorstores import VLite\n", + "```\n", + "\n", + "## Basic Example\n", + "\n", + "In this basic example, we load a text document and store it in the VLite vector database. Then, we perform a similarity search to retrieve relevant documents based on a query.\n", + "\n", + "VLite handles chunking and embedding of the text for you, and you can change these parameters by pre-chunking the text and/or embedding those chunks before adding them to the VLite database.\n", + "\n", + "```python\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "\n", + "# Load the document and split it into chunks\n", + "loader = TextLoader(\"path/to/document.txt\")\n", + "documents = loader.load()\n", + "\n", + "# Create a VLite instance\n", + "vlite = VLite(collection=\"my_collection\")\n", + "\n", + "# Add documents to the VLite vector database\n", + "vlite.add_documents(documents)\n", + "\n", + "# Perform a similarity search\n", + "query = \"What is the main topic of the document?\"\n", + "docs = vlite.similarity_search(query)\n", + "\n", + "# Print the most relevant document\n", + "print(docs[0].page_content)\n", + "```\n", + "\n", + "## Adding Texts and Documents\n", + "\n", + "You can add texts or documents to the VLite vector database using the `add_texts` and `add_documents` methods, respectively.\n", + "\n", + "```python\n", + "# Add texts to the VLite vector database\n", + "texts = [\"This is the first text.\", \"This is the second text.\"]\n", + "vlite.add_texts(texts)\n", + "\n", + "# Add documents to the VLite vector 
database\n", + "documents = [Document(page_content=\"This is a document.\", metadata={\"source\": \"example.txt\"})]\n", + "vlite.add_documents(documents)\n", + "```\n", + "\n", + "## Similarity Search\n", + "\n", + "VLite provides methods for performing similarity search on the stored documents.\n", + "\n", + "```python\n", + "# Perform a similarity search\n", + "query = \"What is the main topic of the document?\"\n", + "docs = vlite.similarity_search(query, k=3)\n", + "\n", + "# Perform a similarity search with scores\n", + "docs_with_scores = vlite.similarity_search_with_score(query, k=3)\n", + "```\n", + "\n", + "## Max Marginal Relevance Search\n", + "\n", + "VLite also supports Max Marginal Relevance (MMR) search, which optimizes for both similarity to the query and diversity among the retrieved documents.\n", + "\n", + "```python\n", + "# Perform an MMR search\n", + "docs = vlite.max_marginal_relevance_search(query, k=3)\n", + "```\n", + "\n", + "## Updating and Deleting Documents\n", + "\n", + "You can update or delete documents in the VLite vector database using the `update_document` and `delete` methods.\n", + "\n", + "```python\n", + "# Update a document\n", + "document_id = \"doc_id_1\"\n", + "updated_document = Document(page_content=\"Updated content\", metadata={\"source\": \"updated.txt\"})\n", + "vlite.update_document(document_id, updated_document)\n", + "\n", + "# Delete documents\n", + "document_ids = [\"doc_id_1\", \"doc_id_2\"]\n", + "vlite.delete(document_ids)\n", + "```\n", + "\n", + "## Retrieving Documents\n", + "\n", + "You can retrieve documents from the VLite vector database based on their IDs or metadata using the `get` method.\n", + "\n", + "```python\n", + "# Retrieve documents by IDs\n", + "document_ids = [\"doc_id_1\", \"doc_id_2\"]\n", + "docs = vlite.get(ids=document_ids)\n", + "\n", + "# Retrieve documents by metadata\n", + "metadata_filter = {\"source\": \"example.txt\"}\n", + "docs = vlite.get(where=metadata_filter)\n", + 
"```\n", + "\n", + "## Creating VLite Instances\n", + "\n", + "You can create VLite instances using various methods:\n", + "\n", + "```python\n", + "# Create a VLite instance from texts\n", + "vlite = VLite.from_texts(texts)\n", + "\n", + "# Create a VLite instance from documents\n", + "vlite = VLite.from_documents(documents)\n", + "\n", + "# Create a VLite instance from an existing index\n", + "vlite = VLite.from_existing_index(collection=\"existing_collection\")\n", + "```\n", + "\n", + "## Additional Features\n", + "\n", + "VLite provides additional features for managing the vector database:\n", + "\n", + "```python\n", + "from langchain.vectorstores import VLite\n", + "vlite = VLite(collection=\"my_collection\")\n", + "\n", + "# Get the number of items in the collection\n", + "count = vlite.count()\n", + "\n", + "# Save the collection\n", + "vlite.save()\n", + "\n", + "# Clear the collection\n", + "vlite.clear()\n", + "\n", + "# Get collection information\n", + "vlite.info()\n", + "\n", + "# Dump the collection data\n", + "data = vlite.dump()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index 49982d6cd6..fe0c1b4600 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -265,6 +265,9 @@ if TYPE_CHECKING: from langchain_community.vectorstores.vespa import ( VespaStore, # noqa: 
class VLite(VectorStore):
    """LangChain vector store backed by a ``vlite`` collection.

    Texts are embedded with the supplied ``embedding_function`` and handed to
    the underlying ``vlite`` client together with their ids and metadata.
    """

    def __init__(
        self,
        embedding_function: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ):
        """Create the wrapper and open (or create) the vlite collection.

        Args:
            embedding_function: Embedding model used for documents and queries.
            collection: Collection name. A random ``vlite_<hex>`` name is
                generated when omitted.
            kwargs: Forwarded unchanged to the ``vlite.VLite`` constructor.

        Raises:
            ImportError: If the ``vlite`` package is not installed.
        """
        super().__init__()
        self.embedding_function = embedding_function
        self.collection = collection or f"vlite_{uuid4().hex}"
        try:
            # Aliased so the third-party client does not shadow this class.
            from vlite import VLite as _VLiteClient
        except ImportError as err:
            raise ImportError(
                "Could not import vlite python package. "
                "Please install it with `pip install vlite`."
            ) from err
        self.vlite = _VLiteClient(collection=self.collection, **kwargs)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embedding model this store was configured with."""
        return self.embedding_function

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters. ``ids`` may supply one id
                per text; random UUIDs are generated otherwise.

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If ``ids`` or ``metadatas`` do not match ``texts`` in
                length (previously the extra entries were silently dropped).
        """
        texts = list(texts)
        ids = kwargs.pop("ids", None)
        if ids is None:
            ids = [str(uuid4()) for _ in texts]
        if len(ids) != len(texts):
            raise ValueError(
                f"Got {len(ids)} ids for {len(texts)} texts; lengths must match."
            )
        if not metadatas:
            metadatas = [{} for _ in texts]
        elif len(metadatas) != len(texts):
            raise ValueError(
                f"Got {len(metadatas)} metadatas for {len(texts)} texts; "
                "lengths must match."
            )
        if not texts:
            # Nothing to embed or store.
            return []

        embeddings = self.embedding_function.embed_documents(texts)
        data_points = [
            {
                "text": text,
                "metadata": metadata,
                "id": text_id,
                "embedding": embedding,
            }
            for text, metadata, text_id, embedding in zip(
                texts, metadatas, ids, embeddings
            )
        ]
        results = self.vlite.add(data_points)
        # NOTE(review): assumes each result from vlite starts with the stored
        # id -- confirm against the vlite client.
        return [result[0] for result in results]

    def add_documents(
        self,
        documents: List[Document],
        **kwargs: Any,
    ) -> List[str]:
        """Add a list of documents to the vectorstore.

        Args:
            documents: List of documents to add to the vectorstore.
            kwargs: vectorstore specific parameters. ``ids`` supplies one id
                per document. ``file_path`` makes vlite's ``process_file``
                produce the texts instead of ``page_content``; each document
                then contributes every processed chunk under the id
                ``"<doc id>_<chunk index>"``.

        Returns:
            List of ids from adding the documents into the vectorstore.

        Raises:
            ImportError: If ``file_path`` is given and vlite is not installed.
            ValueError: If ``ids`` does not match ``documents`` in length.
        """
        ids = kwargs.pop("ids", None)
        if ids is None:
            ids = [str(uuid4()) for _ in documents]
        if len(ids) != len(documents):
            raise ValueError(
                f"Got {len(ids)} ids for {len(documents)} documents; "
                "lengths must match."
            )

        texts: List[str] = []
        metadatas: List[dict] = []
        # Built separately: the original extended ``ids`` while iterating it,
        # leaving ids misaligned with texts and mutating the caller's list.
        text_ids: List[str] = []

        file_path = kwargs.get("file_path")
        if file_path is not None and documents:
            try:
                from vlite.utils import process_file
            except ImportError as err:
                raise ImportError(
                    "Could not import vlite python package. "
                    "Please install it with `pip install vlite`."
                ) from err
            # The path is the same for every document, so process it once.
            # NOTE(review): assumes process_file returns a sequence of text
            # chunks -- confirm against vlite.utils.
            processed_data = process_file(file_path)
            for doc, doc_id in zip(documents, ids):
                texts.extend(processed_data)
                metadatas.extend([doc.metadata] * len(processed_data))
                text_ids.extend(
                    f"{doc_id}_{i}" for i in range(len(processed_data))
                )
        else:
            for doc, doc_id in zip(documents, ids):
                texts.append(doc.page_content)
                metadatas.append(doc.metadata)
                text_ids.append(doc_id)
        return self.add_texts(texts, metadatas, ids=text_ids)

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            kwargs: Forwarded to ``similarity_search_with_score`` (for example
                ``filter``); previously these were dropped.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k=k, **kwargs)
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Filter by metadata. Defaults to None.

        Returns:
            List of Tuples of (doc, score), where score is the similarity score.
        """
        metadata_filter = filter or {}
        embedding = self.embedding_function.embed_query(query)
        results = self.vlite.retrieve(
            text=query,
            top_k=k,
            metadata=metadata_filter,
            return_scores=True,
            embedding=embedding,
        )
        # NOTE(review): assumes retrieve yields (text, score, metadata)
        # triples -- confirm against the vlite client.
        return [
            (Document(page_content=text, metadata=doc_metadata), score)
            for text, score, doc_metadata in results
        ]

    def update_document(self, document_id: str, document: Document) -> None:
        """Update an existing document in the vectorstore.

        Args:
            document_id: Id of the stored entry to overwrite.
            document: Document providing the new text and metadata.
        """
        self.vlite.update(
            document_id, text=document.page_content, metadata=document.metadata
        )

    def get(self, ids: List[str]) -> List[Document]:
        """Get documents by their IDs.

        Args:
            ids: Ids of the entries to fetch.

        Returns:
            One Document per result returned by the vlite client.
        """
        results = self.vlite.get(ids)
        # NOTE(review): assumes get yields (text, metadata) pairs -- confirm.
        return [
            Document(page_content=text, metadata=doc_metadata)
            for text, doc_metadata in results
        ]

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete by ids.

        Args:
            ids: Ids to delete. Nothing is deleted when None.
            kwargs: Forwarded to the vlite client's ``delete``.

        Returns:
            True when a delete was issued, None when ``ids`` is None.
        """
        if ids is not None:
            self.vlite.delete(ids, **kwargs)
            return True
        return None

    @classmethod
    def from_existing_index(
        cls,
        embedding: Embeddings,
        collection: str,
        **kwargs: Any,
    ) -> VLite:
        """Load an existing VLite index.

        Args:
            embedding: Embedding function
            collection: Name of the collection to load.
            kwargs: Forwarded to the constructor.

        Returns:
            VLite vector store.
        """
        return cls(embedding_function=embedding, collection=collection, **kwargs)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to embed and store.
            embedding: Embedding function.
            metadatas: Optional list of metadatas associated with the texts.
            collection: Optional collection name.
            kwargs: ``ids`` is passed to ``add_texts``; everything else is
                forwarded to the constructor.

        Example:
            .. code-block:: python

                from langchain_community.embeddings import OpenAIEmbeddings
                from langchain_community.vectorstores import VLite

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_texts(texts, embeddings)
        """
        # ``ids`` is an insertion option, not a client option; the original
        # also forwarded it to the vlite client constructor.
        ids = kwargs.pop("ids", None)
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_texts(texts, metadatas, ids=ids)
        return vlite

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from a list of documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Args:
            documents: Documents to embed and store.
            embedding: Embedding function.
            collection: Optional collection name.
            kwargs: ``ids`` and ``file_path`` are passed to ``add_documents``;
                everything else is forwarded to the constructor.

        Example:
            .. code-block:: python

                from langchain_community.embeddings import OpenAIEmbeddings
                from langchain_community.vectorstores import VLite

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_documents(documents, embeddings)
        """
        # Split insertion options from client-constructor options; the
        # original forwarded all of them to both.
        add_kwargs = {
            key: kwargs.pop(key) for key in ("ids", "file_path") if key in kwargs
        }
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_documents(documents, **add_kwargs)
        return vlite
metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = VLite.from_texts( + texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + + +def test_vlite_with_metadatas_with_scores() -> None: + """Test end to end construction and search with metadata and scores.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = VLite.from_texts( + texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas + ) + output = docsearch.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +def test_vlite_update_document() -> None: + """Test updating a document.""" + texts = ["foo", "bar", "baz"] + docsearch = VLite.from_texts( + texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"] + ) + docsearch.update_document("1", Document(page_content="updated_foo")) + output = docsearch.similarity_search("updated_foo", k=1) + assert output == [Document(page_content="updated_foo")] + + +def test_vlite_delete_document() -> None: + """Test deleting a document.""" + texts = ["foo", "bar", "baz"] + docsearch = VLite.from_texts( + texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"] + ) + docsearch.delete(["1"]) + output = docsearch.similarity_search("foo", k=3) + assert Document(page_content="foo") not in output + + +def test_vlite_get_documents() -> None: + """Test getting documents by IDs.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = VLite.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ids=["1", "2", "3"], + ) + output = docsearch.get(ids=["1", "3"]) + assert output == [ + Document(page_content="foo", metadata={"page": "0"}), + Document(page_content="baz", metadata={"page": "2"}), + ] + + +def 
test_vlite_from_existing_index() -> None: + """Test loading from an existing index.""" + texts = ["foo", "bar", "baz"] + VLite.from_texts( + texts=texts, embedding=FakeEmbeddings(), collection="test_collection" + ) + new_docsearch = VLite.from_existing_index( + collection="test_collection", embedding=FakeEmbeddings() + ) + output = new_docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 6042db9f98..97a26daa1d 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -89,6 +89,7 @@ EXPECTED_ALL = [ "Vectara", "VectorStore", "VespaStore", + "VLite", "Weaviate", "Yellowbrick", "ZepVectorStore", diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index b1736d1675..b5b9c4b78e 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -88,6 +88,7 @@ def test_compatible_vectorstore_documentation() -> None: "VDMS", "Vearch", "VespaStore", + "VLite", "Weaviate", "ZepVectorStore", "Zilliz", diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index 4741651413..96f62992de 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -82,6 +82,7 @@ _EXPECTED = [ "Vearch", "Vectara", "VespaStore", + "VLite", "Weaviate", "ZepVectorStore", "Zilliz",