diff --git a/docs/integrations/awadb.md b/docs/integrations/awadb.md new file mode 100644 index 00000000..fc940bcd --- /dev/null +++ b/docs/integrations/awadb.md @@ -0,0 +1,21 @@ +# AwaDB + +>[AwaDB](https://github.com/awa-ai/awadb) is an AI Native database for the search and storage of embedding vectors used by LLM Applications. + +## Installation and Setup + +```bash +pip install awadb +``` + + +## VectorStore + +There exists a wrapper around AwaDB vector databases, allowing you to use it as a vectorstore, +whether for semantic search or example selection. + +```python +from langchain.vectorstores import AwaDB +``` + +For a more detailed walkthrough of the AwaDB wrapper, see [this notebook](../modules/indexes/vectorstores/examples/awadb.ipynb) diff --git a/docs/modules/indexes/vectorstores/examples/awadb.ipynb b/docs/modules/indexes/vectorstores/examples/awadb.ipynb new file mode 100644 index 00000000..be1b40ee --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/awadb.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "833c4789", + "metadata": {}, + "source": [ + "# AwaDB\n", + "[AwaDB](https://github.com/awa-ai/awadb) is an AI Native database for the search and storage of embedding vectors used by LLM Applications.\n", + "This notebook shows how to use functionality related to the AwaDB." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "252930ea", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install awadb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2b71a47", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import AwaDB\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49be0bac", + "metadata": {}, + "outputs": [], + "source": [ + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size= 100, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18714278", + "metadata": {}, + "outputs": [], + "source": [ + "db = AwaDB.from_documents(docs)\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62b7a4c5", + "metadata": {}, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "a9b4be48", + "metadata": {}, + "source": [ + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence." + ] + }, + { + "cell_type": "markdown", + "id": "87fec6b5", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "markdown", + "id": "17231924", + "metadata": {}, + "source": [ + "The returned distance score is between 0-1. 
0 is dissimilar, 1 is the most similar."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f40ddae1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = db.similarity_search_with_score(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0045583",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(docs[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8c2da99d",
+   "metadata": {},
+   "source": [
+    "(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b49fb59",
+   "metadata": {},
+   "source": [
+    "## Restore a previously created table and its data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bfa6e25",
+   "metadata": {},
+   "source": [
+    "AwaDB automatically persists added document data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2a0f3b35",
+   "metadata": {},
+   "source": [
+    "If you want to restore a table that you previously created and added data to, you can do so as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fd4b5b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import awadb\n",
+    "\n",
+    "awadb_client = awadb.Client()\n",
+    "ret = awadb_client.Load('langchain_awadb')\n",
+    "if ret:\n",
+    "    print('awadb load table success')\n",
+    "else:\n",
+    "    print('awadb load table failed')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ae9a9dd",
+   "metadata": {},
+   "source": [
+    "awadb load table success"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py
index c2c37933..3a932c23 100644
--- a/langchain/vectorstores/__init__.py
+++ b/langchain/vectorstores/__init__.py
@@ -2,6 +2,7 @@
 from langchain.vectorstores.analyticdb import AnalyticDB
 from langchain.vectorstores.annoy import Annoy
 from langchain.vectorstores.atlas import AtlasDB
+from langchain.vectorstores.awadb import AwaDB
 from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.chroma import Chroma
 from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings
@@ -60,4 +61,5 @@ __all__ = [
     "ClickhouseSettings",
     "Tigris",
     "MatchingEngine",
+    "AwaDB",
 ]
diff --git a/langchain/vectorstores/awadb.py b/langchain/vectorstores/awadb.py
new file mode 100644
index 00000000..9c7d8a38
--- /dev/null
+++ b/langchain/vectorstores/awadb.py
@@ -0,0 +1,284 @@
+"""Wrapper around AwaDB for embedding vectors."""
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
+
+from langchain.docstore.document import Document
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VectorStore
+
+if TYPE_CHECKING:
+    import awadb
+
+logger = logging.getLogger()
+
+DEFAULT_TOPN = 4
+
+
+class AwaDB(VectorStore):
+    """Interface implemented by AwaDB vector stores."""
+
+    _DEFAULT_TABLE_NAME = "langchain_awadb"
+
+    def __init__(
+        self,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        embedding_model: Optional[Embeddings] = None,
+        log_and_data_dir: Optional[str] = None,
+        client: Optional[awadb.Client] = None,
+    ) -> None:
+        """Initialize with AwaDB client."""
+        try:
+            import awadb
+        except ImportError:
+            raise ValueError(
+                "Could not import awadb python package. "
+                "Please install it with `pip install awadb`."
+            )
+
+        if client is not None:
+            self.awadb_client = client
+        elif log_and_data_dir is not None:
+            self.awadb_client = awadb.Client(log_and_data_dir)
+        else:
+            self.awadb_client = awadb.Client()
+
+        self.awadb_client.Create(table_name)
+        # May be None, in which case AwaDB embeds the raw text itself.
+        self.embedding_model = embedding_model
+        self.added_doc_count = 0
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+            kwargs: vectorstore specific parameters
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is not initialized.")
+
+        embeddings = None
+        if self.embedding_model is not None:
+            embeddings = self.embedding_model.embed_documents(list(texts))
+        added_results: List[str] = []
+        doc_no = 0
+        for text in texts:
+            doc: List[Any] = []
+            if embeddings is not None:
+                doc.append(text)
+                doc.append(embeddings[doc_no])
+            else:
+                doc.append({"embedding_text": text})
+
+            if metadatas is not None and doc_no < len(metadatas):
+                doc.append(metadatas[doc_no])
+            self.awadb_client.Add(doc)
+            added_results.append(str(self.added_doc_count))
+
+            doc_no += 1
+            self.added_doc_count += 1
+
+        return added_results
+
+    def load_local(
+        self,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        **kwargs: Any,
+    ) -> bool:
+        """Load a previously persisted table. Returns True on success."""
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is not initialized.")
+
+        return self.awadb_client.Load(table_name)
+
+    def similarity_search(
+        self,
+        query: str,
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to query."""
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is not initialized.")
+
+        embedding = None
+        if self.embedding_model is not None:
+            embedding = self.embedding_model.embed_query(query)
+
+        return self.similarity_search_by_vector(embedding, k)
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is not initialized.")
+
+        embedding = None
+        if self.embedding_model is not None:
+            embedding = self.embedding_model.embed_query(query)
+
+        show_results = self.awadb_client.Search(embedding, k)
+
+        results: List[Tuple[Document, float]] = []
+
+        if len(show_results) == 0:
+            return results
+
+        scores: List[float] = []
+        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
+
+        # Normalize the raw L2 distances: divide each distance by the L2 norm
+        # of all returned distances and subtract from 1, so that 1 means
+        # "most similar" and the scores fall between 0 and 1.
+        l2_norm = 0.0
+        for score in scores:
+            l2_norm += score * score
+        l2_norm = pow(l2_norm, 0.5)
+
+        for doc_no, doc in enumerate(retrieval_docs):
+            results.append((doc, 1 - scores[doc_no] / l2_norm))
+
+        return results
+
+    def similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        # The scores returned by similarity_search_with_score are already
+        # normalized relevance scores, so the two methods share one
+        # implementation.
+        return self.similarity_search_with_score(query, k, **kwargs)
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = DEFAULT_TOPN,
+        scores: Optional[list] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            scores: Optional list that, if supplied, is filled with the raw
+                L2 distance of each returned document.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is not initialized.")
+
+        show_results = self.awadb_client.Search(embedding, k)
+
+        results: List[Document] = []
+
+        if len(show_results) == 0:
+            return results
+
+        for item_detail in show_results[0]["ResultItems"]:
+            content = ""
+            meta_data = {}
+            for item_key in item_detail:
+                if item_key == "Field@0":  # text for the document
+                    content = item_detail[item_key]
+                elif item_key == "Field@1":  # embedding field for the document
+                    continue
+                elif item_key == "score":  # L2 distance
+                    if scores is not None:
+                        scores.append(item_detail[item_key])
+                else:
+                    meta_data[item_key] = item_detail[item_key]
+            results.append(Document(page_content=content, metadata=meta_data))
+        return results
+
+    @classmethod
+    def from_texts(
+        cls: Type[AwaDB],
+        texts: List[str],
+        embedding: Optional[Embeddings] = None,
+        metadatas: Optional[List[dict]] = None,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        logging_and_data_dir: Optional[str] = None,
+        client: Optional[awadb.Client] = None,
+        **kwargs: Any,
+    ) -> AwaDB:
+        """Create an AwaDB vectorstore from raw documents.
+
+        Args:
+            texts (List[str]): List of texts to add to the table.
+            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
+            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
+            table_name (str): Name of the table to create.
+ logging_and_data_dir (Optional[str]): Directory of logging and persistence. + client (Optional[awadb.Client]): AwaDB client + + Returns: + AwaDB: AwaDB vectorstore. + """ + awadb_client = cls( + table_name=table_name, + embedding_model=embedding, + log_and_data_dir=logging_and_data_dir, + client=client, + ) + awadb_client.add_texts(texts=texts, metadatas=metadatas) + return awadb_client diff --git a/poetry.lock b/poetry.lock index 96d1c500..03242e43 100644 --- a/poetry.lock +++ b/poetry.lock @@ -570,6 +570,26 @@ dev = ["coverage (>=5,<6)", "flake8 (>=3,<4)", "pytest (>=6,<7)", "sphinx-copybu docs = ["sphinx-copybutton (>=0.4,<0.5)", "sphinx-rtd-theme (>=1.0,<2.0)", "sphinx-tabs (>=3,<4)", "sphinxcontrib-mermaid (>=0.7,<0.8)"] test = ["coverage (>=5,<6)", "pytest (>=6,<7)"] +[[package]] +name = "awadb" +version = "0.3.2" +description = "The AI Native database for embedding vectors" +category = "main" +optional = true +python-versions = ">=3.6" +files = [ + {file = "awadb-0.3.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:f3ce3b066198782fa413f452c56001c58ebec71a1e1dca0eee68f73321ba15a9"}, + {file = "awadb-0.3.2-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:c96b5e263c32b2563b1fa027035bdcf50540808ad303071cc1aed3471c3c39b7"}, + {file = "awadb-0.3.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:3e43b5a74753261857d0b146543a4620e00938833181259f138f07457fa84812"}, + {file = "awadb-0.3.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6330b4d18a814c1562113b3b7897db629c2ac9b5818236ead0fc5f3445b6b7fb"}, + {file = "awadb-0.3.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:82b4e61cc905339868a9f833d0988098f56411b42e0f8dd571aec7c8d6a3f1fa"}, + {file = "awadb-0.3.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5efaa93d69c467f16ec4f65ed250ec26015781826c0d059c8a54613a5d3e2c3e"}, + {file = "awadb-0.3.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7be0811550d72f49018e4790d290cf521f92ffa84d65ef1073e621f225d142ec"}, +] + +[package.extras] +test = ["pytest (>=6.0)"] + [[package]] name = "azure-ai-formrecognizer" version = "3.2.1" @@ -6030,14 +6050,51 @@ optional = true python-versions = ">=3.7" files = [ {file = "orjson-3.9.1-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c4434b7b786fdc394b95d029fb99949d7c2b05bbd4bf5cb5e3906be96ffeee3b"}, + {file = "orjson-3.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09faf14f74ed47e773fa56833be118e04aa534956f661eb491522970b7478e3b"}, + {file = "orjson-3.9.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:503eb86a8d53a187fe66aa80c69295a3ca35475804da89a9547e4fce5f803822"}, + {file = "orjson-3.9.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:20f2804b5a1dbd3609c086041bd243519224d47716efd7429db6c03ed28b7cc3"}, + {file = "orjson-3.9.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fd828e0656615a711c4cc4da70f3cac142e66a6703ba876c20156a14e28e3fa"}, + {file = "orjson-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec53d648176f873203b9c700a0abacab33ca1ab595066e9d616f98cdc56f4434"}, + {file = "orjson-3.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e186ae76b0d97c505500664193ddf508c13c1e675d9b25f1f4414a7606100da6"}, + {file = "orjson-3.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d4edee78503016f4df30aeede0d999b3cb11fb56f47e9db0e487bce0aaca9285"}, + {file = "orjson-3.9.1-cp310-none-win_amd64.whl", hash = 
"sha256:a4cc5d21e68af982d9a2528ac61e604f092c60eed27aef3324969c68f182ec7e"}, {file = "orjson-3.9.1-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:761b6efd33c49de20dd73ce64cc59da62c0dab10aa6015f582680e0663cc792c"}, + {file = "orjson-3.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31229f9d0b8dc2ef7ee7e4393f2e4433a28e16582d4b25afbfccc9d68dc768f8"}, + {file = "orjson-3.9.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0b7ab18d55ecb1de543d452f0a5f8094b52282b916aa4097ac11a4c79f317b86"}, + {file = "orjson-3.9.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db774344c39041f4801c7dfe03483df9203cbd6c84e601a65908e5552228dd25"}, + {file = "orjson-3.9.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae47ef8c0fe89c4677db7e9e1fb2093ca6e66c3acbee5442d84d74e727edad5e"}, + {file = "orjson-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:103952c21575b9805803c98add2eaecd005580a1e746292ed2ec0d76dd3b9746"}, + {file = "orjson-3.9.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2cb0121e6f2c9da3eddf049b99b95fef0adf8480ea7cb544ce858706cdf916eb"}, + {file = "orjson-3.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:24d4ddaa2876e657c0fd32902b5c451fd2afc35159d66a58da7837357044b8c2"}, {file = "orjson-3.9.1-cp311-none-win_amd64.whl", hash = "sha256:0b53b5f72cf536dd8aa4fc4c95e7e09a7adb119f8ff8ee6cc60f735d7740ad6a"}, {file = "orjson-3.9.1-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d4b68d01a506242316a07f1d2f29fb0a8b36cee30a7c35076f1ef59dce0890c1"}, + {file = "orjson-3.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9dd4abe6c6fd352f00f4246d85228f6a9847d0cc14f4d54ee553718c225388f"}, + {file = "orjson-3.9.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e20bca5e13041e31ceba7a09bf142e6d63c8a7467f5a9c974f8c13377c75af2"}, + {file = "orjson-3.9.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d8ae0467d01eb1e4bcffef4486d964bfd1c2e608103e75f7074ed34be5df48cc"}, + {file = "orjson-3.9.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:06f6ab4697fab090517f295915318763a97a12ee8186054adf21c1e6f6abbd3d"}, + {file = "orjson-3.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8515867713301fa065c58ec4c9053ba1a22c35113ab4acad555317b8fd802e50"}, + {file = "orjson-3.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:393d0697d1dfa18d27d193e980c04fdfb672c87f7765b87952f550521e21b627"}, + {file = "orjson-3.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d96747662d3666f79119e5d28c124e7d356c7dc195cd4b09faea4031c9079dc9"}, {file = "orjson-3.9.1-cp37-none-win_amd64.whl", hash = "sha256:6d173d3921dd58a068c88ec22baea7dbc87a137411501618b1292a9d6252318e"}, {file = "orjson-3.9.1-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d1c2b0b4246c992ce2529fc610a446b945f1429445ece1c1f826a234c829a918"}, + {file = "orjson-3.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19f70ba1f441e1c4bb1a581f0baa092e8b3e3ce5b2aac2e1e090f0ac097966da"}, + {file = "orjson-3.9.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:375d65f002e686212aac42680aed044872c45ee4bc656cf63d4a215137a6124a"}, + {file = 
"orjson-3.9.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4751cee4a7b1daeacb90a7f5adf2170ccab893c3ab7c5cea58b45a13f89b30b3"}, + {file = "orjson-3.9.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d9a2a4b2302d5ebc3695498ebc305c3568e5ad4f3501eb30a6405a32d8af22"}, + {file = "orjson-3.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46b4facc32643b2689dfc292c0c463985dac4b6ab504799cf51fc3c6959ed668"}, + {file = "orjson-3.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ec7c8a0f1bf35da0d5fd14f8956f3b82a9a6918a3c6963d718dfd414d6d3b604"}, + {file = "orjson-3.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d3a40b0fbe06ccd4d6a99e523d20b47985655bcada8d1eba485b1b32a43e4904"}, {file = "orjson-3.9.1-cp38-none-win_amd64.whl", hash = "sha256:402f9d3edfec4560a98880224ec10eba4c5f7b4791e4bc0d4f4d8df5faf2a006"}, {file = "orjson-3.9.1-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:49c0d78dcd34626e2e934f1192d7c052b94e0ecadc5f386fd2bda6d2e03dadf5"}, + {file = "orjson-3.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:125f63e56d38393daa0a1a6dc6fedefca16c538614b66ea5997c3bd3af35ef26"}, + {file = "orjson-3.9.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:08927970365d2e1f3ce4894f9ff928a7b865d53f26768f1bbdd85dd4fee3e966"}, + {file = "orjson-3.9.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f9a744e212d4780ecd67f4b6b128b2e727bee1df03e7059cddb2dfe1083e7dc4"}, + {file = "orjson-3.9.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d1dbf36db7240c61eec98c8d21545d671bce70be0730deb2c0d772e06b71af3"}, + {file = "orjson-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80a1e384626f76b66df615f7bb622a79a25c166d08c5d2151ffd41f24c4cc104"}, + {file = "orjson-3.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:15d28872fb055bf17ffca913826e618af61b2f689d2b170f72ecae1a86f80d52"}, + {file = "orjson-3.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1e4d905338f9ef32c67566929dfbfbb23cc80287af8a2c38930fb0eda3d40b76"}, {file = "orjson-3.9.1-cp39-none-win_amd64.whl", hash = "sha256:48a27da6c7306965846565cc385611d03382bbd84120008653aa2f6741e2105d"}, + {file = "orjson-3.9.1.tar.gz", hash = "sha256:db373a25ec4a4fccf8186f9a72a1b3442837e40807a736a815ab42481e83b7d0"}, ] [[package]] @@ -11366,7 +11423,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", 
"azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech"] cohere = ["cohere"] docarray = ["docarray"] @@ -11380,4 +11437,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "dbbaa2907bf2ac09ed111ce712772bba0fe56901627f41c53aef71ae5a38d1c6" +content-hash = "ecf7086e83cc0ff19e6851c0b63170b082b267c1c1c00f47700fd3a8c8bb46c5" diff --git a/pyproject.toml b/pyproject.toml index f0f68ae3..40fc00be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,7 @@ pyspark = {version = "^3.4.0", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true} nebula3-python = {version = "^3.4.0", optional = true} langchainplus-sdk = ">=0.0.7" +awadb = {version = "^0.3.2", optional = true} [tool.poetry.group.docs.dependencies] @@ -286,6 +287,7 @@ all = [ "singlestoredb", "tigrisdb", "nebula3-python", + "awadb", ] # An extra used to be able to add extended testing. 
diff --git a/tests/integration_tests/vectorstores/test_awadb.py b/tests/integration_tests/vectorstores/test_awadb.py new file mode 100644 index 00000000..b643f682 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_awadb.py @@ -0,0 +1,55 @@ +"""Test AwaDB functionality.""" +from langchain.docstore.document import Document +from langchain.vectorstores import AwaDB +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_awadb() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = AwaDB.from_texts( + table_name="test_awadb", texts=texts, embedding=FakeEmbeddings() + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_awadb_with_metadatas() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = AwaDB.from_texts( + table_name="test_awadb", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + +def test_awadb_with_metadatas_with_scores() -> None: + """Test end to end construction and scored search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = AwaDB.from_texts( + table_name="test_awadb", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ) + output = docsearch.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +def test_awadb_add_texts() -> None: + """Test end to end adding of texts.""" + # Create initial doc store. + texts = ["foo", "bar", "baz"] + docsearch = AwaDB.from_texts( + table_name="test_awadb", texts=texts, embedding=FakeEmbeddings() + ) + # Test adding a similar document as before. + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=2) + assert output == [Document(page_content="foo"), Document(page_content="foo")]
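Putting the pieces above together, a minimal usage sketch of the new vectorstore might look as follows. It assumes the `awadb` package is installed; `OpenAIEmbeddings` and the `awadb_quickstart` table name are illustrative choices only (any `Embeddings` implementation can be passed), and the snippet is not part of the patch itself.

```python
from langchain.embeddings.openai import OpenAIEmbeddings  # requires OPENAI_API_KEY
from langchain.vectorstores import AwaDB

texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]

# from_texts creates the table, embeds the texts, and persists them on disk.
db = AwaDB.from_texts(
    texts=texts,
    embedding=OpenAIEmbeddings(),
    metadatas=metadatas,
    table_name="awadb_quickstart",  # hypothetical table name for this example
)

# Plain search returns Documents; the scored variant also returns a
# relevance score normalized to 0-1 (1 = most similar).
docs = db.similarity_search("foo", k=2)
docs_and_scores = db.similarity_search_with_score("foo", k=2)
print(docs[0].page_content)
print(docs_and_scores[0])

# Because AwaDB persists added data automatically, a later session can reopen
# the same table instead of re-ingesting the documents, mirroring the
# awadb.Client().Load(...) example in the notebook.
db_restored = AwaDB(table_name="awadb_quickstart", embedding_model=OpenAIEmbeddings())
print("table restored:", db_restored.load_local("awadb_quickstart"))
```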