Improve the performance of the add_texts interface and upgrade AwaDB from 0.3.2 to 0.3.3 (#6316)

1. Changed the implementation of the add_texts interface for the AwaDB
vector store to improve its performance
2. Upgraded AwaDB from 0.3.2 to 0.3.3

---------

Co-authored-by: vincent <awadb.vincent@gmail.com>
searx_updates
ljeagle 12 months ago committed by GitHub
parent 24b2af5218
commit ad324a39ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -39,7 +39,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n", "loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n", "documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)\n", "text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)" "docs = text_splitter.split_documents(documents)"
@ -59,22 +59,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"id": "62b7a4c5", "id": "4b172de8",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
"source": [ "source": [
"print(docs[0].page_content)" "print(docs[0].page_content)"
] ]
}, },
{
"cell_type": "markdown",
"id": "a9b4be48",
"metadata": {},
"source": [
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence."
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "87fec6b5", "id": "87fec6b5",
@ -103,22 +103,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"id": "f0045583", "id": "93cd0b7a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)\n"
]
}
],
"source": [ "source": [
"print(docs[0])" "print(docs[0])"
] ]
}, },
{
"cell_type": "markdown",
"id": "8c2da99d",
"metadata": {},
"source": [
"(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "0b49fb59", "id": "0b49fb59",
@ -153,19 +153,16 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"awadb_client = awadb.Client()\n", "awadb_client = awadb.Client()\n",
"ret = awadb_client.Load(\"langchain_awadb\")\n", "ret = awadb_client.Load('langchain_awadb')\n",
"if ret:\n", "if ret : print('awadb load table success')\n",
" print(\"awadb load table success\")\n",
"else:\n", "else:\n",
" print(\"awadb load table failed\")" " print('awadb load table failed')"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "raw",
"execution_count": null, "id": "aba255c2",
"id": "5ae9a9dd",
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"awadb load table success" "awadb load table success"
] ]

@ -57,13 +57,15 @@ class AwaDB(VectorStore):
self, self,
texts: Iterable[str], texts: Iterable[str],
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[dict]] = None,
is_duplicate_texts: Optional[bool] = None,
**kwargs: Any, **kwargs: Any,
) -> List[str]: ) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore. """Run more texts through the embeddings and add to the vectorstore.
Args: Args:
texts: Iterable of strings to add to the vectorstore. texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts. metadatas: Optional list of metadatas associated with the texts.
kwargs: vectorstore specific parameters is_duplicate_texts: Optional whether to duplicate texts.
kwargs: vectorstore specific parameters.
Returns: Returns:
List of ids from adding the texts into the vectorstore. List of ids from adding the texts into the vectorstore.
@ -74,28 +76,10 @@ class AwaDB(VectorStore):
embeddings = None embeddings = None
if self.embedding_model is not None: if self.embedding_model is not None:
embeddings = self.embedding_model.embed_documents(list(texts)) embeddings = self.embedding_model.embed_documents(list(texts))
added_results: List[str] = []
doc_no = 0
for text in texts:
doc: List[Any] = []
if embeddings is not None:
doc.append(text)
doc.append(embeddings[doc_no])
else:
dict_tmp = {}
dict_tmp["embedding_text"] = text
doc.append(dict_tmp)
if metadatas is not None:
if doc_no < metadatas.__len__():
doc.append(metadatas[doc_no])
self.awadb_client.Add(doc)
added_results.append(str(self.added_doc_count))
doc_no = doc_no + 1 return self.awadb_client.AddTexts(
self.added_doc_count = self.added_doc_count + 1 "text", "text_embedding", texts, embeddings, metadatas, is_duplicate_texts
)
return added_results
def load_local( def load_local(
self, self,

20
poetry.lock generated

@ -572,19 +572,19 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]
[[package]] [[package]]
name = "awadb" name = "awadb"
version = "0.3.2" version = "0.3.3"
description = "The AI Native database for embedding vectors" description = "The AI Native database for embedding vectors"
category = "main" category = "main"
optional = true optional = true
python-versions = ">=3.6" python-versions = ">=3.7"
files = [ files = [
{file = "awadb-0.3.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:f3ce3b066198782fa413f452c56001c58ebec71a1e1dca0eee68f73321ba15a9"}, {file = "awadb-0.3.3-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:daebc108103c8cace41dfb3235fcfdda28ea48e6cd6548b6072f7ad49b64274b"},
{file = "awadb-0.3.2-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:c96b5e263c32b2563b1fa027035bdcf50540808ad303071cc1aed3471c3c39b7"}, {file = "awadb-0.3.3-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:2bb3ca2f943448060b1bba4395dd99e2218d7f2149507a8fdfa7a3fd4cfe97ec"},
{file = "awadb-0.3.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:3e43b5a74753261857d0b146543a4620e00938833181259f138f07457fa84812"}, {file = "awadb-0.3.3-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7b99662af9f7b58e217661a70c295e40605900552bec6d8e9553d90dbf19c5c1"},
{file = "awadb-0.3.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6330b4d18a814c1562113b3b7897db629c2ac9b5818236ead0fc5f3445b6b7fb"}, {file = "awadb-0.3.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:94be44e587f28fa26b2cade0b6f4c04689f50cb0c07183db5ee50e48fe2e9ae3"},
{file = "awadb-0.3.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:82b4e61cc905339868a9f833d0988098f56411b42e0f8dd571aec7c8d6a3f1fa"}, {file = "awadb-0.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:314929dc3a8d25c0f234a2b86c920543050f4eb298a6f68bd2c97c9fe3fb6224"},
{file = "awadb-0.3.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5efaa93d69c467f16ec4f65ed250ec26015781826c0d059c8a54613a5d3e2c3e"}, {file = "awadb-0.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8bfccff1c7373899153427d93d96a97ae5371e8a6f09ff4dcbd28fb9f3f63ff4"},
{file = "awadb-0.3.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7be0811550d72f49018e4790d290cf521f92ffa84d65ef1073e621f225d142ec"}, {file = "awadb-0.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:810021a90b873f668d8ab63e2c2747b2b2835bf0ae25f4223b6c94f06faffea4"},
] ]
[package.extras] [package.extras]
@ -11486,4 +11486,4 @@ text-helpers = ["chardet"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "b4a782d8223ccc19b2dfb777978c3ad636b11a79cc58a5c45e4dcdb0fe5e29c1" content-hash = "dd54bb9201b260b734ceebad2c6629b4b1b3bf224731de092782678219f32120"

@ -106,7 +106,7 @@ pyspark = {version = "^3.4.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true} nebula3-python = {version = "^3.4.0", optional = true}
langchainplus-sdk = ">=0.0.9" langchainplus-sdk = ">=0.0.9"
awadb = {version = "^0.3.2", optional = true} awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true} azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]

Loading…
Cancel
Save