Improve the performance of the add_texts interface and upgrade AwaDB from 0.3.2 to 0.3.3 (#6316)

1. Changed the implementation of the add_texts interface for the AwaDB
vector store to improve performance.
2. Upgraded AwaDB from 0.3.2 to 0.3.3.

---------

Co-authored-by: vincent <awadb.vincent@gmail.com>
searx_updates
ljeagle 11 months ago committed by GitHub
parent 24b2af5218
commit ad324a39ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -39,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)"
@ -59,22 +59,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "62b7a4c5",
"metadata": {},
"outputs": [],
"execution_count": 4,
"id": "4b172de8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
"source": [
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "a9b4be48",
"metadata": {},
"source": [
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence."
]
},
{
"cell_type": "markdown",
"id": "87fec6b5",
@ -103,22 +103,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0045583",
"metadata": {},
"outputs": [],
"execution_count": 4,
"id": "93cd0b7a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)\n"
]
}
],
"source": [
"print(docs[0])"
]
},
{
"cell_type": "markdown",
"id": "8c2da99d",
"metadata": {},
"source": [
"(Document(page_content='And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'}), 0.561813814013747)"
]
},
{
"cell_type": "markdown",
"id": "0b49fb59",
@ -153,19 +153,16 @@
"outputs": [],
"source": [
"awadb_client = awadb.Client()\n",
"ret = awadb_client.Load(\"langchain_awadb\")\n",
"if ret:\n",
" print(\"awadb load table success\")\n",
"ret = awadb_client.Load('langchain_awadb')\n",
"if ret : print('awadb load table success')\n",
"else:\n",
" print(\"awadb load table failed\")"
" print('awadb load table failed')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ae9a9dd",
"cell_type": "raw",
"id": "aba255c2",
"metadata": {},
"outputs": [],
"source": [
"awadb load table success"
]

@ -57,13 +57,15 @@ class AwaDB(VectorStore):
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
is_duplicate_texts: Optional[bool] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
kwargs: vectorstore specific parameters
is_duplicate_texts: Optional whether to duplicate texts.
kwargs: vectorstore specific parameters.
Returns:
List of ids from adding the texts into the vectorstore.
@ -74,28 +76,10 @@ class AwaDB(VectorStore):
embeddings = None
if self.embedding_model is not None:
embeddings = self.embedding_model.embed_documents(list(texts))
added_results: List[str] = []
doc_no = 0
for text in texts:
doc: List[Any] = []
if embeddings is not None:
doc.append(text)
doc.append(embeddings[doc_no])
else:
dict_tmp = {}
dict_tmp["embedding_text"] = text
doc.append(dict_tmp)
if metadatas is not None:
if doc_no < metadatas.__len__():
doc.append(metadatas[doc_no])
self.awadb_client.Add(doc)
added_results.append(str(self.added_doc_count))
doc_no = doc_no + 1
self.added_doc_count = self.added_doc_count + 1
return added_results
return self.awadb_client.AddTexts(
"text", "text_embedding", texts, embeddings, metadatas, is_duplicate_texts
)
def load_local(
self,

20
poetry.lock generated

@ -572,19 +572,19 @@ test = ["coverage (>=5,<6)", "pytest (>=6,<7)"]
[[package]]
name = "awadb"
version = "0.3.2"
version = "0.3.3"
description = "The AI Native database for embedding vectors"
category = "main"
optional = true
python-versions = ">=3.6"
python-versions = ">=3.7"
files = [
{file = "awadb-0.3.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:f3ce3b066198782fa413f452c56001c58ebec71a1e1dca0eee68f73321ba15a9"},
{file = "awadb-0.3.2-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:c96b5e263c32b2563b1fa027035bdcf50540808ad303071cc1aed3471c3c39b7"},
{file = "awadb-0.3.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:3e43b5a74753261857d0b146543a4620e00938833181259f138f07457fa84812"},
{file = "awadb-0.3.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6330b4d18a814c1562113b3b7897db629c2ac9b5818236ead0fc5f3445b6b7fb"},
{file = "awadb-0.3.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:82b4e61cc905339868a9f833d0988098f56411b42e0f8dd571aec7c8d6a3f1fa"},
{file = "awadb-0.3.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5efaa93d69c467f16ec4f65ed250ec26015781826c0d059c8a54613a5d3e2c3e"},
{file = "awadb-0.3.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7be0811550d72f49018e4790d290cf521f92ffa84d65ef1073e621f225d142ec"},
{file = "awadb-0.3.3-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:daebc108103c8cace41dfb3235fcfdda28ea48e6cd6548b6072f7ad49b64274b"},
{file = "awadb-0.3.3-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:2bb3ca2f943448060b1bba4395dd99e2218d7f2149507a8fdfa7a3fd4cfe97ec"},
{file = "awadb-0.3.3-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7b99662af9f7b58e217661a70c295e40605900552bec6d8e9553d90dbf19c5c1"},
{file = "awadb-0.3.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:94be44e587f28fa26b2cade0b6f4c04689f50cb0c07183db5ee50e48fe2e9ae3"},
{file = "awadb-0.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:314929dc3a8d25c0f234a2b86c920543050f4eb298a6f68bd2c97c9fe3fb6224"},
{file = "awadb-0.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8bfccff1c7373899153427d93d96a97ae5371e8a6f09ff4dcbd28fb9f3f63ff4"},
{file = "awadb-0.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:810021a90b873f668d8ab63e2c2747b2b2835bf0ae25f4223b6c94f06faffea4"},
]
[package.extras]
@ -11486,4 +11486,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "b4a782d8223ccc19b2dfb777978c3ad636b11a79cc58a5c45e4dcdb0fe5e29c1"
content-hash = "dd54bb9201b260b734ceebad2c6629b4b1b3bf224731de092782678219f32120"

@ -106,7 +106,7 @@ pyspark = {version = "^3.4.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
langchainplus-sdk = ">=0.0.9"
awadb = {version = "^0.3.2", optional = true}
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
[tool.poetry.group.docs.dependencies]

Loading…
Cancel
Save