Updated usage of metadata so that both part- and doc-level metadata are returned properly as a single metadata dict

Updated tests
Ofer Mendelevitch 2023-08-19 13:59:52 -07:00
parent 129d056085
commit c4c79da071
2 changed files with 64 additions and 39 deletions
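
In effect, each search result's part-level metadata is now merged with the metadata of its parent document (looked up via documentIndex) into one dict. A minimal self-contained sketch of that merge logic, using the responseSet field names that appear in the diff below (the helper name here is ours, not part of the commit):

from typing import Any, Dict, List

def merge_result_metadata(
    responses: List[Dict[str, Any]], documents: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    # One merged dict per query result: part-level fields first,
    # then the parent document's fields (doc-level wins on key collisions).
    metadatas = []
    for x in responses:
        md = {m["name"]: m["value"] for m in x["metadata"]}
        doc_md = {
            m["name"]: m["value"]
            for m in documents[x["documentIndex"]]["metadata"]
        }
        md.update(doc_md)
        metadatas.append(md)
    return metadatas

responses = [{"documentIndex": 0, "metadata": [{"name": "abbr", "value": "llm"}]}]
documents = [{"metadata": [{"name": "test_num", "value": "1"}]}]
print(merge_result_metadata(responses, documents))  # [{'abbr': 'llm', 'test_num': '1'}]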

View File

@@ -202,12 +202,12 @@ class Vectara(VectorStore):
             doc_metadata: optional metadata for the document

         This function indexes all the input text strings in the Vectara corpus as a
-        single Vectara document, where each input text is considered a "part" and the
-        metadata are associated with each part.
+        single Vectara document, where each input text is considered a "section" and the
+        metadata are associated with each section.
         if 'doc_metadata' is provided, it is associated with the Vectara document.

         Returns:
-            List of ids from adding the texts into the vectorstore.
+            document ID of the document added

         """
         doc_hash = md5()
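
The updated docstring describes the new contract: every text in the call is indexed into one Vectara document, each as a "section" with its own metadata, and doc_metadata attaches to the document as a whole. A hedged usage sketch of that contract (assumes VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY are set in the environment):

from langchain.vectorstores.vectara import Vectara

vectara = Vectara()  # credentials are read from the VECTARA_* env vars
doc_ids = vectara.add_texts(
    ["first section", "second section"],       # each text becomes one section
    metadatas=[{"part": "a"}, {"part": "b"}],  # section-level metadata
    doc_metadata={"source": "example"},        # attached to the single document
)
print(doc_ids)  # ID of the one Vectara document created (the tests below concatenate these)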
@@ -307,21 +307,27 @@ class Vectara(VectorStore):
         result = response.json()
         responses = result["responseSet"][0]["response"]
-        vectara_default_metadata = ["lang", "len", "offset"]
+        documents = result["responseSet"][0]["document"]
+
+        metadatas = []
+        for x in responses:
+            md = { m["name"]: m["value"] for m in x["metadata"] }
+            doc_num = x['documentIndex']
+            doc_md = { m["name"]: m["value"] for m in documents[doc_num]['metadata'] }
+            md.update(doc_md)
+            metadatas.append(md)

         docs = [
             (
                 Document(
                     page_content=x["text"],
-                    metadata={
-                        m["name"]: m["value"]
-                        for m in x["metadata"]
-                        if m["name"] not in vectara_default_metadata
-                    },
+                    metadata=md,
                 ),
                 x["score"],
             )
-            for x in responses
+            for x,md in zip(responses,metadatas)
         ]
         return docs

     def similarity_search(
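
Note that dropping the vectara_default_metadata filter means Vectara-supplied part fields (lang, len, offset) now come back alongside user metadata. An illustrative shape of a returned result under the new code (the values here are made up):

from langchain.docstore.document import Document

hit = Document(
    page_content="large language model",
    metadata={
        "lang": "eng",     # Vectara part-level defaults, no longer filtered out
        "offset": "0",
        "len": "20",
        "abbr": "llm",     # user-supplied part-level metadata
        "test_num": "2",   # doc-level metadata merged in
    },
)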

View File

@@ -5,12 +5,14 @@ from langchain.docstore.document import Document
 from langchain.vectorstores.vectara import Vectara
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

-# For this test to run properly, please setup as follows
-# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
-# 2. Create an API_KEY for this corpus with permissions for query and indexing
-# 3. Setup environment variables:
+#
+# For this test to run properly, please setup as follows:
+# 1. Create a Vectara account: sign up at https://console.vectara.com/signup
+# 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
+# 3. Create an API_KEY for this corpus with permissions for query and indexing
+# 4. Setup environment variables:
 # VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
+#

 def get_abbr(s: str) -> str:
     words = s.split(" ")  # Split the string into words
@@ -21,38 +23,52 @@ def get_abbr(s: str) -> str:
 def test_vectara_add_documents() -> None:
     """Test end to end construction and search."""

-    # start with some initial texts
-    texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
-    docsearch: Vectara = Vectara.from_texts(
-        texts,
-        embedding=FakeEmbeddings(),
-        metadatas=[
-            {"abbr": "gg", "test_num": "1"},
-            {"abbr": "rag", "test_num": "1"},
-            {"abbr": "dp", "test_num": "1"},
-        ],
+    # create a new Vectara instance
+    docsearch: Vectara = Vectara()
+
+    # start with some initial texts, added with add_texts
+    texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
+    md = [{"abbr": get_abbr(t)} for t in texts1]
+    doc_id1 = docsearch.add_texts(
+        texts1,
+        metadatas=md,
+        doc_metadata={"test_num": "1"},
     )

-    # then add some additional documents
-    new_texts = ["large language model", "information retrieval", "question answering"]
-    docsearch.add_documents(
-        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
-        doc_metadata={"test_num": "1"},
+    # then add some additional documents, now with add_documents
+    texts2 = ["large language model", "information retrieval", "question answering"]
+    doc_id2 = docsearch.add_documents(
+        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
+        doc_metadata={"test_num": "2"},
     )
+    doc_ids = doc_id1 + doc_id2

-    # finally do a similarity search to see if all works okay
-    output = docsearch.similarity_search(
+    # test without filter
+    output1 = docsearch.similarity_search(
         "large language model",
         k=2,
         n_sentence_context=0,
+    )
+    assert len(output1) == 2
+    assert output1[0].page_content == "large language model"
+    assert output1[0].metadata['abbr'] == "llm"
+    assert output1[1].page_content == "information retrieval"
+    assert output1[1].metadata['abbr'] == "ir"
+
+    # test with metadata filter (doc level)
+    # since the query does not match test_num=1 directly we get RAG as the matching result
+    output2 = docsearch.similarity_search(
+        "large language model",
+        k=1,
+        n_sentence_context=0,
         filter="doc.test_num = 1",
     )
-    assert output[0].page_content == "large language model"
-    assert output[0].metadata == {"abbr": "llm"}
-    assert output[1].page_content == "information retrieval"
-    assert output[1].metadata == {"abbr": "ir"}
+    assert len(output2) == 1
+    assert output2[0].page_content == "retrieval augmented generation"
+    assert output2[0].metadata['abbr'] == "rag"
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)

 def test_vectara_from_files() -> None:
     """Test end to end construction and search."""
@@ -73,8 +89,9 @@ def test_vectara_from_files() -> None:
         urllib.request.urlretrieve(url, name)
         files_list.append(name)

-    docsearch: Vectara = Vectara.from_files(
-        files=files_list,
+    docsearch: Vectara = Vectara()
+    doc_ids = docsearch.add_files(
+        files_list=files_list,
         embedding=FakeEmbeddings(),
         metadatas=[{"url": url, "test_num": "2"} for url in urls],
     )
@@ -101,7 +118,6 @@ def test_vectara_from_files() -> None:
         n_sentence_context=1,
         filter="doc.test_num = 2",
     )
-    print(output[0].page_content)
     assert output[0].page_content == (
         """\
 Note the use of hybrid in 3) above is different from that used sometimes in the literature, \
@@ -114,3 +130,6 @@ This classification scheme, however, misses a key insight gained in deep learning
 models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
 """  # noqa: E501
     )
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)
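
Taken together, a minimal end-to-end sketch mirroring the updated tests (assumes the corpus defines test_num as a filter attribute and that the VECTARA_* env vars are set; values are placeholders):

from langchain.vectorstores.vectara import Vectara

vectara = Vectara()  # reads VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID, VECTARA_API_KEY

vectara.add_texts(
    ["grounded generation", "data privacy"],
    metadatas=[{"abbr": "gg"}, {"abbr": "dp"}],
    doc_metadata={"test_num": "1"},
)

results = vectara.similarity_search(
    "privacy",
    k=1,
    n_sentence_context=0,
    filter="doc.test_num = 1",  # doc-level fields are filterable as doc.<name>
)
for r in results:
    print(r.page_content, r.metadata)  # metadata now includes doc-level test_num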