mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
Updated usage of metadata so that both part-level and document-level metadata are returned properly as a single metadata dict
Updated tests
This commit is contained in:
parent
129d056085
commit
c4c79da071
@ -202,12 +202,12 @@ class Vectara(VectorStore):
|
||||
doc_metadata: optional metadata for the document
|
||||
|
||||
This function indexes all the input text strings in the Vectara corpus as a
|
||||
single Vectara document, where each input text is considered a "part" and the
|
||||
metadata are associated with each part.
|
||||
single Vectara document, where each input text is considered a "section" and the
|
||||
metadata are associated with each section.
|
||||
if 'doc_metadata' is provided, it is associated with the Vectara document.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
document ID of the document added
|
||||
|
||||
"""
|
||||
doc_hash = md5()
|
||||
@ -307,21 +307,27 @@ class Vectara(VectorStore):
|
||||
result = response.json()
|
||||
|
||||
responses = result["responseSet"][0]["response"]
|
||||
vectara_default_metadata = ["lang", "len", "offset"]
|
||||
documents = result["responseSet"][0]["document"]
|
||||
|
||||
metadatas = []
|
||||
for x in responses:
|
||||
md = { m["name"]: m["value"] for m in x["metadata"] }
|
||||
doc_num = x['documentIndex']
|
||||
doc_md = { m["name"]: m["value"] for m in documents[doc_num]['metadata'] }
|
||||
md.update(doc_md)
|
||||
metadatas.append(md)
|
||||
|
||||
docs = [
|
||||
(
|
||||
Document(
|
||||
page_content=x["text"],
|
||||
metadata={
|
||||
m["name"]: m["value"]
|
||||
for m in x["metadata"]
|
||||
if m["name"] not in vectara_default_metadata
|
||||
},
|
||||
metadata=md,
|
||||
),
|
||||
x["score"],
|
||||
)
|
||||
for x in responses
|
||||
for x,md in zip(responses,metadatas)
|
||||
]
|
||||
|
||||
return docs
|
||||
|
||||
def similarity_search(
|
||||
|
@ -5,12 +5,14 @@ from langchain.docstore.document import Document
|
||||
from langchain.vectorstores.vectara import Vectara
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
# For this test to run properly, please setup as follows
|
||||
# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
|
||||
# 2. Create an API_KEY for this corpus with permissions for query and indexing
|
||||
# 3. Setup environment variables:
|
||||
#
|
||||
# For this test to run properly, please set up as follows:
|
||||
# 1. Create a Vectara account: sign up at https://console.vectara.com/signup
|
||||
# 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
|
||||
# 3. Create an API_KEY for this corpus with permissions for query and indexing
|
||||
# 4. Set up environment variables:
|
||||
# VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
|
||||
|
||||
#
|
||||
|
||||
def get_abbr(s: str) -> str:
|
||||
words = s.split(" ") # Split the string into words
|
||||
@ -21,38 +23,52 @@ def get_abbr(s: str) -> str:
|
||||
def test_vectara_add_documents() -> None:
|
||||
"""Test end to end construction and search."""
|
||||
|
||||
# start with some initial texts
|
||||
texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
|
||||
docsearch: Vectara = Vectara.from_texts(
|
||||
texts,
|
||||
embedding=FakeEmbeddings(),
|
||||
metadatas=[
|
||||
{"abbr": "gg", "test_num": "1"},
|
||||
{"abbr": "rag", "test_num": "1"},
|
||||
{"abbr": "dp", "test_num": "1"},
|
||||
],
|
||||
# create a new Vectara instance
|
||||
docsearch: Vectara = Vectara()
|
||||
|
||||
# start with some initial texts, added with add_texts
|
||||
texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
|
||||
md = [{"abbr": get_abbr(t)} for t in texts1]
|
||||
doc_id1 = docsearch.add_texts(
|
||||
texts1,
|
||||
metadatas=md,
|
||||
doc_metadata={"test_num": "1"},
|
||||
)
|
||||
|
||||
# then add some additional documents
|
||||
new_texts = ["large language model", "information retrieval", "question answering"]
|
||||
docsearch.add_documents(
|
||||
[Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
|
||||
doc_metadata={"test_num": "1"},
|
||||
# then add some additional documents, now with add_documents
|
||||
texts2 = ["large language model", "information retrieval", "question answering"]
|
||||
doc_id2 = docsearch.add_documents(
|
||||
[Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
|
||||
doc_metadata={"test_num": "2"},
|
||||
)
|
||||
doc_ids = doc_id1 + doc_id2
|
||||
|
||||
# finally do a similarity search to see if all works okay
|
||||
output = docsearch.similarity_search(
|
||||
# test without filter
|
||||
output1 = docsearch.similarity_search(
|
||||
"large language model",
|
||||
k=2,
|
||||
n_sentence_context=0,
|
||||
)
|
||||
assert len(output1) == 2
|
||||
assert output1[0].page_content == "large language model"
|
||||
assert output1[0].metadata['abbr'] == "llm"
|
||||
assert output1[1].page_content == "information retrieval"
|
||||
assert output1[1].metadata['abbr'] == "ir"
|
||||
|
||||
# test with metadata filter (doc level)
|
||||
# the exact-match text lives in the test_num=2 document, so under this filter RAG is the best match
|
||||
output2 = docsearch.similarity_search(
|
||||
"large language model",
|
||||
k=1,
|
||||
n_sentence_context=0,
|
||||
filter="doc.test_num = 1",
|
||||
)
|
||||
assert output[0].page_content == "large language model"
|
||||
assert output[0].metadata == {"abbr": "llm"}
|
||||
assert output[1].page_content == "information retrieval"
|
||||
assert output[1].metadata == {"abbr": "ir"}
|
||||
assert len(output2) == 1
|
||||
assert output2[0].page_content == "retrieval augmented generation"
|
||||
assert output2[0].metadata['abbr'] == "rag"
|
||||
|
||||
for doc_id in doc_ids:
|
||||
docsearch._delete_doc(doc_id)
|
||||
|
||||
def test_vectara_from_files() -> None:
|
||||
"""Test end to end construction and search."""
|
||||
@ -73,8 +89,9 @@ def test_vectara_from_files() -> None:
|
||||
urllib.request.urlretrieve(url, name)
|
||||
files_list.append(name)
|
||||
|
||||
docsearch: Vectara = Vectara.from_files(
|
||||
files=files_list,
|
||||
docsearch: Vectara = Vectara()
|
||||
doc_ids = docsearch.add_files(
|
||||
files_list=files_list,
|
||||
embedding=FakeEmbeddings(),
|
||||
metadatas=[{"url": url, "test_num": "2"} for url in urls],
|
||||
)
|
||||
@ -101,7 +118,6 @@ def test_vectara_from_files() -> None:
|
||||
n_sentence_context=1,
|
||||
filter="doc.test_num = 2",
|
||||
)
|
||||
print(output[0].page_content)
|
||||
assert output[0].page_content == (
|
||||
"""\
|
||||
Note the use of “hybrid” in 3) above is different from that used sometimes in the literature, \
|
||||
@ -114,3 +130,6 @@ This classification scheme, however, misses a key insight gained in deep learnin
|
||||
models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
|
||||
""" # noqa: E501
|
||||
)
|
||||
|
||||
for doc_id in doc_ids:
|
||||
docsearch._delete_doc(doc_id)
|
||||
|
Loading…
Reference in New Issue
Block a user