langchain/tests/integration_tests/vectorstores/test_vectara.py

97 lines
3.4 KiB
Python
Raw Normal View History

import tempfile
import urllib.request
from langchain.docstore.document import Document
from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
# For this test to run properly, please set up the following:
# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
# 2. Create an API_KEY for this corpus with permissions for query and indexing.
# 3. Set the environment variables:
# VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
def get_abbr(s: str) -> str:
    """Return the abbreviation formed by the first letter of each word in *s*.

    Words are separated by any run of whitespace, so leading, trailing or
    repeated spaces do not produce empty words. An empty or all-whitespace
    string yields an empty abbreviation.

    Args:
        s: The phrase to abbreviate, e.g. ``"large language model"``.

    Returns:
        The concatenated first characters, e.g. ``"llm"``.
    """
    # split() with no argument never yields empty strings, so word[0] is
    # always safe; split(" ") would yield "" for consecutive spaces.
    return "".join(word[0] for word in s.split())
def test_vectara_add_documents() -> None:
    """Build a store from texts, add documents, then check similarity search."""
    # Seed batch: each text is paired with its abbreviation and the marker
    # value that the search filter below selects on.
    seed_texts = [
        "grounded generation",
        "retrieval augmented generation",
        "data privacy",
    ]
    seed_metadatas = [{"abbr": abbr, "test_num": "1"} for abbr in ("gg", "rag", "dp")]
    store: Vectara = Vectara.from_texts(
        seed_texts,
        embedding=FakeEmbeddings(),
        metadatas=seed_metadatas,
        doc_metadata={"test_num": "1"},
    )

    # Second batch: added as Document objects with derived abbreviations.
    extra_docs = []
    for text in ["large language model", "information retrieval", "question answering"]:
        extra_docs.append(
            Document(page_content=text, metadata={"abbr": get_abbr(text)})
        )
    store.add_documents(extra_docs, doc_metadata={"test_num": "1"})

    # Query for one of the added texts, restricted to this test's documents.
    hits = store.similarity_search(
        "large language model",
        k=2,
        n_sentence_context=0,
        filter="doc.test_num = 1",
    )

    # Check the two results in rank order: content first, then metadata.
    expected = [("large language model", "llm"), ("information retrieval", "ir")]
    for rank, (text, abbr) in enumerate(expected):
        assert hits[rank].page_content == text
        assert hits[rank].metadata == {"abbr": abbr}
def test_vectara_from_files() -> None:
    """Test end to end construction and search from uploaded files.

    Downloads two documents to local storage, passes the local copies to
    ``Vectara.from_files`` and asserts that a similarity search filtered on
    ``test_num = 2`` returns the expected sentence as its top hit.
    """
    # attention paper and deep learning book
    urls = [
        "https://arxiv.org/pdf/1706.03762.pdf",
        (
            "https://www.microsoft.com/en-us/research/wp-content/uploads/"
            "2016/02/Final-DengYu-NOW-Book-DeepLearn2013-ForLecturesJuly2.docx"
        ),
    ]

    # Download into a private directory that is removed when the block exits,
    # so the test leaves no files behind even when a step below fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        files_list = []
        for url in urls:
            # delete=False plus an immediate close reserves a unique path that
            # still exists on disk; taking `.name` from a discarded
            # NamedTemporaryFile would reuse the name of an already-deleted file.
            with tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False) as tmp_file:
                name = tmp_file.name
            urllib.request.urlretrieve(url, name)
            files_list.append(name)

        docsearch: Vectara = Vectara.from_files(
            files=files_list,
            embedding=FakeEmbeddings(),
            metadatas=[{"url": url, "test_num": "2"} for url in urls],
        )

        # finally do a similarity search to see if all works okay; kept inside
        # the block so the local files outlive every call made on docsearch
        output = docsearch.similarity_search(
            "By the commonly adopted machine learning tradition",
            k=1,
            n_sentence_context=0,
            filter="doc.test_num = 2",
        )

    assert output[0].page_content == (
        "By the commonly adopted machine learning tradition "
        "(e.g., Chapter 28 in Murphy, 2012; Deng and Li, 2013), it may be natural "
        "to just classify deep learning techniques into deep discriminative models "
        "(e.g., DNNs) and deep probabilistic generative models (e.g., DBN, Deep "
        "Boltzmann Machine (DBM))."
    )