# langchain/tests/integration_tests/vectorstores/test_vectara.py

import tempfile
import urllib.request

from langchain.docstore.document import Document
from langchain.vectorstores.vectara import Vectara
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

# For this test to run properly, please set up as follows:
# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
# 2. Create an API_KEY for this corpus with permissions for query and indexing.
# 3. Set up environment variables:
#    VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID


def get_abbr(s: str) -> str:
    """Return the abbreviation made of the first letter of each word in *s*.

    Words are delimited by single space characters. Empty words (produced by
    an empty string, or by leading, trailing or repeated spaces) are skipped
    instead of raising ``IndexError``, so ``get_abbr("")`` returns ``""``.
    """
    # Splitting on " " (not arbitrary whitespace) keeps the original word
    # boundaries; the `if word` guard drops the empty pieces it can yield.
    return "".join(word[0] for word in s.split(" ") if word)


def test_vectara_add_documents() -> None:
    """Build a store from texts, add more documents, then search them."""

    # Initial texts mapped to their abbreviations; all tagged with test_num "1".
    seed_abbrs = {
        "grounded generation": "gg",
        "retrieval augmented generation": "rag",
        "data privacy": "dp",
    }
    seed_metadatas = [
        {"abbr": abbr, "test_num": "1"} for abbr in seed_abbrs.values()
    ]
    store: Vectara = Vectara.from_texts(
        list(seed_abbrs),
        embedding=FakeEmbeddings(),
        metadatas=seed_metadatas,
        doc_metadata={"test_num": "1"},
    )

    # Additional documents whose abbreviation metadata is derived by get_abbr.
    extra_docs = []
    for text in ("large language model", "information retrieval", "question answering"):
        extra_docs.append(
            Document(page_content=text, metadata={"abbr": get_abbr(text)})
        )
    store.add_documents(extra_docs, doc_metadata={"test_num": "1"})

    # Query restricted to this test's documents via the test_num filter.
    hits = store.similarity_search(
        "large language model",
        k=2,
        n_sentence_context=0,
        filter="doc.test_num = 1",
    )

    # Expected results in rank order: (page content, metadata).
    expected = [
        ("large language model", {"abbr": "llm"}),
        ("information retrieval", {"abbr": "ir"}),
    ]
    for rank, (content, metadata) in enumerate(expected):
        assert hits[rank].page_content == content
        assert hits[rank].metadata == metadata


def test_vectara_from_files() -> None:
    """Test end to end construction from uploaded files, then search.

    Two documents (the attention paper and a deep learning book) are
    downloaded to local storage, uploaded via ``Vectara.from_files`` and then
    queried with a sentence expected to appear in the deep learning book.
    """
    import os

    # attention paper and deep learning book
    urls = [
        ("https://arxiv.org/pdf/1706.03762.pdf"),
        (
            "https://www.microsoft.com/en-us/research/wp-content/uploads/"
            "2016/02/Final-DengYu-NOW-Book-DeepLearn2013-ForLecturesJuly2.docx"
        ),
    ]

    # Download into a private temporary directory instead of reusing the name
    # of an already-discarded NamedTemporaryFile: the paths are reserved for
    # this test only, and the directory (with the downloads) is removed on
    # exit, even if a download, the upload or the search raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        files_list = []
        for idx, url in enumerate(urls):
            # No file extension, matching the extension-less temp names used
            # before. NOTE(review): presumably the backend detects the file
            # type from content rather than from the name; confirm.
            path = os.path.join(tmp_dir, f"vectara_test_doc_{idx}")
            urllib.request.urlretrieve(url, path)
            files_list.append(path)

        docsearch: Vectara = Vectara.from_files(
            files=files_list,
            embedding=FakeEmbeddings(),
            metadatas=[{"url": url, "test_num": "2"} for url in urls],
        )

        # finally do a similarity search to see if all works okay
        output = docsearch.similarity_search(
            "By the commonly adopted machine learning tradition",
            k=1,
            n_sentence_context=0,
            filter="doc.test_num = 2",
        )

    assert output[0].page_content == (
        "By the commonly adopted machine learning tradition "
        "(e.g., Chapter 28 in Murphy, 2012; Deng and Li, 2013), it may be natural "
        "to just classify deep learning techniques into deep discriminative models "
        "(e.g., DNNs) and deep probabilistic generative models (e.g., DBN, Deep "
        "Boltzmann Machine (DBM))."
    )
Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`import tempfile`
			`import urllib.request`

Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00			`from langchain.docstore.document import Document`
			`from langchain.vectorstores.vectara import Vectara`
			`from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings`

Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`# For this test to run properly, please setup as follows`
			`# 1. Create a corpus in Vectara, with a filter attribute called "test_num".`
			`# 2. Create an API_KEY for this corpus with permissions for query and indexing`
			`# 3. Setup environment variables:`
			`# VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID`

Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00
			`def get_abbr(s: str) -> str:`
			`words = s.split(" ") # Split the string into words`
			`first_letters = [word[0] for word in words] # Extract the first letter of each word`
			`return "".join(first_letters) # Join the first letters into a single string`


			`def test_vectara_add_documents() -> None:`
			`"""Test end to end construction and search."""`

Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`# start with some initial texts`
Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00			`texts = ["grounded generation", "retrieval augmented generation", "data privacy"]`
			`docsearch: Vectara = Vectara.from_texts(`
			`texts,`
			`embedding=FakeEmbeddings(),`
Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`metadatas=[`
			`{"abbr": "gg", "test_num": "1"},`
			`{"abbr": "rag", "test_num": "1"},`
			`{"abbr": "dp", "test_num": "1"},`
			`],`
			`doc_metadata={"test_num": "1"},`
Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00			`)`

			`# then add some additional documents`
			`new_texts = ["large language model", "information retrieval", "question answering"]`
			`docsearch.add_documents(`
Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`[Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],`
			`doc_metadata={"test_num": "1"},`
Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00			`)`

			`# finally do a similarity search to see if all works okay`
Update to Vectara integration (#5950) This PR updates the Vectara integration (@hwchase17 ): * Adds reuse of requests.session to imrpove efficiency and speed. * Utilizes Vectara's low-level API (instead of standard API) to better match user's specific chunking with LangChain * Now add_texts puts all the texts into a single Vectara document so indexing is much faster. * updated variables names from alpha to lambda_val (to be consistent with Vectara docs) and added n_context_sentence so it's available to use if needed. * Updates to documentation and tests --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> 2023-06-10 23:27:01 +00:00			`output = docsearch.similarity_search(`
Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00			`"large language model",`
			`k=2,`
			`n_sentence_context=0,`
			`filter="doc.test_num = 1",`
Update to Vectara integration (#5950) This PR updates the Vectara integration (@hwchase17 ): * Adds reuse of requests.session to imrpove efficiency and speed. * Utilizes Vectara's low-level API (instead of standard API) to better match user's specific chunking with LangChain * Now add_texts puts all the texts into a single Vectara document so indexing is much faster. * updated variables names from alpha to lambda_val (to be consistent with Vectara docs) and added n_context_sentence so it's available to use if needed. * Updates to documentation and tests --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> 2023-06-10 23:27:01 +00:00			`)`
Vectara (#5069) # Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-24 08:24:58 +00:00			`assert output[0].page_content == "large language model"`
			`assert output[0].metadata == {"abbr": "llm"}`
			`assert output[1].page_content == "information retrieval"`
			`assert output[1].metadata == {"abbr": "ir"}`
Vectara upd2 (#6506) Update to Vectara integration - By user request added "add_files" to take advantage of Vectara capabilities to process files on the backend, without the need for separate loading of documents and chunking in the chain. - Updated vectara.ipynb example notebook to be broader and added testing of add_file() @hwchase17 - project lead --------- Co-authored-by: rlm <pexpresss31@gmail.com> 2023-07-02 19:15:50 +00:00

			`def test_vectara_from_files() -> None:`
			`"""Test end to end construction and search."""`

			`# download documents to local storage and then upload as files`
			`# attention paper and deep learning book`
			`urls = [`
			`("https://arxiv.org/pdf/1706.03762.pdf"),`
			`(`
			`"https://www.microsoft.com/en-us/research/wp-content/uploads/"`
			`"2016/02/Final-DengYu-NOW-Book-DeepLearn2013-ForLecturesJuly2.docx"`
			`),`
			`]`

			`files_list = []`
			`for url in urls:`
			`name = tempfile.NamedTemporaryFile().name`
			`urllib.request.urlretrieve(url, name)`
			`files_list.append(name)`

			`docsearch: Vectara = Vectara.from_files(`
			`files=files_list,`
			`embedding=FakeEmbeddings(),`
			`metadatas=[{"url": url, "test_num": "2"} for url in urls],`
			`)`

			`# finally do a similarity search to see if all works okay`
			`output = docsearch.similarity_search(`
			`"By the commonly adopted machine learning tradition",`
			`k=1,`
			`n_sentence_context=0,`
			`filter="doc.test_num = 2",`
			`)`
			`print(output)`
			`assert output[0].page_content == (`
			`"By the commonly adopted machine learning tradition "`
			`"(e.g., Chapter 28 in Murphy, 2012; Deng and Li, 2013), it may be natural "`
			`"to just classify deep learning techniques into deep discriminative models "`
			`"(e.g., DNNs) and deep probabilistic generative models (e.g., DBN, Deep "`
			`"Boltzmann Machine (DBM))."`
			`)`