Updated usage of metadata so that both part- and doc-level metadata are returned properly as a single metadata dict

Updated tests
Ofer Mendelevitch 2023-08-19 13:59:52 -07:00
parent 129d056085
commit c4c79da071
2 changed files with 64 additions and 39 deletions
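
In effect, each search result's part-level metadata is now merged with the metadata of its parent document (looked up via documentIndex) into one dict. A minimal self-contained sketch of that merge logic, using the responseSet field names that appear in the diff below (the helper name here is ours, not part of the commit):

from typing import Any, Dict, List

def merge_result_metadata(
    responses: List[Dict[str, Any]], documents: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    # One merged dict per query result: part-level fields first,
    # then the parent document's fields (doc-level wins on key collisions).
    metadatas = []
    for x in responses:
        md = {m["name"]: m["value"] for m in x["metadata"]}
        doc_md = {
            m["name"]: m["value"]
            for m in documents[x["documentIndex"]]["metadata"]
        }
        md.update(doc_md)
        metadatas.append(md)
    return metadatas

responses = [{"documentIndex": 0, "metadata": [{"name": "abbr", "value": "llm"}]}]
documents = [{"metadata": [{"name": "test_num", "value": "1"}]}]
print(merge_result_metadata(responses, documents))  # [{'abbr': 'llm', 'test_num': '1'}]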

View File

@@ -202,12 +202,12 @@ class Vectara(VectorStore):
             doc_metadata: optional metadata for the document

         This function indexes all the input text strings in the Vectara corpus as a
-        single Vectara document, where each input text is considered a "part" and the
-        metadata are associated with each part.
+        single Vectara document, where each input text is considered a "section" and the
+        metadata are associated with each section.
         if 'doc_metadata' is provided, it is associated with the Vectara document.

         Returns:
-            List of ids from adding the texts into the vectorstore.
+            document ID of the document added

         """
         doc_hash = md5()
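
The updated docstring describes the new contract: every text in the call is indexed into one Vectara document, each as a "section" with its own metadata, and doc_metadata attaches to the document as a whole. A hedged usage sketch of that contract (assumes VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY are set in the environment):

from langchain.vectorstores.vectara import Vectara

vectara = Vectara()  # credentials are read from the VECTARA_* env vars
doc_ids = vectara.add_texts(
    ["first section", "second section"],       # each text becomes one section
    metadatas=[{"part": "a"}, {"part": "b"}],  # section-level metadata
    doc_metadata={"source": "example"},        # attached to the single document
)
print(doc_ids)  # ID of the one Vectara document created (the tests below concatenate these)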
@@ -307,21 +307,27 @@ class Vectara(VectorStore):
         result = response.json()
         responses = result["responseSet"][0]["response"]
-        vectara_default_metadata = ["lang", "len", "offset"]
+        documents = result["responseSet"][0]["document"]
+
+        metadatas = []
+        for x in responses:
+            md = { m["name"]: m["value"] for m in x["metadata"] }
+            doc_num = x['documentIndex']
+            doc_md = { m["name"]: m["value"] for m in documents[doc_num]['metadata'] }
+            md.update(doc_md)
+            metadatas.append(md)

         docs = [
             (
                 Document(
                     page_content=x["text"],
-                    metadata={
-                        m["name"]: m["value"]
-                        for m in x["metadata"]
-                        if m["name"] not in vectara_default_metadata
-                    },
+                    metadata=md,
                 ),
                 x["score"],
             )
-            for x in responses
+            for x,md in zip(responses,metadatas)
         ]
         return docs

     def similarity_search(
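
Note that dropping the vectara_default_metadata filter means Vectara-supplied part fields (lang, len, offset) now come back alongside user metadata. An illustrative shape of a returned result under the new code (the values here are made up):

from langchain.docstore.document import Document

hit = Document(
    page_content="large language model",
    metadata={
        "lang": "eng",     # Vectara part-level defaults, no longer filtered out
        "offset": "0",
        "len": "20",
        "abbr": "llm",     # user-supplied part-level metadata
        "test_num": "2",   # doc-level metadata merged in
    },
)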

View File

@@ -5,12 +5,14 @@ from langchain.docstore.document import Document
 from langchain.vectorstores.vectara import Vectara
 from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

-# For this test to run properly, please setup as follows
-# 1. Create a corpus in Vectara, with a filter attribute called "test_num".
-# 2. Create an API_KEY for this corpus with permissions for query and indexing
-# 3. Setup environment variables:
+#
+# For this test to run properly, please setup as follows:
+# 1. Create a Vectara account: sign up at https://console.vectara.com/signup
+# 2. Create a corpus in your Vectara account, with a filter attribute called "test_num".
+# 3. Create an API_KEY for this corpus with permissions for query and indexing
+# 4. Setup environment variables:
 # VECTARA_API_KEY, VECTARA_CORPUS_ID and VECTARA_CUSTOMER_ID
+#

 def get_abbr(s: str) -> str:
     words = s.split(" ")  # Split the string into words
@@ -21,38 +23,52 @@ def get_abbr(s: str) -> str:
 def test_vectara_add_documents() -> None:
     """Test end to end construction and search."""

-    # start with some initial texts
-    texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
-    docsearch: Vectara = Vectara.from_texts(
-        texts,
-        embedding=FakeEmbeddings(),
-        metadatas=[
-            {"abbr": "gg", "test_num": "1"},
-            {"abbr": "rag", "test_num": "1"},
-            {"abbr": "dp", "test_num": "1"},
-        ],
+    # create a new Vectara instance
+    docsearch: Vectara = Vectara()
+
+    # start with some initial texts, added with add_texts
+    texts1 = ["grounded generation", "retrieval augmented generation", "data privacy"]
+    md = [{"abbr": get_abbr(t)} for t in texts1]
+    doc_id1 = docsearch.add_texts(
+        texts1,
+        metadatas=md,
+        doc_metadata={"test_num": "1"},
     )

-    # then add some additional documents
-    new_texts = ["large language model", "information retrieval", "question answering"]
-    docsearch.add_documents(
-        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts],
-        doc_metadata={"test_num": "1"},
+    # then add some additional documents, now with add_documents
+    texts2 = ["large language model", "information retrieval", "question answering"]
+    doc_id2 = docsearch.add_documents(
+        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in texts2],
+        doc_metadata={"test_num": "2"},
     )
+    doc_ids = doc_id1 + doc_id2

-    # finally do a similarity search to see if all works okay
-    output = docsearch.similarity_search(
+    # test without filter
+    output1 = docsearch.similarity_search(
         "large language model",
         k=2,
         n_sentence_context=0,
+    )
+    assert len(output1) == 2
+    assert output1[0].page_content == "large language model"
+    assert output1[0].metadata['abbr'] == "llm"
+    assert output1[1].page_content == "information retrieval"
+    assert output1[1].metadata['abbr'] == "ir"
+
+    # test with metadata filter (doc level)
+    # since the query does not match test_num=1 directly we get RAG as the matching result
+    output2 = docsearch.similarity_search(
+        "large language model",
+        k=1,
+        n_sentence_context=0,
         filter="doc.test_num = 1",
     )
-    assert output[0].page_content == "large language model"
-    assert output[0].metadata == {"abbr": "llm"}
-    assert output[1].page_content == "information retrieval"
-    assert output[1].metadata == {"abbr": "ir"}
+    assert len(output2) == 1
+    assert output2[0].page_content == "retrieval augmented generation"
+    assert output2[0].metadata['abbr'] == "rag"
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)

 def test_vectara_from_files() -> None:
     """Test end to end construction and search."""
@@ -73,8 +89,9 @@ def test_vectara_from_files() -> None:
         urllib.request.urlretrieve(url, name)
         files_list.append(name)

-    docsearch: Vectara = Vectara.from_files(
-        files=files_list,
+    docsearch: Vectara = Vectara()
+    doc_ids = docsearch.add_files(
+        files_list=files_list,
         embedding=FakeEmbeddings(),
         metadatas=[{"url": url, "test_num": "2"} for url in urls],
     )
@@ -101,7 +118,6 @@ def test_vectara_from_files() -> None:
         n_sentence_context=1,
         filter="doc.test_num = 2",
     )
-    print(output[0].page_content)
     assert output[0].page_content == (
         """\
 Note the use of hybrid in 3) above is different from that used sometimes in the literature, \
@@ -114,3 +130,6 @@ This classification scheme, however, misses a key insight gained in deep learning
 models can greatly improve the training of DNNs and other deep discriminative models via better regularization.\
 """  # noqa: E501
     )
+
+    for doc_id in doc_ids:
+        docsearch._delete_doc(doc_id)
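
Taken together, a minimal end-to-end sketch mirroring the updated tests (assumes the corpus defines test_num as a filter attribute and that the VECTARA_* env vars are set; values are placeholders):

from langchain.vectorstores.vectara import Vectara

vectara = Vectara()  # reads VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID, VECTARA_API_KEY

vectara.add_texts(
    ["grounded generation", "data privacy"],
    metadatas=[{"abbr": "gg"}, {"abbr": "dp"}],
    doc_metadata={"test_num": "1"},
)

results = vectara.similarity_search(
    "privacy",
    k=1,
    n_sentence_context=0,
    filter="doc.test_num = 1",  # doc-level fields are filterable as doc.<name>
)
for r in results:
    print(r.page_content, r.metadata)  # metadata now includes doc-level test_num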