From 315b0c09c614fa44daa61529d1f1da2fe827b16c Mon Sep 17 00:00:00 2001
From: Samantha Whitmore <whitmore.samantha@gmail.com>
Date: Sun, 20 Nov 2022 16:23:58 -0800
Subject: [PATCH] wip: add method for both docstore and embeddings (#119)

This will break at the moment, but I wanted to get thoughts on the implementation.

1. Should add() be on the docstore interface?
2. Should InMemoryDocstore change to take a list of documents at init?
(This makes it slightly easier to implement in FAISS. If we think it is
less clean, we could instead expose a method to get the number of documents
currently in the dict, and perform the logic of creating the necessary
dictionary in the FAISS.add_texts method.)

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
---
 langchain/docstore/base.py                    | 10 +++-
 langchain/docstore/in_memory.py               | 11 ++++-
 langchain/vectorstores/base.py                |  6 ++-
 .../vectorstores/elastic_vector_search.py     | 24 ++++++++-
 langchain/vectorstores/faiss.py               | 49 ++++++++++++++++---
 .../vectorstores/test_faiss.py                | 26 ++++++++--
 tests/unit_tests/docstore/test_inmemory.py    | 35 +++++++++++++
 7 files changed, 146 insertions(+), 15 deletions(-)

diff --git a/langchain/docstore/base.py b/langchain/docstore/base.py
index 2849dd09..4a91680c 100644
--- a/langchain/docstore/base.py
+++ b/langchain/docstore/base.py
@@ -1,6 +1,6 @@
 """Interface to access to place that stores documents."""
 from abc import ABC, abstractmethod
-from typing import Union
+from typing import Dict, Union
 
 from langchain.docstore.document import Document
 
@@ -15,3 +15,11 @@ class Docstore(ABC):
         If page exists, return the page summary, and a Document object.
         If page does not exist, return similar entries.
         """
+
+
+class AddableMixin(ABC):
+    """Mixin class for docstores that support adding documents."""
+
+    @abstractmethod
+    def add(self, texts: Dict[str, Document]) -> None:
+        """Add documents, given as a mapping from id to Document."""
diff --git a/langchain/docstore/in_memory.py b/langchain/docstore/in_memory.py
index 5023d5ff..f1e36102 100644
--- a/langchain/docstore/in_memory.py
+++ b/langchain/docstore/in_memory.py
@@ -1,17 +1,24 @@
 """Simple in memory docstore in the form of a dict."""
 from typing import Dict, Union
 
-from langchain.docstore.base import Docstore
+from langchain.docstore.base import AddableMixin, Docstore
 from langchain.docstore.document import Document
 
 
-class InMemoryDocstore(Docstore):
+class InMemoryDocstore(Docstore, AddableMixin):
     """Simple in memory docstore in the form of a dict."""
 
     def __init__(self, _dict: Dict[str, Document]):
         """Initialize with dict."""
         self._dict = _dict
 
+    def add(self, texts: Dict[str, Document]) -> None:
+        """Add documents to the in-memory dict; raise ValueError if any id exists."""
+        overlapping = set(texts).intersection(self._dict)
+        if overlapping:
+            raise ValueError(f"Tried to add ids that already exist: {overlapping}")
+        self._dict = dict(self._dict, **texts)  # new dict, inputs untouched
+
     def search(self, search: str) -> Union[str, Document]:
         """Search via direct lookup."""
         if search not in self._dict:
diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py
index a7097893..8c9b171c 100644
--- a/langchain/vectorstores/base.py
+++ b/langchain/vectorstores/base.py
@@ -1,6 +1,6 @@
 """Interface for vector stores."""
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional
+from typing import Any, Iterable, List, Optional
 
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
@@ -9,6 +9,10 @@ from langchain.embeddings.base import Embeddings
 class VectorStore(ABC):
     """Interface for vector stores."""
 
+    @abstractmethod
+    def add_texts(self, texts: Iterable[str]) -> None:
+        """Run additional texts through the embeddings and add them to the store."""
+
     @abstractmethod
     def similarity_search(self, query: str, k: int = 4) -> List[Document]:
         """Return docs most similar to query."""
diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py
index b186cfff..91946364 100644
--- a/langchain/vectorstores/elastic_vector_search.py
+++ b/langchain/vectorstores/elastic_vector_search.py
@@ -1,6 +1,6 @@
 """Wrapper around Elasticsearch vector database."""
 import uuid
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional
 
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
@@ -65,6 +65,28 @@ class ElasticVectorSearch(VectorStore):
             )
         self.client = es_client
 
+    def add_texts(self, texts: Iterable[str]) -> None:
+        """Embed each text and bulk-index it into the Elasticsearch index."""
+        try:
+            from elasticsearch.helpers import bulk
+        except ImportError:
+            raise ValueError(
+                "Could not import elasticsearch python package. "
+                "Please install it with `pip install elasticsearch`."
+            )
+        requests = []
+        for text in texts:
+            request = {
+                "_op_type": "index",
+                "_index": self.index_name,
+                "vector": self.embedding_function(text),
+                "text": text,
+            }
+            requests.append(request)
+        bulk(self.client, requests)
+        # Refresh so the new docs are searchable right away. TODO: make optional.
+        self.client.indices.refresh(index=self.index_name)
+
     def similarity_search(self, query: str, k: int = 4) -> List[Document]:
         """Return docs most similar to query.
 
diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py
index 8ae2e3f0..2b3e4d61 100644
--- a/langchain/vectorstores/faiss.py
+++ b/langchain/vectorstores/faiss.py
@@ -1,9 +1,10 @@
 """Wrapper around FAISS vector database."""
-from typing import Any, Callable, List, Optional
+import uuid
+from typing import Any, Callable, Dict, Iterable, List, Optional
 
 import numpy as np
 
-from langchain.docstore.base import Docstore
+from langchain.docstore.base import AddableMixin, Docstore
 from langchain.docstore.document import Document
 from langchain.docstore.in_memory import InMemoryDocstore
 from langchain.embeddings.base import Embeddings
@@ -23,11 +24,41 @@ class FAISS(VectorStore):
 
     """
 
-    def __init__(self, embedding_function: Callable, index: Any, docstore: Docstore):
+    def __init__(
+        self,
+        embedding_function: Callable,
+        index: Any,
+        docstore: Docstore,
+        index_to_docstore_id: Dict[int, str],  # index position -> docstore id
+    ):
         """Initialize with necessary components."""
         self.embedding_function = embedding_function
         self.index = index
         self.docstore = docstore
+        self.index_to_docstore_id = index_to_docstore_id
+
+    def add_texts(self, texts: Iterable[str]) -> None:
+        """Embed texts and add them to the index, id mapping, and docstore."""
+        if not isinstance(self.docstore, AddableMixin):
+            raise ValueError(
+                "If trying to add texts, the underlying docstore should support "
+                f"adding items, which {self.docstore} does not"
+            )
+        # Materialize once: a one-shot iterator would be exhausted after embedding.
+        texts = list(texts)
+        embeddings = [self.embedding_function(text) for text in texts]
+        documents = [Document(page_content=text) for text in texts]
+        # New vectors are numbered after the ones already mapped.
+        starting_len = len(self.index_to_docstore_id)
+        self.index.add(np.array(embeddings, dtype=np.float32))
+        # One (index position, fresh uuid, document) triple per added text.
+        full_info = [
+            (starting_len + i, str(uuid.uuid4()), doc)
+            for i, doc in enumerate(documents)
+        ]
+        # Store the documents under their ids, then record position -> id.
+        self.docstore.add({_id: doc for _, _id, doc in full_info})
+        self.index_to_docstore_id.update({index: _id for index, _id, _ in full_info})
 
     def similarity_search(self, query: str, k: int = 4) -> List[Document]:
         """Return docs most similar to query.
@@ -46,9 +77,10 @@ class FAISS(VectorStore):
             if i == -1:
                 # This happens when not enough docs are returned.
                 continue
-            doc = self.docstore.search(str(i))
+            _id = self.index_to_docstore_id[i]
+            doc = self.docstore.search(_id)
             if not isinstance(doc, Document):
-                raise ValueError(f"Could not find document for id {i}, got {doc}")
+                raise ValueError(f"Could not find document for id {_id}, got {doc}")
             docs.append(doc)
         return docs
 
@@ -92,5 +124,8 @@ class FAISS(VectorStore):
         for i, text in enumerate(texts):
             metadata = metadatas[i] if metadatas else {}
             documents.append(Document(page_content=text, metadata=metadata))
-        docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
-        return cls(embedding.embed_query, index, docstore)
+        index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
+        docstore = InMemoryDocstore(
+            {index_to_id[i]: doc for i, doc in enumerate(documents)}
+        )
+        return cls(embedding.embed_query, index, docstore, index_to_id)
diff --git a/tests/integration_tests/vectorstores/test_faiss.py b/tests/integration_tests/vectorstores/test_faiss.py
index 2b3cbd1d..c3d2ba57 100644
--- a/tests/integration_tests/vectorstores/test_faiss.py
+++ b/tests/integration_tests/vectorstores/test_faiss.py
@@ -5,6 +5,7 @@ import pytest
 
 from langchain.docstore.document import Document
 from langchain.docstore.in_memory import InMemoryDocstore
+from langchain.docstore.wikipedia import Wikipedia
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.faiss import FAISS
 
@@ -25,11 +26,12 @@ def test_faiss() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     docsearch = FAISS.from_texts(texts, FakeEmbeddings())
+    index_to_id = docsearch.index_to_docstore_id
     expected_docstore = InMemoryDocstore(
         {
-            "0": Document(page_content="foo"),
-            "1": Document(page_content="bar"),
-            "2": Document(page_content="baz"),
+            index_to_id[0]: Document(page_content="foo"),
+            index_to_id[1]: Document(page_content="bar"),
+            index_to_id[2]: Document(page_content="baz"),
         }
     )
     assert docsearch.docstore.__dict__ == expected_docstore.__dict__
@@ -62,3 +64,21 @@ def test_faiss_search_not_found() -> None:
     docsearch.docstore = InMemoryDocstore({})
     with pytest.raises(ValueError):
         docsearch.similarity_search("foo")
+
+
+def test_faiss_add_texts() -> None:
+    """Test that texts added after construction are searchable."""
+    # Build the initial store from three texts.
+    initial_texts = ["foo", "bar", "baz"]
+    store = FAISS.from_texts(initial_texts, FakeEmbeddings())
+    # Add a duplicate of an existing text; both copies should come back.
+    store.add_texts(["foo"])
+    found = store.similarity_search("foo", k=2)
+    assert found == [Document(page_content="foo"), Document(page_content="foo")]
+
+
+def test_faiss_add_texts_not_supported() -> None:
+    """Test that add_texts raises when the docstore cannot accept new items."""
+    store = FAISS(FakeEmbeddings().embed_query, None, Wikipedia(), {})
+    with pytest.raises(ValueError):
+        store.add_texts(["foo"])
diff --git a/tests/unit_tests/docstore/test_inmemory.py b/tests/unit_tests/docstore/test_inmemory.py
index 284f9224..4fe9104c 100644
--- a/tests/unit_tests/docstore/test_inmemory.py
+++ b/tests/unit_tests/docstore/test_inmemory.py
@@ -1,4 +1,5 @@
 """Test in memory docstore."""
+import pytest
 
 from langchain.docstore.document import Document
 from langchain.docstore.in_memory import InMemoryDocstore
@@ -19,3 +20,37 @@ def test_document_not_found() -> None:
     docstore = InMemoryDocstore(_dict)
     output = docstore.search("bar")
     assert output == "ID bar not found."
+
+
+def test_adding_document() -> None:
+    """Test that added documents are searchable alongside existing ones."""
+    initial = {"foo": Document(page_content="bar")}
+    docstore = InMemoryDocstore(initial)
+    extra = {"bar": Document(page_content="foo")}
+    docstore.add(extra)
+
+    # The newly added document is retrievable under its id.
+    added = docstore.search("bar")
+    assert isinstance(added, Document)
+    assert added.page_content == "foo"
+
+    # The pre-existing document is unchanged.
+    existing = docstore.search("foo")
+    assert isinstance(existing, Document)
+    assert existing.page_content == "bar"
+
+
+def test_adding_document_already_exists() -> None:
+    """Test that adding an id that is already present raises ValueError."""
+    initial = {"foo": Document(page_content="bar")}
+    docstore = InMemoryDocstore(initial)
+    duplicate = {"foo": Document(page_content="foo")}
+
+    # Adding a clashing id must fail.
+    with pytest.raises(ValueError):
+        docstore.add(duplicate)
+
+    # The original document must survive the failed add.
+    existing = docstore.search("foo")
+    assert isinstance(existing, Document)
+    assert existing.page_content == "bar"