From 588237ef30ca665582005a346de2770db46b518a Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 30 Aug 2023 09:45:04 -0400 Subject: [PATCH] Make document serializable, create utility to create a docstore (#9674) This PR makes the following changes: 1. Documents become serializable using langchain serialization 2. Make a utility to create a docstore kv store Will help to address issue here: https://github.com/langchain-ai/langchain/issues/9345 --- libs/langchain/langchain/schema/document.py | 5 ++ libs/langchain/langchain/storage/__init__.py | 3 + libs/langchain/langchain/storage/_lc_store.py | 88 +++++++++++++++++++ .../tests/unit_tests/storage/test_lc_store.py | 36 ++++++++ 4 files changed, 132 insertions(+) create mode 100644 libs/langchain/langchain/storage/_lc_store.py create mode 100644 libs/langchain/tests/unit_tests/storage/test_lc_store.py diff --git a/libs/langchain/langchain/schema/document.py b/libs/langchain/langchain/schema/document.py index feaa1acac0..ccee67ca2a 100644 --- a/libs/langchain/langchain/schema/document.py +++ b/libs/langchain/langchain/schema/document.py @@ -17,6 +17,11 @@ class Document(Serializable): documents, etc.). """ + @property + def lc_serializable(self) -> bool: + """Return whether or not the class is serializable.""" + return True + class BaseDocumentTransformer(ABC): """Abstract base class for document transformation systems. diff --git a/libs/langchain/langchain/storage/__init__.py b/libs/langchain/langchain/storage/__init__.py index ecc2e817f2..49a721b59a 100644 --- a/libs/langchain/langchain/storage/__init__.py +++ b/libs/langchain/langchain/storage/__init__.py @@ -6,6 +6,7 @@ to a simple key-value interface. The primary goal of these storages is to support implementation of caching. 
""" +from langchain.storage._lc_store import create_kv_docstore, create_lc_store from langchain.storage.encoder_backed import EncoderBackedStore from langchain.storage.file_system import LocalFileStore from langchain.storage.in_memory import InMemoryStore @@ -16,4 +17,6 @@ __all__ = [ "InMemoryStore", "LocalFileStore", "RedisStore", + "create_lc_store", + "create_kv_docstore", ] diff --git a/libs/langchain/langchain/storage/_lc_store.py b/libs/langchain/langchain/storage/_lc_store.py new file mode 100644 index 0000000000..be528e7748 --- /dev/null +++ b/libs/langchain/langchain/storage/_lc_store.py @@ -0,0 +1,88 @@ +"""Create a key-value store for any langchain serializable object.""" +from typing import Callable, Optional + +from langchain.load.dump import dumps +from langchain.load.load import loads +from langchain.load.serializable import Serializable +from langchain.schema import BaseStore, Document +from langchain.storage.encoder_backed import EncoderBackedStore + + +def _dump_as_bytes(obj: Serializable) -> bytes: + """Return a bytes representation of a document.""" + return dumps(obj).encode("utf-8") + + +def _dump_document_as_bytes(obj: Document) -> bytes: + """Return a bytes representation of a document.""" + if not isinstance(obj, Document): + raise TypeError("Expected a Document instance") + return dumps(obj).encode("utf-8") + + +def _load_document_from_bytes(serialized: bytes) -> Document: + """Return a document from a bytes representation.""" + obj = loads(serialized.decode("utf-8")) + if not isinstance(obj, Document): + raise TypeError(f"Expected a Document instance. 
Got {type(obj)}") + return obj + + +def _load_from_bytes(serialized: bytes) -> Serializable: + """Return a serializable object from a bytes representation.""" + return loads(serialized.decode("utf-8")) + + +def _identity(x: str) -> str: + """Return the same object.""" + return x + + +# PUBLIC API + + +def create_lc_store( + store: BaseStore[str, bytes], + *, + key_encoder: Optional[Callable[[str], str]] = None, +) -> BaseStore[str, Serializable]: + """Create a store for langchain serializable objects from a bytes store. + + Args: + store: A bytes store to use as the underlying store. + key_encoder: A function to encode keys; if None uses identity function. + + Returns: + A key-value store for serializable objects. + """ + return EncoderBackedStore( + store, + key_encoder or _identity, + _dump_as_bytes, + _load_from_bytes, + ) + + +def create_kv_docstore( + store: BaseStore[str, bytes], + *, + key_encoder: Optional[Callable[[str], str]] = None, +) -> BaseStore[str, Document]: + """Create a store for langchain Document objects from a bytes store. + + This store does run time type checking to ensure that the values are + Document objects. + + Args: + store: A bytes store to use as the underlying store. + key_encoder: A function to encode keys; if None uses identity function. + + Returns: + A key-value store for documents. 
+ """ + return EncoderBackedStore( + store, + key_encoder or _identity, + _dump_document_as_bytes, + _load_document_from_bytes, + ) diff --git a/libs/langchain/tests/unit_tests/storage/test_lc_store.py b/libs/langchain/tests/unit_tests/storage/test_lc_store.py new file mode 100644 index 0000000000..5d15683ac3 --- /dev/null +++ b/libs/langchain/tests/unit_tests/storage/test_lc_store.py @@ -0,0 +1,36 @@ +import tempfile +from typing import Generator, cast + +import pytest + +from langchain.schema import Document +from langchain.storage._lc_store import create_kv_docstore, create_lc_store +from langchain.storage.file_system import LocalFileStore + + +@pytest.fixture +def file_store() -> Generator[LocalFileStore, None, None]: + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as temp_dir: + # Instantiate the LocalFileStore with the temporary directory as the root path + store = LocalFileStore(temp_dir) + yield store + + +def test_create_lc_store(file_store: LocalFileStore) -> None: + """Test that a docstore is created from a base store.""" + docstore = create_lc_store(file_store) + docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))]) + fetched_doc = cast(Document, docstore.mget(["key1"])[0]) + assert fetched_doc.page_content == "hello" + assert fetched_doc.metadata == {"key": "value"} + + +def test_create_kv_store(file_store: LocalFileStore) -> None: + """Test that a docstore is created from a base store.""" + docstore = create_kv_docstore(file_store) + docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))]) + fetched_doc = docstore.mget(["key1"])[0] + assert isinstance(fetched_doc, Document) + assert fetched_doc.page_content == "hello" + assert fetched_doc.metadata == {"key": "value"}