Make document serializable, create utility to create a docstore (#9674)

This PR makes the following changes:

1. Documents become serializable using langchain serialization
2. Add a utility to create a docstore key-value (kv) store

Will help to address issue here:
https://github.com/langchain-ai/langchain/issues/9345
pull/9789/head
Eugene Yurtsev 1 year ago committed by GitHub
parent a28e888b36
commit 588237ef30
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -17,6 +17,11 @@ class Document(Serializable):
documents, etc.).
"""
@property
def lc_serializable(self) -> bool:
    """Flag this class as serializable.

    Returns:
        Always ``True``.
    """
    return True
class BaseDocumentTransformer(ABC):
"""Abstract base class for document transformation systems.

@ -6,6 +6,7 @@ to a simple key-value interface.
The primary goal of these storages is to support implementation of caching.
"""
from langchain.storage._lc_store import create_kv_docstore, create_lc_store
from langchain.storage.encoder_backed import EncoderBackedStore
from langchain.storage.file_system import LocalFileStore
from langchain.storage.in_memory import InMemoryStore
@ -16,4 +17,6 @@ __all__ = [
"InMemoryStore",
"LocalFileStore",
"RedisStore",
"create_lc_store",
"create_kv_docstore",
]

@ -0,0 +1,88 @@
"""Create a key-value store for any langchain serializable object."""
from typing import Callable, Optional
from langchain.load.dump import dumps
from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import BaseStore, Document
from langchain.storage.encoder_backed import EncoderBackedStore
def _dump_as_bytes(obj: Serializable) -> bytes:
    """Serialize a langchain serializable object to UTF-8 encoded bytes.

    Args:
        obj: The object to serialize with ``dumps``.

    Returns:
        The UTF-8 encoding of the string produced by ``dumps``.
    """
    as_text = dumps(obj)
    return as_text.encode("utf-8")
def _dump_document_as_bytes(obj: Document) -> bytes:
    """Serialize a Document to UTF-8 encoded bytes, rejecting other types.

    Args:
        obj: The Document to serialize with ``dumps``.

    Returns:
        The UTF-8 encoding of the string produced by ``dumps``.

    Raises:
        TypeError: If ``obj`` is not a Document instance.
    """
    if isinstance(obj, Document):
        return dumps(obj).encode("utf-8")
    # The annotation is not enforced at run time, so check explicitly.
    raise TypeError("Expected a Document instance")
def _load_document_from_bytes(serialized: bytes) -> Document:
    """Deserialize UTF-8 encoded bytes and require the result to be a Document.

    Args:
        serialized: UTF-8 encoded bytes to decode and pass to ``loads``.

    Returns:
        The Document produced by ``loads``.

    Raises:
        TypeError: If the loaded object is not a Document instance.
    """
    text = serialized.decode("utf-8")
    obj = loads(text)
    if isinstance(obj, Document):
        return obj
    raise TypeError(f"Expected a Document instance. Got {type(obj)}")
def _load_from_bytes(serialized: bytes) -> Serializable:
    """Deserialize UTF-8 encoded bytes into a langchain serializable object.

    Args:
        serialized: UTF-8 encoded bytes to decode and pass to ``loads``.

    Returns:
        Whatever object ``loads`` produces; its type is not checked here.
    """
    text = serialized.decode("utf-8")
    return loads(text)
def _identity(x: str) -> str:
"""Return the same object."""
return x
# PUBLIC API
def create_lc_store(
    store: BaseStore[str, bytes],
    *,
    key_encoder: Optional[Callable[[str], str]] = None,
) -> BaseStore[str, Serializable]:
    """Wrap a bytes store so it stores langchain serializable objects.

    Values are converted to bytes on write and converted back on read;
    the type of a loaded value is not checked.

    Args:
        store: A bytes store to use as the underlying store.
        key_encoder: A function applied to keys; when not provided, keys
            are used as given.

    Returns:
        A key-value store for langchain serializable objects.
    """
    encode_key = key_encoder if key_encoder else _identity
    return EncoderBackedStore(store, encode_key, _dump_as_bytes, _load_from_bytes)
def create_kv_docstore(
    store: BaseStore[str, bytes],
    *,
    key_encoder: Optional[Callable[[str], str]] = None,
) -> BaseStore[str, Document]:
    """Wrap a bytes store so it stores langchain Document objects.

    Both the write path and the read path check at run time that the value
    is a Document and raise ``TypeError`` otherwise.

    Args:
        store: A bytes store to use as the underlying store.
        key_encoder: A function applied to keys; when not provided, keys
            are used as given.

    Returns:
        A key-value store for documents.
    """
    encode_key = key_encoder if key_encoder else _identity
    return EncoderBackedStore(
        store,
        encode_key,
        _dump_document_as_bytes,
        _load_document_from_bytes,
    )

@ -0,0 +1,36 @@
import tempfile
from typing import Generator, cast
import pytest
from langchain.schema import Document
from langchain.storage._lc_store import create_kv_docstore, create_lc_store
from langchain.storage.file_system import LocalFileStore
@pytest.fixture
def file_store() -> Generator[LocalFileStore, None, None]:
    """Yield a LocalFileStore rooted in a fresh temporary directory.

    The temporary directory is cleaned up when the ``with`` block exits,
    i.e. once the test using this fixture has finished.
    """
    with tempfile.TemporaryDirectory() as root_dir:
        yield LocalFileStore(root_dir)
def test_create_lc_store(file_store: LocalFileStore) -> None:
    """Round-trip a Document through a store built by ``create_lc_store``."""
    lc_store = create_lc_store(file_store)
    original = Document(page_content="hello", metadata={"key": "value"})
    lc_store.mset([("key1", original)])

    results = lc_store.mget(["key1"])
    # The store is typed as holding Serializable values, so narrow for mypy.
    fetched_doc = cast(Document, results[0])

    assert fetched_doc.page_content == "hello"
    assert fetched_doc.metadata == {"key": "value"}
def test_create_kv_store(file_store: LocalFileStore) -> None:
    """Round-trip a Document through a store built by ``create_kv_docstore``."""
    kv_docstore = create_kv_docstore(file_store)
    original = Document(page_content="hello", metadata={"key": "value"})
    kv_docstore.mset([("key1", original)])

    results = kv_docstore.mget(["key1"])
    fetched_doc = results[0]

    assert isinstance(fetched_doc, Document)
    assert fetched_doc.page_content == "hello"
    assert fetched_doc.metadata == {"key": "value"}
Loading…
Cancel
Save