From 5d020107632337b796857b063d4863799d585d17 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev <eyurtsev@gmail.com>
Date: Thu, 27 Apr 2023 09:45:25 -0400
Subject: [PATCH] Introduce Blob and Blob Loader interface (#3603)

This PR introduces a Blob data type and a Blob loader interface.

This is the first of a sequence of PRs that follows this proposal:

https://github.com/hwchase17/langchain/pull/2833

The primary goals of these abstractions are:

* Decouple content loading from content parsing code.
* Help remove duplicated content loading code from document loaders.
* Make lazy loading a default for langchain.
---
 .../document_loaders/blob_loaders/__init__.py |   3 +
 .../document_loaders/blob_loaders/schema.py   | 156 ++++++++++++++++++
 .../document_loader/blob_loaders/__init__.py  |   0
 .../blob_loaders/test_public_api.py           |   6 +
 .../blob_loaders/test_schema.py               | 101 ++++++++++++
 5 files changed, 266 insertions(+)
 create mode 100644 langchain/document_loaders/blob_loaders/__init__.py
 create mode 100644 langchain/document_loaders/blob_loaders/schema.py
 create mode 100644 tests/unit_tests/document_loader/blob_loaders/__init__.py
 create mode 100644 tests/unit_tests/document_loader/blob_loaders/test_public_api.py
 create mode 100644 tests/unit_tests/document_loader/blob_loaders/test_schema.py

diff --git a/langchain/document_loaders/blob_loaders/__init__.py b/langchain/document_loaders/blob_loaders/__init__.py
new file mode 100644
index 00000000..6af2e7b3
--- /dev/null
+++ b/langchain/document_loaders/blob_loaders/__init__.py
@@ -0,0 +1,3 @@
+from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
+
+__all__ = ["BlobLoader", "Blob"]
diff --git a/langchain/document_loaders/blob_loaders/schema.py b/langchain/document_loaders/blob_loaders/schema.py
new file mode 100644
index 00000000..f4a9f6be
--- /dev/null
+++ b/langchain/document_loaders/blob_loaders/schema.py
@@ -0,0 +1,156 @@
+"""Schema for Blobs and Blob Loaders.
+
+The goal is to facilitate decoupling of content loading from content parsing code.
+
+In addition, content loading code should provide a lazy loading interface by default.
+"""
+import contextlib
+import mimetypes
+from abc import ABC, abstractmethod
+from io import BufferedReader, BytesIO
+from pathlib import PurePath
+from typing import Generator, Iterable, Optional, Union
+
+from pydantic import BaseModel
+
+PathLike = Union[str, PurePath]
+
+
+class Blob(BaseModel):
+    """A blob is used to represent raw data by either reference or value.
+
+    Provides an interface to materialize the blob in different representations, and
+    helps to decouple the development of data loaders from the downstream parsing
+    of the raw data.
+
+    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
+    """
+
+    data: Union[bytes, str, None]  # Raw data; None when only referenced by `path`
+    mimetype: Optional[str] = None  # Not to be confused with a file extension
+    encoding: str = "utf-8"  # Use utf-8 as default encoding, if decoding to string
+    # Location where the original content was found
+    # Represent location on the local file system
+    # Useful for situations where downstream code assumes it must work with file paths
+    # rather than in-memory content.
+    path: Optional[PathLike] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+        frozen = True  # Blobs are immutable after creation
+
+    @property
+    def source(self) -> Optional[str]:
+        """The source location of the blob as a string if known, otherwise None."""
+        return str(self.path) if self.path else None
+
+    def as_string(self) -> str:
+        """Read data as a string, decoding bytes or the file at `path` if needed."""
+        if self.data is None and self.path:
+            with open(str(self.path), "r", encoding=self.encoding) as f:
+                return f.read()
+        elif isinstance(self.data, bytes):
+            return self.data.decode(self.encoding)
+        elif isinstance(self.data, str):
+            return self.data
+        else:
+            raise ValueError(f"Unable to get string for blob {self}")
+
+    def as_bytes(self) -> bytes:
+        """Read data as bytes, encoding a string or reading `path` if needed."""
+        if isinstance(self.data, bytes):
+            return self.data
+        elif isinstance(self.data, str):
+            return self.data.encode(self.encoding)
+        elif self.data is None and self.path:
+            with open(str(self.path), "rb") as f:
+                return f.read()
+        else:
+            raise ValueError(f"Unable to get bytes for blob {self}")
+
+    @contextlib.contextmanager
+    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
+        """Read data as a byte stream; in-memory data is wrapped in a BytesIO."""
+        if isinstance(self.data, (bytes, str)):
+            yield BytesIO(self.as_bytes())
+        elif self.data is None and self.path:
+            with open(str(self.path), "rb") as f:
+                yield f
+        else:
+            raise NotImplementedError(f"Unable to convert blob {self}")
+
+    @classmethod
+    def from_path(
+        cls,
+        path: PathLike,
+        *,
+        encoding: str = "utf-8",
+        mime_type: Optional[str] = None,
+        guess_type: bool = True,
+    ) -> "Blob":
+        """Load the blob from a path like object.
+
+        Args:
+            path: path like object to file to be read
+            encoding: Encoding to use if decoding the bytes into a string
+            mime_type: if provided, will be set as the mime-type of the data
+            guess_type: If True, the mimetype will be guessed from the file extension,
+                        if a mime-type was not provided
+
+        Returns:
+            Blob instance
+        """
+        if mime_type is None and guess_type:
+            _mimetype = mimetypes.guess_type(path)[0]
+        else:
+            _mimetype = mime_type
+        # We do not load the data immediately, instead we treat the blob as a
+        # reference to the underlying data.
+        return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path)
+
+    @classmethod
+    def from_data(
+        cls,
+        data: Union[str, bytes],
+        *,
+        encoding: str = "utf-8",
+        mime_type: Optional[str] = None,
+        path: Optional[PathLike] = None,
+    ) -> "Blob":
+        """Initialize the blob from in-memory data.
+
+        Args:
+            data: the in-memory data associated with the blob
+            encoding: Encoding to use if decoding the bytes into a string
+            mime_type: if provided, will be set as the mime-type of the data
+            path: if provided, will be set as the source from which the data came
+
+        Returns:
+            Blob instance
+        """
+        return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
+
+    def __repr__(self) -> str:
+        """Define the blob representation."""
+        str_repr = f"Blob {id(self)}"
+        if self.source:
+            str_repr += f" {self.source}"
+        return str_repr
+
+
+class BlobLoader(ABC):
+    """Abstract interface for blob loader implementations.
+
+    Implementers should be able to load raw content from a storage system according
+    to some criteria and return the raw content lazily as a stream of blobs.
+    """
+
+    @abstractmethod
+    def yield_blobs(
+        self,
+    ) -> Iterable[Blob]:
+        """A lazy loader for raw data represented by LangChain's Blob object.
+
+        Returns:
+            An iterable over blobs
+        """
diff --git a/tests/unit_tests/document_loader/blob_loaders/__init__.py b/tests/unit_tests/document_loader/blob_loaders/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit_tests/document_loader/blob_loaders/test_public_api.py b/tests/unit_tests/document_loader/blob_loaders/test_public_api.py
new file mode 100644
index 00000000..a1eb79b7
--- /dev/null
+++ b/tests/unit_tests/document_loader/blob_loaders/test_public_api.py
@@ -0,0 +1,6 @@
+from langchain.document_loaders.blob_loaders import __all__
+
+
+def test_public_api() -> None:
+    """Pin the public API so that accidental changes to `__all__` are detected."""
+    assert sorted(__all__) == ["Blob", "BlobLoader"]
diff --git a/tests/unit_tests/document_loader/blob_loaders/test_schema.py b/tests/unit_tests/document_loader/blob_loaders/test_schema.py
new file mode 100644
index 00000000..4791d976
--- /dev/null
+++ b/tests/unit_tests/document_loader/blob_loaders/test_schema.py
@@ -0,0 +1,101 @@
+import os
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Generator, Iterable, Optional
+
+import pytest
+
+from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader, PathLike
+
+
+@contextmanager
+def get_temp_file(
+    content: bytes, suffix: Optional[str] = None
+) -> Generator[Path, None, None]:
+    """Yield the path of a temporary file with some content; removed on exit."""
+    with NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
+        temp_file.write(content)
+        path = Path(temp_file.name)
+    try:
+        yield path
+    finally:
+        os.remove(str(path))
+
+
+def test_blob_initialized_with_binary_data() -> None:
+    """Test reading a blob that was initialized with in-memory binary data."""
+    data = b"Hello, World!"
+    blob = Blob(data=data)
+    assert blob.as_string() == "Hello, World!"
+    assert blob.as_bytes() == data
+    assert blob.source is None
+    with blob.as_bytes_io() as bytes_io:
+        assert bytes_io.read() == data
+
+
+def test_blob_from_pure_path() -> None:
+    """Test reading blob from a file path given as a pathlib.Path."""
+    content = b"Hello, World!"
+
+    with get_temp_file(content, suffix=".html") as temp_path:
+        assert isinstance(temp_path, Path)
+        blob = Blob.from_path(temp_path)
+        assert blob.encoding == "utf-8"  # Default encoding
+        assert blob.path == temp_path
+        assert blob.mimetype == "text/html"
+        assert blob.source == str(temp_path)
+        assert blob.data is None
+        assert blob.as_bytes() == content
+        assert blob.as_string() == "Hello, World!"
+        with blob.as_bytes_io() as bytes_io:
+            assert bytes_io.read() == content
+
+
+def test_blob_from_str_path() -> None:
+    """Test reading blob from a file path given as a str."""
+    content = b"Hello, World!"
+
+    with get_temp_file(content) as temp_path:
+        str_path = str(temp_path)
+        assert isinstance(str_path, str)
+        blob = Blob.from_path(str_path)
+        assert blob.encoding == "utf-8"  # Default encoding
+        assert blob.path == str(temp_path)
+        assert blob.source == str(temp_path)
+        assert blob.data is None
+        assert blob.as_bytes() == content
+        assert blob.as_string() == "Hello, World!"
+        with blob.as_bytes_io() as bytes_io:
+            assert bytes_io.read() == content
+
+
+@pytest.mark.parametrize(
+    "path, mime_type, guess_type, expected_mime_type",
+    [
+        ("test.txt", None, True, "text/plain"),
+        ("test.txt", None, False, None),
+        ("test.html", None, True, "text/html"),
+        ("test.html", None, False, None),
+        ("test.html", "user_forced_value", True, "user_forced_value"),
+        (Path("test.html"), "user_forced_value", True, "user_forced_value"),
+        (Path("test.html"), None, True, "text/html"),
+    ],
+)
+def test_mime_type_inference(
+    path: PathLike, mime_type: str, guess_type: bool, expected_mime_type: Optional[str]
+) -> None:
+    """Test that an explicit mime_type wins, else guess_type controls inference."""
+    blob = Blob.from_path(path, mime_type=mime_type, guess_type=guess_type)
+    assert blob.mimetype == expected_mime_type
+
+
+def test_blob_loader() -> None:
+    """Verify that a BlobLoader subclass implementing yield_blobs can be used."""
+
+    class TestLoader(BlobLoader):
+        def yield_blobs(self) -> Iterable[Blob]:
+            """Yield a single in-memory blob."""
+            yield Blob(data=b"Hello, World!")
+
+    assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")]