From 708787dddb2fa3cdb2d1dabefa00c01ffec572f6 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Thu, 27 Apr 2023 14:33:59 -0400
Subject: [PATCH] Blob: Add validator and use future annotations (#3650)

Minor changes to the Blob schema.

---------

Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
---
Review note: check_blob_is_valid now also rejects an explicit None for both
data and path (previously only missing keys were rejected), and a regression
test covers that case. The index hashes below are those of the original commit.

 .../document_loaders/blob_loaders/schema.py     | 17 +++++++++++++----
 .../document_loader/blob_loaders/test_schema.py | 12 ++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/langchain/document_loaders/blob_loaders/schema.py b/langchain/document_loaders/blob_loaders/schema.py
index f4a9f6be..6ea20fdb 100644
--- a/langchain/document_loaders/blob_loaders/schema.py
+++ b/langchain/document_loaders/blob_loaders/schema.py
@@ -4,14 +4,16 @@ The goal is to facilitate decoupling of content loading from content parsing cod
 
 In addition, content loading code should provide a lazy loading interface by default.
 """
+from __future__ import annotations
+
 import contextlib
 import mimetypes
 from abc import ABC, abstractmethod
 from io import BufferedReader, BytesIO
 from pathlib import PurePath
-from typing import Generator, Iterable, Optional, Union
+from typing import Any, Generator, Iterable, Mapping, Optional, Union
 
-from pydantic import BaseModel
+from pydantic import BaseModel, root_validator
 
 PathLike = Union[str, PurePath]
 
@@ -44,6 +46,13 @@ class Blob(BaseModel):
         """The source location of the blob as string if known otherwise none."""
         return str(self.path) if self.path else None
 
+    @root_validator(pre=True)
+    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
+        """Verify that data or path is provided; an explicit None does not count."""
+        if values.get("data") is None and values.get("path") is None:
+            raise ValueError("Either data or path must be provided")
+        return values
+
     def as_string(self) -> str:
         """Read data as a string."""
         if self.data is None and self.path:
@@ -87,7 +96,7 @@ class Blob(BaseModel):
         encoding: str = "utf-8",
         mime_type: Optional[str] = None,
         guess_type: bool = True,
-    ) -> "Blob":
+    ) -> Blob:
         """Load the blob from a path like object.
 
         Args:
@@ -116,7 +125,7 @@ class Blob(BaseModel):
         encoding: str = "utf-8",
         mime_type: Optional[str] = None,
         path: Optional[str] = None,
-    ) -> "Blob":
+    ) -> Blob:
         """Initialize the blob from in-memory data.
 
         Args:
diff --git a/tests/unit_tests/document_loader/blob_loaders/test_schema.py b/tests/unit_tests/document_loader/blob_loaders/test_schema.py
index 4791d976..fa4a3dca 100644
--- a/tests/unit_tests/document_loader/blob_loaders/test_schema.py
+++ b/tests/unit_tests/document_loader/blob_loaders/test_schema.py
@@ -90,6 +90,18 @@ def test_mime_type_inference(
     assert blob.mimetype == expected_mime_type
 
 
+def test_blob_initialization_validator() -> None:
+    """Test that blob initialization validates the arguments."""
+    with pytest.raises(ValueError, match="Either data or path must be provided"):
+        Blob()
+
+    with pytest.raises(ValueError, match="Either data or path must be provided"):
+        Blob(data=None, path=None)
+
+    assert Blob(data=b"Hello, World!") is not None
+    assert Blob(path="some_path") is not None
+
+
 def test_blob_loader() -> None:
     """Simple test that verifies that we can implement a blob loader."""
 