From 7cce68a0513442a569881334bb2678c4eee3ba2a Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Mon, 1 May 2023 21:37:26 -0400 Subject: [PATCH] Add minimal file system blob loader (#3669) This adds a minimal file system blob loader. If it looks good, this PR will be merged and a few additional enhancements will be made. --- .../document_loaders/blob_loaders/__init__.py | 3 +- .../blob_loaders/file_system.py | 73 +++++++++++++ .../test_filesystem_blob_loader.py | 101 ++++++++++++++++++ .../blob_loaders/test_public_api.py | 2 +- 4 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 langchain/document_loaders/blob_loaders/file_system.py create mode 100644 tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py diff --git a/langchain/document_loaders/blob_loaders/__init__.py b/langchain/document_loaders/blob_loaders/__init__.py index 6af2e7b3..b023cdbc 100644 --- a/langchain/document_loaders/blob_loaders/__init__.py +++ b/langchain/document_loaders/blob_loaders/__init__.py @@ -1,3 +1,4 @@ +from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader -__all__ = ["BlobLoader", "Blob"] +__all__ = ["BlobLoader", "Blob", "FileSystemBlobLoader"] diff --git a/langchain/document_loaders/blob_loaders/file_system.py b/langchain/document_loaders/blob_loaders/file_system.py new file mode 100644 index 00000000..48c965aa --- /dev/null +++ b/langchain/document_loaders/blob_loaders/file_system.py @@ -0,0 +1,73 @@ +"""Use to load blobs from the local file system.""" +from pathlib import Path +from typing import Iterable, Optional, Sequence, Union + +from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader + +# PUBLIC API + + +class FileSystemBlobLoader(BlobLoader): + """Blob loader for the local file system. + + Example: + + .. 
code-block:: python + + from langchain.document_loaders.blob_loaders import FileSystemBlobLoader + loader = FileSystemBlobLoader("/path/to/directory") + for blob in loader.yield_blobs(): + print(blob) + """ + + def __init__( + self, + path: Union[str, Path], + *, + glob: str = "**/[!.]*", + suffixes: Optional[Sequence[str]] = None, + ) -> None: + """Initialize with path to directory and how to glob over it. + + Args: + path: Path to directory to load from + glob: Glob pattern relative to the specified path + by default set to pick up all non-hidden files + suffixes: Provide to keep only files with these suffixes + Useful when wanting to keep files with different suffixes + Suffixes must include the dot, e.g. ".txt" + + Examples: + + .. code-block:: python + + # Recursively load all text files in a directory. + loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") + + # Recursively load all non-hidden files in a directory. + loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*") + + # Load all files in a directory without recursion. 
+ loader = FileSystemBlobLoader("/path/to/directory", glob="*") + """ + if isinstance(path, Path): + _path = path + elif isinstance(path, str): + _path = Path(path) + else: + raise TypeError(f"Expected str or Path, got {type(path)}") + + self.path = _path + self.glob = glob + self.suffixes = set(suffixes or []) + + def yield_blobs( + self, + ) -> Iterable[Blob]: + """Yield blobs that match the requested pattern.""" + paths = self.path.glob(self.glob) + for path in paths: + if path.is_file(): + if self.suffixes and path.suffix not in self.suffixes: + continue + yield Blob.from_path(str(path)) diff --git a/tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py b/tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py new file mode 100644 index 00000000..37bcd472 --- /dev/null +++ b/tests/unit_tests/document_loader/blob_loaders/test_filesystem_blob_loader.py @@ -0,0 +1,101 @@ +"""Verify that file system blob loader works as expected.""" +import os +import tempfile +from pathlib import Path +from typing import Generator, Sequence + +import pytest + +from langchain.document_loaders.blob_loaders import FileSystemBlobLoader + + +@pytest.fixture +def toy_dir() -> Generator[Path, None, None]: + """Yield a pre-populated directory to test the blob loader.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create test.txt + with open(os.path.join(temp_dir, "test.txt"), "w") as test_txt: + test_txt.write("This is a test.txt file.") + + # Create test.html + with open(os.path.join(temp_dir, "test.html"), "w") as test_html: + test_html.write( + "

This is a test.html file.

" + ) + + # Create .hidden_file + with open(os.path.join(temp_dir, ".hidden_file"), "w") as hidden_file: + hidden_file.write("This is a hidden file.") + + # Create some_dir/nested_file.txt + some_dir = os.path.join(temp_dir, "some_dir") + os.makedirs(some_dir) + with open(os.path.join(some_dir, "nested_file.txt"), "w") as nested_file: + nested_file.write("This is a nested_file.txt file.") + + # Create some_dir/other_dir/more_nested.txt + other_dir = os.path.join(some_dir, "other_dir") + os.makedirs(other_dir) + with open(os.path.join(other_dir, "more_nested.txt"), "w") as nested_file: + nested_file.write("This is a more_nested.txt file.") + + yield Path(temp_dir) + + +@pytest.mark.parametrize( + "glob, suffixes, relative_filenames", + [ + ( + "**/[!.]*", + None, + [ + "test.html", + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + ), + ("*", None, ["test.html", "test.txt", ".hidden_file"]), + ("**/*.html", None, ["test.html"]), + ("*/*.txt", None, ["some_dir/nested_file.txt"]), + ( + "**/*.txt", + None, + [ + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + ), + ( + "**/*", + [".txt"], + [ + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + ), + ("meeeeeeow", None, []), + ("*", [".html", ".txt"], ["test.html", "test.txt"]), + ], +) +def test_file_names_exist( + toy_dir: str, + glob: str, + suffixes: Sequence[str], + relative_filenames: Sequence[str], +) -> None: + """Verify that the file names exist.""" + + loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes) + blobs = list(loader.yield_blobs()) + + file_names = sorted(str(blob.path) for blob in blobs) + + expected_filenames = sorted( + str(Path(toy_dir) / relative_filename) + for relative_filename in relative_filenames + ) + + assert file_names == expected_filenames diff --git a/tests/unit_tests/document_loader/blob_loaders/test_public_api.py 
b/tests/unit_tests/document_loader/blob_loaders/test_public_api.py index a1eb79b7..c844f243 100644 --- a/tests/unit_tests/document_loader/blob_loaders/test_public_api.py +++ b/tests/unit_tests/document_loader/blob_loaders/test_public_api.py @@ -3,4 +3,4 @@ from langchain.document_loaders.blob_loaders import __all__ def test_public_api() -> None: """Hard-code public API to help determine if we have broken it.""" - assert sorted(__all__) == ["Blob", "BlobLoader"] + assert sorted(__all__) == ["Blob", "BlobLoader", "FileSystemBlobLoader"]