"""Use to load blobs from the local file system."""
from pathlib import Path
from typing import Iterable, Optional, Sequence, Union

from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader

# PUBLIC API


class FileSystemBlobLoader(BlobLoader):
    """Blob loader for the local file system.

    Globs a directory with ``pathlib.Path.glob`` and yields one ``Blob`` per
    matching regular file, optionally restricted to a set of file suffixes.

    Example:

    .. code-block:: python

        from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)
    """

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        suffixes: Optional[Sequence[str]] = None,
    ) -> None:
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Path to directory to load from.
            glob: Glob pattern relative to the specified path;
                  by default set to pick up all non-hidden files.
                  NOTE(review): the default only constrains the final path
                  component, so files inside dot-directories presumably
                  still match -- confirm against pathlib's ``**`` semantics.
            suffixes: Provide to keep only files with these suffixes.
                      Useful when wanting to keep files with different suffixes.
                      Suffixes must include the dot, e.g. ".txt".
                      A single string is treated as one suffix.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")
        """
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string is itself a sequence of characters: set(".txt") would
        # become {".", "t", "x"} and silently filter out every file. Treat a
        # lone string as a single suffix instead.
        if isinstance(suffixes, str):
            suffixes = [suffixes]

        self.path = _path
        self.glob = glob
        # Stored as a set for O(1) membership checks; empty means "no filter".
        self.suffixes = set(suffixes or [])

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern.

        Walks ``self.path.glob(self.glob)`` and, for every match that is a
        regular file (directories are skipped), yields
        ``Blob.from_path(str(path))``. When ``self.suffixes`` is non-empty,
        files whose ``Path.suffix`` is not in that set are skipped.
        """
        for path in self.path.glob(self.glob):
            if not path.is_file():
                continue
            # Only the last suffix is compared (e.g. ".gz" for "a.tar.gz").
            if self.suffixes and path.suffix not in self.suffixes:
                continue
            yield Blob.from_path(str(path))