diff --git a/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py b/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py index 4705b2c563..f4d7ec4ce9 100644 --- a/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py +++ b/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py @@ -54,6 +54,7 @@ class FileSystemBlobLoader(BlobLoader): path: Union[str, Path], *, glob: str = "**/[!.]*", + exclude: Sequence[str] = (), suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, ) -> None: @@ -63,6 +64,7 @@ class FileSystemBlobLoader(BlobLoader): path: Path to directory to load from glob: Glob pattern relative to the specified path by default set to pick up all non-hidden files + exclude: patterns to exclude from results, use glob syntax suffixes: Provide to keep only files with these suffixes Useful when wanting to keep files with different suffixes Suffixes must include the dot, e.g. ".txt" @@ -77,11 +79,21 @@ class FileSystemBlobLoader(BlobLoader): # Recursively load all text files in a directory. loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") + # Recursively load all files in a directory, except for py or pyc files. + loader = FileSystemBlobLoader( + "/path/to/directory", + glob="**/*", + exclude=["**/*.py", "**/*.pyc"] + ) + # Recursively load all non-hidden files in a directory. loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*") # Load all files in a directory without recursion. loader = FileSystemBlobLoader("/path/to/directory", glob="*") + + # Load all files in a directory without recursion. 
+ """ if isinstance(path, Path): _path = path @@ -94,6 +106,7 @@ class FileSystemBlobLoader(BlobLoader): self.glob = glob self.suffixes = set(suffixes or []) self.show_progress = show_progress + self.exclude = exclude def yield_blobs( self, @@ -110,6 +123,9 @@ class FileSystemBlobLoader(BlobLoader): """Yield paths that match the requested pattern.""" paths = self.path.glob(self.glob) for path in paths: + if self.exclude: + if any(path.match(glob) for glob in self.exclude): + continue if path.is_file(): if self.suffixes and path.suffix not in self.suffixes: continue diff --git a/libs/langchain/tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py b/libs/langchain/tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py index 0c40bc08b1..2544f9a141 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py +++ b/libs/langchain/tests/unit_tests/document_loaders/blob_loaders/test_filesystem_blob_loader.py @@ -2,7 +2,7 @@ import os import tempfile from pathlib import Path -from typing import Generator, Sequence +from typing import Generator import pytest @@ -42,56 +42,98 @@ def toy_dir() -> Generator[Path, None, None]: yield Path(temp_dir) -@pytest.mark.parametrize( - "glob, suffixes, relative_filenames", - [ - ( - "**/[!.]*", - None, - [ - "test.html", - "test.txt", - "some_dir/nested_file.txt", - "some_dir/other_dir/more_nested.txt", - ], - ), - ("*", None, ["test.html", "test.txt", ".hidden_file"]), - ("**/*.html", None, ["test.html"]), - ("*/*.txt", None, ["some_dir/nested_file.txt"]), - ( - "**/*.txt", - None, - [ - "test.txt", - "some_dir/nested_file.txt", - "some_dir/other_dir/more_nested.txt", - ], - ), - ( - "**/*", - [".txt"], - [ - "test.txt", - "some_dir/nested_file.txt", - "some_dir/other_dir/more_nested.txt", - ], - ), - ("meeeeeeow", None, []), - ("*", [".html", ".txt"], ["test.html", "test.txt"]), - ], -) -def test_file_names_exist( - toy_dir: str, - glob: str, - 
suffixes: Sequence[str], - relative_filenames: Sequence[str], -) -> None: +_TEST_CASES = [ + { + "glob": "**/[!.]*", + "suffixes": None, + "exclude": (), + "relative_filenames": [ + "test.html", + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + }, + { + "glob": "*", + "suffixes": None, + "exclude": (), + "relative_filenames": ["test.html", "test.txt", ".hidden_file"], + }, + { + "glob": "**/*.html", + "suffixes": None, + "exclude": (), + "relative_filenames": ["test.html"], + }, + { + "glob": "*/*.txt", + "suffixes": None, + "exclude": (), + "relative_filenames": ["some_dir/nested_file.txt"], + }, + { + "glob": "**/*.txt", + "suffixes": None, + "exclude": (), + "relative_filenames": [ + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + }, + { + "glob": "**/*", + "suffixes": [".txt"], + "exclude": (), + "relative_filenames": [ + "test.txt", + "some_dir/nested_file.txt", + "some_dir/other_dir/more_nested.txt", + ], + }, + { + "glob": "meeeeeeow", + "suffixes": None, + "exclude": (), + "relative_filenames": [], + }, + { + "glob": "*", + "suffixes": [".html", ".txt"], + "exclude": (), + "relative_filenames": ["test.html", "test.txt"], + }, + # Using exclude patterns + { + "glob": "**/*", + "suffixes": [".txt"], + "exclude": ("some_dir/*",), + "relative_filenames": ["test.txt", "some_dir/other_dir/more_nested.txt"], + }, + # Using 2 exclude patterns, one of which is recursive + { + "glob": "**/*", + "suffixes": None, + "exclude": ("**/*.txt", ".hidden*"), + "relative_filenames": ["test.html"], + }, +] + + +@pytest.mark.parametrize("params", _TEST_CASES) +def test_file_names_exist(toy_dir: str, params: dict) -> None: """Verify that the file names exist.""" - loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes) - blobs = list(loader.yield_blobs()) + glob_pattern = params["glob"] + suffixes = params["suffixes"] + exclude = params["exclude"] + relative_filenames = 
params["relative_filenames"] - assert loader.count_matching_files() == len(relative_filenames) + loader = FileSystemBlobLoader( + toy_dir, glob=glob_pattern, suffixes=suffixes, exclude=exclude + ) + blobs = list(loader.yield_blobs()) file_names = sorted(str(blob.path) for blob in blobs) @@ -101,6 +143,7 @@ def test_file_names_exist( ) assert file_names == expected_filenames + assert loader.count_matching_files() == len(relative_filenames) @pytest.mark.requires("tqdm")