Add excludes to FileSystemBlobLoader (#9064)

Add option to specify exclude patterns.

https://github.com/langchain-ai/langchain/discussions/9059
This commit is contained in:
Eugene Yurtsev 2023-08-10 14:56:58 -04:00 committed by GitHub
parent 6c70f491ba
commit b7bc8ec87f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 107 additions and 48 deletions

View File

@ -54,6 +54,7 @@ class FileSystemBlobLoader(BlobLoader):
path: Union[str, Path],
*,
glob: str = "**/[!.]*",
exclude: Sequence[str] = (),
suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False,
) -> None:
@ -63,6 +64,7 @@ class FileSystemBlobLoader(BlobLoader):
path: Path to directory to load from
glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files
exclude: Glob patterns for paths to leave out of the results; each candidate path is checked against them with ``Path.match``
suffixes: Provide to keep only files with these suffixes
Useful when wanting to keep files with different suffixes
Suffixes must include the dot, e.g. ".txt"
@ -77,11 +79,21 @@ class FileSystemBlobLoader(BlobLoader):
# Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
# Recursively load all files in a directory, except for py or pyc files.
loader = FileSystemBlobLoader(
"/path/to/directory",
glob="**/*",
exclude=["**/*.py", "**/*.pyc"]
)
# Recursively load all non-hidden files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = FileSystemBlobLoader("/path/to/directory", glob="*")
# Load all files in a directory without recursion.
"""
if isinstance(path, Path):
_path = path
@ -94,6 +106,7 @@ class FileSystemBlobLoader(BlobLoader):
self.glob = glob
self.suffixes = set(suffixes or [])
self.show_progress = show_progress
self.exclude = exclude
def yield_blobs(
self,
@ -110,6 +123,9 @@ class FileSystemBlobLoader(BlobLoader):
"""Yield paths that match the requested pattern."""
paths = self.path.glob(self.glob)
for path in paths:
if self.exclude:
if any(path.match(glob) for glob in self.exclude):
continue
if path.is_file():
if self.suffixes and path.suffix not in self.suffixes:
continue

View File

@ -2,7 +2,7 @@
import os
import tempfile
from pathlib import Path
from typing import Generator, Sequence
from typing import Generator
import pytest
@ -42,56 +42,98 @@ def toy_dir() -> Generator[Path, None, None]:
yield Path(temp_dir)
@pytest.mark.parametrize(
"glob, suffixes, relative_filenames",
[
(
"**/[!.]*",
None,
[
"test.html",
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
),
("*", None, ["test.html", "test.txt", ".hidden_file"]),
("**/*.html", None, ["test.html"]),
("*/*.txt", None, ["some_dir/nested_file.txt"]),
(
"**/*.txt",
None,
[
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
),
(
"**/*",
[".txt"],
[
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
),
("meeeeeeow", None, []),
("*", [".html", ".txt"], ["test.html", "test.txt"]),
],
)
def test_file_names_exist(
toy_dir: str,
glob: str,
suffixes: Sequence[str],
relative_filenames: Sequence[str],
) -> None:
_TEST_CASES = [
{
"glob": "**/[!.]*",
"suffixes": None,
"exclude": (),
"relative_filenames": [
"test.html",
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
},
{
"glob": "*",
"suffixes": None,
"exclude": (),
"relative_filenames": ["test.html", "test.txt", ".hidden_file"],
},
{
"glob": "**/*.html",
"suffixes": None,
"exclude": (),
"relative_filenames": ["test.html"],
},
{
"glob": "*/*.txt",
"suffixes": None,
"exclude": (),
"relative_filenames": ["some_dir/nested_file.txt"],
},
{
"glob": "**/*.txt",
"suffixes": None,
"exclude": (),
"relative_filenames": [
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
},
{
"glob": "**/*",
"suffixes": [".txt"],
"exclude": (),
"relative_filenames": [
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
},
{
"glob": "meeeeeeow",
"suffixes": None,
"exclude": (),
"relative_filenames": [],
},
{
"glob": "*",
"suffixes": [".html", ".txt"],
"exclude": (),
"relative_filenames": ["test.html", "test.txt"],
},
# Using exclude patterns
{
"glob": "**/*",
"suffixes": [".txt"],
"exclude": ("some_dir/*",),
"relative_filenames": ["test.txt", "some_dir/other_dir/more_nested.txt"],
},
# Using 2 exclude patterns, one of which is recursive
{
"glob": "**/*",
"suffixes": None,
"exclude": ("**/*.txt", ".hidden*"),
"relative_filenames": ["test.html"],
},
]
@pytest.mark.parametrize("params", _TEST_CASES)
def test_file_names_exist(toy_dir: str, params: dict) -> None:
"""Verify that the file names exist."""
loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)
blobs = list(loader.yield_blobs())
glob_pattern = params["glob"]
suffixes = params["suffixes"]
exclude = params["exclude"]
relative_filenames = params["relative_filenames"]
assert loader.count_matching_files() == len(relative_filenames)
loader = FileSystemBlobLoader(
toy_dir, glob=glob_pattern, suffixes=suffixes, exclude=exclude
)
blobs = list(loader.yield_blobs())
file_names = sorted(str(blob.path) for blob in blobs)
@ -101,6 +143,7 @@ def test_file_names_exist(
)
assert file_names == expected_filenames
assert loader.count_matching_files() == len(relative_filenames)
@pytest.mark.requires("tqdm")