mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add excludes to FileSystemBlobLoader (#9064)
Add option to specify exclude patterns. https://github.com/langchain-ai/langchain/discussions/9059
This commit is contained in:
parent
6c70f491ba
commit
b7bc8ec87f
@ -54,6 +54,7 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
path: Union[str, Path],
|
||||
*,
|
||||
glob: str = "**/[!.]*",
|
||||
exclude: Sequence[str] = (),
|
||||
suffixes: Optional[Sequence[str]] = None,
|
||||
show_progress: bool = False,
|
||||
) -> None:
|
||||
@ -63,6 +64,7 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
path: Path to directory to load from
|
||||
glob: Glob pattern relative to the specified path
|
||||
by default set to pick up all non-hidden files
|
||||
exclude: Glob patterns; any path matching one of them is excluded from the results
|
||||
suffixes: Provide to keep only files with these suffixes
|
||||
Useful when wanting to keep files with different suffixes
|
||||
Suffixes must include the dot, e.g. ".txt"
|
||||
@ -77,11 +79,21 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
# Recursively load all text files in a directory.
|
||||
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
|
||||
|
||||
# Recursively load all .txt files in a directory, skipping any that match the py or pyc exclude patterns.
|
||||
loader = FileSystemBlobLoader(
|
||||
"/path/to/directory",
|
||||
glob="**/*.txt",
|
||||
exclude=["**/*.py", "**/*.pyc"]
|
||||
)
|
||||
|
||||
# Recursively load all non-hidden files in a directory.
|
||||
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
|
||||
|
||||
# Load all files in a directory without recursion.
|
||||
loader = FileSystemBlobLoader("/path/to/directory", glob="*")
|
||||
|
||||
# Load all files in a directory without recursion.
|
||||
|
||||
"""
|
||||
if isinstance(path, Path):
|
||||
_path = path
|
||||
@ -94,6 +106,7 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
self.glob = glob
|
||||
self.suffixes = set(suffixes or [])
|
||||
self.show_progress = show_progress
|
||||
self.exclude = exclude
|
||||
|
||||
def yield_blobs(
|
||||
self,
|
||||
@ -110,6 +123,9 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
"""Yield paths that match the requested pattern."""
|
||||
paths = self.path.glob(self.glob)
|
||||
for path in paths:
|
||||
if self.exclude:
|
||||
if any(path.match(glob) for glob in self.exclude):
|
||||
continue
|
||||
if path.is_file():
|
||||
if self.suffixes and path.suffix not in self.suffixes:
|
||||
continue
|
||||
|
@ -2,7 +2,7 @@
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Generator, Sequence
|
||||
from typing import Generator
|
||||
|
||||
import pytest
|
||||
|
||||
@ -42,56 +42,98 @@ def toy_dir() -> Generator[Path, None, None]:
|
||||
yield Path(temp_dir)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"glob, suffixes, relative_filenames",
|
||||
[
|
||||
(
|
||||
"**/[!.]*",
|
||||
None,
|
||||
[
|
||||
_TEST_CASES = [
|
||||
{
|
||||
"glob": "**/[!.]*",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": [
|
||||
"test.html",
|
||||
"test.txt",
|
||||
"some_dir/nested_file.txt",
|
||||
"some_dir/other_dir/more_nested.txt",
|
||||
],
|
||||
),
|
||||
("*", None, ["test.html", "test.txt", ".hidden_file"]),
|
||||
("**/*.html", None, ["test.html"]),
|
||||
("*/*.txt", None, ["some_dir/nested_file.txt"]),
|
||||
(
|
||||
"**/*.txt",
|
||||
None,
|
||||
[
|
||||
},
|
||||
{
|
||||
"glob": "*",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": ["test.html", "test.txt", ".hidden_file"],
|
||||
},
|
||||
{
|
||||
"glob": "**/*.html",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": ["test.html"],
|
||||
},
|
||||
{
|
||||
"glob": "*/*.txt",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": ["some_dir/nested_file.txt"],
|
||||
},
|
||||
{
|
||||
"glob": "**/*.txt",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": [
|
||||
"test.txt",
|
||||
"some_dir/nested_file.txt",
|
||||
"some_dir/other_dir/more_nested.txt",
|
||||
],
|
||||
),
|
||||
(
|
||||
"**/*",
|
||||
[".txt"],
|
||||
[
|
||||
},
|
||||
{
|
||||
"glob": "**/*",
|
||||
"suffixes": [".txt"],
|
||||
"exclude": (),
|
||||
"relative_filenames": [
|
||||
"test.txt",
|
||||
"some_dir/nested_file.txt",
|
||||
"some_dir/other_dir/more_nested.txt",
|
||||
],
|
||||
),
|
||||
("meeeeeeow", None, []),
|
||||
("*", [".html", ".txt"], ["test.html", "test.txt"]),
|
||||
],
|
||||
)
|
||||
def test_file_names_exist(
|
||||
toy_dir: str,
|
||||
glob: str,
|
||||
suffixes: Sequence[str],
|
||||
relative_filenames: Sequence[str],
|
||||
) -> None:
|
||||
},
|
||||
{
|
||||
"glob": "meeeeeeow",
|
||||
"suffixes": None,
|
||||
"exclude": (),
|
||||
"relative_filenames": [],
|
||||
},
|
||||
{
|
||||
"glob": "*",
|
||||
"suffixes": [".html", ".txt"],
|
||||
"exclude": (),
|
||||
"relative_filenames": ["test.html", "test.txt"],
|
||||
},
|
||||
# Using exclude patterns
|
||||
{
|
||||
"glob": "**/*",
|
||||
"suffixes": [".txt"],
|
||||
"exclude": ("some_dir/*",),
|
||||
"relative_filenames": ["test.txt", "some_dir/other_dir/more_nested.txt"],
|
||||
},
|
||||
# Using 2 exclude patterns, one of which is recursive
|
||||
{
|
||||
"glob": "**/*",
|
||||
"suffixes": None,
|
||||
"exclude": ("**/*.txt", ".hidden*"),
|
||||
"relative_filenames": ["test.html"],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("params", _TEST_CASES)
|
||||
def test_file_names_exist(toy_dir: str, params: dict) -> None:
|
||||
"""Verify that the file names exist."""
|
||||
|
||||
loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)
|
||||
blobs = list(loader.yield_blobs())
|
||||
glob_pattern = params["glob"]
|
||||
suffixes = params["suffixes"]
|
||||
exclude = params["exclude"]
|
||||
relative_filenames = params["relative_filenames"]
|
||||
|
||||
assert loader.count_matching_files() == len(relative_filenames)
|
||||
loader = FileSystemBlobLoader(
|
||||
toy_dir, glob=glob_pattern, suffixes=suffixes, exclude=exclude
|
||||
)
|
||||
blobs = list(loader.yield_blobs())
|
||||
|
||||
file_names = sorted(str(blob.path) for blob in blobs)
|
||||
|
||||
@ -101,6 +143,7 @@ def test_file_names_exist(
|
||||
)
|
||||
|
||||
assert file_names == expected_filenames
|
||||
assert loader.count_matching_files() == len(relative_filenames)
|
||||
|
||||
|
||||
@pytest.mark.requires("tqdm")
|
||||
|
Loading…
Reference in New Issue
Block a user