Add minimal file system blob loader (#3669)

This adds a minimal file system blob loader.

If this looks good, this PR will be merged and a few additional enhancements will be made afterwards.
fix_agent_callbacks
Eugene Yurtsev 1 year ago committed by GitHub
parent 487d4aeebd
commit 7cce68a051
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,4 @@
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
# Public names re-exported by the blob_loaders package.
__all__ = ["BlobLoader", "Blob", "FileSystemBlobLoader"]

@ -0,0 +1,73 @@
"""Use to load blobs from the local file system."""
from pathlib import Path
from typing import Iterable, Optional, Sequence, Union
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
# PUBLIC API
class FileSystemBlobLoader(BlobLoader):
    """Blob loader for the local file system.

    Globs over a directory and yields one ``Blob`` per matching regular file.

    Example:

    .. code-block:: python

        from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)
    """

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        suffixes: Optional[Sequence[str]] = None,
    ) -> None:
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Path to directory to load from
            glob: Glob pattern relative to the specified path
                  by default set to pick up all non-hidden files.
                  The pattern is applied with ``Path.glob``; the default only
                  tests the final path component, so it filters on the file
                  name itself.
            suffixes: Provide to keep only files with these suffixes
                      Useful when wanting to keep files with different suffixes
                      Suffixes must include the dot, e.g. ".txt"
                      A single string such as ".txt" is accepted and treated
                      as one suffix.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")
        """
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string is itself a sequence of one-character strings, so
        # set(".txt") would become {".", "t", "x"} and silently filter out
        # every file. Treat it as a single suffix instead.
        if isinstance(suffixes, str):
            suffixes = [suffixes]

        self.path = _path
        self.glob = glob
        # An empty set means "no suffix filtering" (see yield_blobs).
        self.suffixes = set(suffixes or [])

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern.

        Only regular files are yielded; directories matched by the glob are
        skipped. When suffixes were provided, files whose ``Path.suffix`` is
        not among them are skipped as well.
        """
        for path in self.path.glob(self.glob):
            if not path.is_file():
                continue
            if self.suffixes and path.suffix not in self.suffixes:
                continue
            yield Blob.from_path(str(path))

@ -0,0 +1,101 @@
"""Verify that file system blob loader works as expected."""
import os
import tempfile
from pathlib import Path
from typing import Generator, Optional, Sequence

import pytest

from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
@pytest.fixture
def toy_dir() -> Generator[Path, None, None]:
    """Yield a temporary directory pre-populated with files for the loader tests.

    The tree contains two visible top-level files, one hidden file and two
    levels of nested directories. It is removed when the fixture is torn down.
    """
    # Relative path -> file contents; parent directories are created on demand.
    layout = {
        "test.txt": "This is a test.txt file.",
        "test.html": "<html><body><h1>This is a test.html file.</h1></body></html>",
        ".hidden_file": "This is a hidden file.",
        "some_dir/nested_file.txt": "This is a nested_file.txt file.",
        "some_dir/other_dir/more_nested.txt": "This is a more_nested.txt file.",
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        for relative_name, contents in layout.items():
            target = root / relative_name
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(contents)
        yield root
@pytest.mark.parametrize(
    "glob, suffixes, relative_filenames",
    [
        (
            "**/[!.]*",
            None,
            [
                "test.html",
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        ("*", None, ["test.html", "test.txt", ".hidden_file"]),
        ("**/*.html", None, ["test.html"]),
        ("*/*.txt", None, ["some_dir/nested_file.txt"]),
        (
            "**/*.txt",
            None,
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        (
            "**/*",
            [".txt"],
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        ("meeeeeeow", None, []),
        ("*", [".html", ".txt"], ["test.html", "test.txt"]),
    ],
)
def test_file_names_exist(
    toy_dir: Path,
    glob: str,
    suffixes: Optional[Sequence[str]],
    relative_filenames: Sequence[str],
) -> None:
    """Verify that the loader yields exactly the expected files.

    Each case supplies a glob pattern, an optional suffix filter and the
    file names (relative to ``toy_dir``) that the loader should produce.
    """
    loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)
    blobs = list(loader.yield_blobs())

    # Compare sorted lists so the result does not depend on glob ordering.
    file_names = sorted(str(blob.path) for blob in blobs)

    expected_filenames = sorted(
        str(toy_dir / relative_filename) for relative_filename in relative_filenames
    )

    assert file_names == expected_filenames

@ -3,4 +3,4 @@ from langchain.document_loaders.blob_loaders import __all__
def test_public_api() -> None:
    """Hard-code public API to help determine if we have broken it."""
    # The expected list is written in sorted order so it can be compared
    # directly against sorted(__all__).
    assert sorted(__all__) == ["Blob", "BlobLoader", "FileSystemBlobLoader"]

Loading…
Cancel
Save