mirror of https://github.com/hwchase17/langchain
Add minimal file system blob loader (#3669)
This adds a minimal file system blob loader. If it looks good, this PR will be merged and a few additional enhancements will be made. (branch: pull/3927/head)
parent
487d4aeebd
commit
7cce68a051
@ -1,3 +1,4 @@
|
||||
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
|
||||
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
|
||||
|
||||
__all__ = ["BlobLoader", "Blob"]
|
||||
__all__ = ["BlobLoader", "Blob", "FileSystemBlobLoader"]
|
||||
|
@ -0,0 +1,73 @@
|
||||
"""Use to load blobs from the local file system."""
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Sequence, Union
|
||||
|
||||
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
|
||||
|
||||
# PUBLIC API
|
||||
|
||||
|
||||
class FileSystemBlobLoader(BlobLoader):
    """Blob loader for the local file system.

    Example:

    .. code-block:: python

        from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)
    """

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        suffixes: Optional[Sequence[str]] = None,
    ) -> None:
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Path to directory to load from
            glob: Glob pattern relative to the specified path
                by default set to pick up all non-hidden files
            suffixes: Provide to keep only files with these suffixes
                Useful when wanting to keep files with different suffixes
                Suffixes must include the dot, e.g. ".txt"

        Raises:
            TypeError: If path is neither a str nor a Path.

        Examples:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")
        """
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        self.path = _path
        self.glob = glob
        # An empty set means "no suffix filtering" in yield_blobs.
        self.suffixes = set(suffixes or [])

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        paths = self.path.glob(self.glob)
        for path in paths:
            # Skip directories (glob can match them) and, when suffix
            # filtering is requested, files with other suffixes.
            if path.is_file():
                if self.suffixes and path.suffix not in self.suffixes:
                    continue
                yield Blob.from_path(str(path))
|
@ -0,0 +1,101 @@
|
||||
"""Verify that file system blob loader works as expected."""
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Generator, Sequence
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
|
||||
|
||||
|
||||
@pytest.fixture
def toy_dir() -> Generator[Path, None, None]:
    """Yield a pre-populated directory to test the blob loader."""
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)

        # Top-level files: one .txt, one .html, one hidden file.
        (root / "test.txt").write_text("This is a test.txt file.")
        (root / "test.html").write_text(
            "<html><body><h1>This is a test.html file.</h1></body></html>"
        )
        (root / ".hidden_file").write_text("This is a hidden file.")

        # One level of nesting: some_dir/nested_file.txt
        some_dir = root / "some_dir"
        some_dir.mkdir()
        (some_dir / "nested_file.txt").write_text("This is a nested_file.txt file.")

        # Two levels of nesting: some_dir/other_dir/more_nested.txt
        other_dir = some_dir / "other_dir"
        other_dir.mkdir()
        (other_dir / "more_nested.txt").write_text("This is a more_nested.txt file.")

        yield root
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "glob, suffixes, relative_filenames",
    [
        (
            "**/[!.]*",
            None,
            [
                "test.html",
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        ("*", None, ["test.html", "test.txt", ".hidden_file"]),
        ("**/*.html", None, ["test.html"]),
        ("*/*.txt", None, ["some_dir/nested_file.txt"]),
        (
            "**/*.txt",
            None,
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        (
            "**/*",
            [".txt"],
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        ("meeeeeeow", None, []),
        ("*", [".html", ".txt"], ["test.html", "test.txt"]),
    ],
)
def test_file_names_exist(
    toy_dir: str,
    glob: str,
    suffixes: Sequence[str],
    relative_filenames: Sequence[str],
) -> None:
    """Verify that the file names exist."""
    loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)

    # Paths actually yielded by the loader, sorted for a stable comparison.
    actual = sorted(str(blob.path) for blob in loader.yield_blobs())

    # Expected absolute paths built from the fixture root.
    expected = sorted(
        str(Path(toy_dir) / relative_filename)
        for relative_filename in relative_filenames
    )

    assert actual == expected
|
Loading…
Reference in New Issue