mirror of https://github.com/hwchase17/langchain
Add minimal file system blob loader (#3669)
This adds a minimal file system blob loader. If it looks good, this PR will be merged and a few additional enhancements will be made.
pull/3927/head
parent
487d4aeebd
commit
7cce68a051
@ -1,3 +1,4 @@
|
|||||||
|
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
|
||||||
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
|
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
|
||||||
|
|
||||||
__all__ = ["BlobLoader", "Blob"]
|
__all__ = ["BlobLoader", "Blob", "FileSystemBlobLoader"]
|
||||||
|
@ -0,0 +1,73 @@
|
|||||||
|
"""Use to load blobs from the local file system."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Optional, Sequence, Union
|
||||||
|
|
||||||
|
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
|
||||||
|
|
||||||
|
# PUBLIC API
|
||||||
|
|
||||||
|
|
||||||
|
class FileSystemBlobLoader(BlobLoader):
    """Blob loader for the local file system.

    Globs over a directory and yields one blob per matching regular file.

    Example:

    .. code-block:: python

        from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)
    """

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        suffixes: Optional[Sequence[str]] = None,
    ) -> None:
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Path to directory to load from
            glob: Glob pattern relative to the specified path
                  by default set to pick up all non-hidden files
            suffixes: Provide to keep only files with these suffixes
                      Useful when wanting to keep files with different suffixes
                      Suffixes must include the dot, e.g. ".txt"
                      A single string is treated as one suffix.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")
        """
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string is itself a sequence of one-character strings, so
        # ``set(".txt")`` would become {".", "t", "x"} and never match a real
        # suffix. Treat it as a single suffix instead.
        if isinstance(suffixes, str):
            suffixes = [suffixes]

        self.path = _path
        self.glob = glob
        # An empty set means "do not filter by suffix".
        self.suffixes = set(suffixes or [])

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern.

        Only regular files are yielded; directories matched by the glob are
        skipped. When suffixes were provided, a file is kept only if its
        ``Path.suffix`` is one of them.
        """
        for path in self.path.glob(self.glob):
            if not path.is_file():
                continue
            if self.suffixes and path.suffix not in self.suffixes:
                continue
            yield Blob.from_path(str(path))
|
@ -0,0 +1,101 @@
|
|||||||
|
"""Verify that file system blob loader works as expected."""
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Generator, Sequence
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def toy_dir() -> Generator[Path, None, None]:
    """Yield a temporary directory pre-populated for the blob loader tests.

    Layout, relative to the yielded path:

        test.txt
        test.html
        .hidden_file
        some_dir/nested_file.txt
        some_dir/other_dir/more_nested.txt

    The temporary directory is cleaned up once the test using it finishes.
    """
    contents_by_relative_path = {
        "test.txt": "This is a test.txt file.",
        "test.html": (
            "<html><body><h1>This is a test.html file.</h1></body></html>"
        ),
        ".hidden_file": "This is a hidden file.",
        "some_dir/nested_file.txt": "This is a nested_file.txt file.",
        "some_dir/other_dir/more_nested.txt": "This is a more_nested.txt file.",
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        for relative_path, text in contents_by_relative_path.items():
            target = root / relative_path
            # Nested entries need their parent directories created first.
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text)

        yield root
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "glob, suffixes, relative_filenames",
    [
        # Default pattern: every non-hidden file, recursively.
        (
            "**/[!.]*",
            None,
            [
                "test.html",
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        # Top level only; hidden files are included by a plain "*".
        ("*", None, ["test.html", "test.txt", ".hidden_file"]),
        ("**/*.html", None, ["test.html"]),
        # Exactly one directory level deep.
        ("*/*.txt", None, ["some_dir/nested_file.txt"]),
        (
            "**/*.txt",
            None,
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        # Same selection as above, but done via the suffix filter.
        (
            "**/*",
            [".txt"],
            [
                "test.txt",
                "some_dir/nested_file.txt",
                "some_dir/other_dir/more_nested.txt",
            ],
        ),
        # A pattern that matches nothing yields no blobs.
        ("meeeeeeow", None, []),
        # The suffix filter drops the extension-less hidden file.
        ("*", [".html", ".txt"], ["test.html", "test.txt"]),
    ],
)
def test_file_names_exist(
    toy_dir: Path,
    glob: str,
    suffixes: Sequence[str],
    relative_filenames: Sequence[str],
) -> None:
    """Verify that the loader yields exactly the expected files.

    Args:
        toy_dir: Root of the pre-populated temporary directory (fixture).
        glob: Glob pattern handed to the loader.
        suffixes: Suffix filter handed to the loader; several cases pass
            ``None`` to disable filtering.
        relative_filenames: Expected files, relative to ``toy_dir``.
    """
    loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)
    blobs = list(loader.yield_blobs())

    # Sort both sides so the comparison does not depend on glob order.
    file_names = sorted(str(blob.path) for blob in blobs)

    expected_filenames = sorted(
        str(toy_dir / relative_filename) for relative_filename in relative_filenames
    )

    assert file_names == expected_filenames
|
Loading…
Reference in New Issue