From 1366d070fc656813c0c33cb5733290ade0fddf7c Mon Sep 17 00:00:00 2001
From: os1ma <39944763+os1ma@users.noreply.github.com>
Date: Mon, 29 May 2023 04:31:23 +0900
Subject: [PATCH] Add path validation to DirectoryLoader (#5327)

# Add path validation to DirectoryLoader

This PR introduces a minor adjustment to the DirectoryLoader by adding validation for the path argument.

Previously, if the provided path didn't exist or wasn't a directory, DirectoryLoader would return an empty document list due to the behavior of the `glob` method. This could confuse users, who might expect a file-loading error instead.

So, I've added two validations to the load method of the DirectoryLoader:

- Raise a FileNotFoundError if the provided path does not exist
- Raise a ValueError if the provided path is not a directory

Due to the relatively small scope of these changes, a new issue was not created.

## Before submitting

## Who can review?

Community members can review the PR once tests pass.
Tag maintainers/contributors who might be interested: @eyurtsev
---
 langchain/document_loaders/directory.py      |  5 +++++
 .../document_loaders/test_directory.py       | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 tests/unit_tests/document_loaders/test_directory.py

diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py
index cf1065f2..003d6f01 100644
--- a/langchain/document_loaders/directory.py
+++ b/langchain/document_loaders/directory.py
@@ -74,6 +74,11 @@ class DirectoryLoader(BaseLoader):
     def load(self) -> List[Document]:
         """Load documents."""
         p = Path(self.path)
+        if not p.exists():
+            raise FileNotFoundError(f"Directory not found: '{self.path}'")
+        if not p.is_dir():
+            raise ValueError(f"Expected directory, got file: '{self.path}'")
+
         docs: List[Document] = []
         items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
 
diff --git a/tests/unit_tests/document_loaders/test_directory.py b/tests/unit_tests/document_loaders/test_directory.py
new file mode 100644
index 00000000..7fadecf3
--- /dev/null
+++ b/tests/unit_tests/document_loaders/test_directory.py
@@ -0,0 +1,19 @@
+import pytest
+
+from langchain.document_loaders import DirectoryLoader
+
+
+def test_raise_error_if_path_not_exist() -> None:
+    loader = DirectoryLoader("./not_exist_directory")
+    with pytest.raises(FileNotFoundError) as e:
+        loader.load()
+
+    assert str(e.value) == "Directory not found: './not_exist_directory'"
+
+
+def test_raise_error_if_path_is_not_directory() -> None:
+    loader = DirectoryLoader(__file__)
+    with pytest.raises(ValueError) as e:
+        loader.load()
+
+    assert str(e.value) == f"Expected directory, got file: '{__file__}'"