From cf94091cd0fcdef6c6f8c21e951505d986ce8b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20LEBRUN?= Date: Fri, 8 Mar 2024 19:50:58 -0500 Subject: [PATCH] community[patch]: Skip nested directories when using S3DirectoryLoader (#17829) - **Description:** `S3DirectoryLoader` fails if the prefix is a folder (e.g. `my_folder/`) because `S3FileLoader` tries to load that folder itself and fails. This PR skips nested directories so the prefix can be set to a folder instead of `my_folder/files_prefix`. - **Issue:** - #11917 - #6535 - #4326 - **Dependencies:** none - **Twitter handle:** @Falydoor - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ --- .../langchain_community/document_loaders/s3_directory.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libs/community/langchain_community/document_loaders/s3_directory.py b/libs/community/langchain_community/document_loaders/s3_directory.py index 9885418ec8..24d4afab62 100644 --- a/libs/community/langchain_community/document_loaders/s3_directory.py +++ b/libs/community/langchain_community/document_loaders/s3_directory.py @@ -120,6 +120,9 @@ class S3DirectoryLoader(BaseLoader): bucket = s3.Bucket(self.bucket) docs = [] for obj in bucket.objects.filter(Prefix=self.prefix): + # Skip directories + if obj.size == 0 and obj.key.endswith("/"): + continue loader = S3FileLoader( self.bucket, obj.key,