diff --git a/langchain/document_loaders/gcs_directory.py b/langchain/document_loaders/gcs_directory.py index 6c38f681..4b81012b 100644 --- a/langchain/document_loaders/gcs_directory.py +++ b/langchain/document_loaders/gcs_directory.py @@ -27,6 +27,10 @@ class GCSDirectoryLoader(BaseLoader): client = storage.Client(project=self.project_name) docs = [] for blob in client.list_blobs(self.bucket, prefix=self.prefix): + # we shall just skip directories since GCSFileLoader creates + # intermediate directories on the fly + if blob.name.endswith("/"): + continue loader = GCSFileLoader(self.project_name, self.bucket, blob.name) docs.extend(loader.load()) return docs diff --git a/langchain/document_loaders/gcs_file.py b/langchain/document_loaders/gcs_file.py index 9397bafe..b1dc43e3 100644 --- a/langchain/document_loaders/gcs_file.py +++ b/langchain/document_loaders/gcs_file.py @@ -1,4 +1,5 @@ """Loading logic for loading documents from a GCS file.""" +import os import tempfile from typing import List @@ -34,6 +35,7 @@ class GCSFileLoader(BaseLoader): blob = bucket.blob(self.blob) with tempfile.TemporaryDirectory() as temp_dir: file_path = f"{temp_dir}/{self.blob}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) # Download the file to a destination blob.download_to_filename(file_path) loader = UnstructuredFileLoader(file_path)