From e6c1c32affdcf4bf218d3b41c265babd8bd709ee Mon Sep 17 00:00:00 2001 From: Lucas Vieira Date: Tue, 25 Apr 2023 02:05:44 -0300 Subject: [PATCH] Support GCS Objects with `/` in GCS Loaders (#3356) This fixes the same problem as #1517, but for GCS. ### Problem When loading GCS Objects with `/` in the object key (e.g. folder/some-document.txt) using `GCSFileLoader`, each object is downloaded into a temporary directory and saved as a file. The download fails when the parent directory does not exist within the temporary directory. ### What this PR does Creates the parent directories implied by the object key before downloading. This also works with deeply nested keys: folder/subfolder/some-document.txt --- langchain/document_loaders/gcs_directory.py | 4 ++++ langchain/document_loaders/gcs_file.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/langchain/document_loaders/gcs_directory.py b/langchain/document_loaders/gcs_directory.py index 6c38f681..4b81012b 100644 --- a/langchain/document_loaders/gcs_directory.py +++ b/langchain/document_loaders/gcs_directory.py @@ -27,6 +27,10 @@ class GCSDirectoryLoader(BaseLoader): client = storage.Client(project=self.project_name) docs = [] for blob in client.list_blobs(self.bucket, prefix=self.prefix): + # we shall just skip directories since GCSFileLoader creates + # intermediate directories on the fly + if blob.name.endswith("/"): + continue loader = GCSFileLoader(self.project_name, self.bucket, blob.name) docs.extend(loader.load()) return docs diff --git a/langchain/document_loaders/gcs_file.py b/langchain/document_loaders/gcs_file.py index 9397bafe..b1dc43e3 100644 --- a/langchain/document_loaders/gcs_file.py +++ b/langchain/document_loaders/gcs_file.py @@ -1,4 +1,5 @@ """Loading logic for loading documents from a GCS file.""" +import os import tempfile from typing import List @@ -34,6 +35,7 @@ class GCSFileLoader(BaseLoader): blob = bucket.blob(self.blob) with tempfile.TemporaryDirectory() as temp_dir: file_path = 
f"{temp_dir}/{self.blob}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) # Download the file to a destination blob.download_to_filename(file_path) loader = UnstructuredFileLoader(file_path)