Support GCS Objects with `/` in GCS Loaders (#3356)

So, this is basically fixing the same things as #1517 but for GCS.

### Problem
When loading GCS Objects with `/` in the object key (eg.
folder/some-document.txt) using `GCSFileLoader`, the objects are
downloaded into a temporary directory and saved as a file.

This errors out when the parent directory does not exist within the
temporary directory.

### What this pr does
Creates parent directories based on object key.

This also works with deeply nested keys:
folder/subfolder/some-document.txt
fix_agent_callbacks
Lucas Vieira 1 year ago committed by GitHub
parent a4d85f7fd5
commit e6c1c32aff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -27,6 +27,10 @@ class GCSDirectoryLoader(BaseLoader):
client = storage.Client(project=self.project_name)
docs = []
for blob in client.list_blobs(self.bucket, prefix=self.prefix):
# we shall just skip directories since GCSFileLoader creates
# intermediate directories on the fly
if blob.name.endswith("/"):
continue
loader = GCSFileLoader(self.project_name, self.bucket, blob.name)
docs.extend(loader.load())
return docs

@ -1,4 +1,5 @@
"""Loading logic for loading documents from a GCS file."""
import os
import tempfile
from typing import List
@ -34,6 +35,7 @@ class GCSFileLoader(BaseLoader):
blob = bucket.blob(self.blob)
with tempfile.TemporaryDirectory() as temp_dir:
file_path = f"{temp_dir}/{self.blob}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Download the file to a destination
blob.download_to_filename(file_path)
loader = UnstructuredFileLoader(file_path)

Loading…
Cancel
Save