Add bucket and object key to metadata in S3 loader (#9317)

- Description: This PR adds `s3_object_key` and `s3_bucket` to the doc
metadata when loading an S3 file. This is particularly useful when using
`S3DirectoryLoader` to remove the files from the directory once they have
been processed (getting the object keys from the metadata `source` field
seems brittle).
  - Dependencies: N/A
  - Tag maintainer: ?
  - Twitter handle: _cbornet

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
pull/9997/head
Christophe Bornet 1 year ago committed by GitHub
parent 24c0b01c38
commit 9870bfb9cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -90,7 +90,7 @@
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 6,

@ -53,7 +53,7 @@
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 9,
@ -96,3 +96,4 @@
"nbformat": 4,
"nbformat_minor": 5
}

@ -2,12 +2,10 @@ import os
import tempfile
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import UnstructuredBaseLoader
class S3FileLoader(BaseLoader):
class S3FileLoader(UnstructuredBaseLoader):
"""Load from `Amazon AWS S3` file."""
def __init__(self, bucket: str, key: str):
@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader):
bucket: The name of the S3 bucket.
key: The key of the S3 object.
"""
super().__init__()
self.bucket = bucket
self.key = key
def load(self) -> List[Document]:
"""Load documents."""
def _get_elements(self) -> List:
"""Get elements."""
from unstructured.partition.auto import partition
try:
import boto3
except ImportError:
@ -34,5 +35,7 @@ class S3FileLoader(BaseLoader):
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
loader = UnstructuredFileLoader(file_path)
return loader.load()
return partition(filename=file_path)
def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}

Loading…
Cancel
Save