Add bucket and object key to metadata in S3 loader (#9317)

- Description: This PR adds `s3_object_key` and `s3_bucket` to the document
  metadata when loading an S3 file. This is particularly useful when using
  `S3DirectoryLoader` to remove files from the directory once they have been
  processed, since recovering the object keys from the metadata `source`
  field alone is brittle. (A usage sketch follows this list.)
- Dependencies: N/A
- Tag maintainer: ?
- Twitter handle: _cbornet
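
As a rough illustration of the clean-up use case described above, here is a minimal sketch assuming the behaviour in the merged diff below, where the `source` metadata becomes the full `s3://<bucket>/<key>` URI. The bucket name mirrors the updated notebook outputs and is a placeholder, not part of this PR:

```python
import boto3

from langchain.document_loaders import S3DirectoryLoader

s3 = boto3.client("s3")

# "testing-hwc" mirrors the bucket used in the updated notebook outputs;
# substitute your own bucket/prefix.
loader = S3DirectoryLoader("testing-hwc")
docs = loader.load()

for doc in docs:
    # ... downstream processing of `doc` would happen here ...

    # With this change the source is e.g. 's3://testing-hwc/fake.docx',
    # so the bucket and object key can be recovered without guessing
    # from a local temp-file path.
    uri = doc.metadata["source"]
    bucket, _, key = uri[len("s3://"):].partition("/")

    # Once processed, the object can be deleted using the recovered key.
    s3.delete_object(Bucket=bucket, Key=key)
```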

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>

Authored by Christophe Bornet on 2023-08-30 17:03:24 +02:00; committed by GitHub
commit 9870bfb9cd (parent 24c0b01c38)
3 changed files with 14 additions and 10 deletions

```diff
@@ -90,7 +90,7 @@
   {
    "data": {
     "text/plain": [
-     "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
+     "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
    ]
   },
   "execution_count": 6,
```

```diff
@@ -53,7 +53,7 @@
   {
    "data": {
     "text/plain": [
-     "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
+     "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
    ]
   },
   "execution_count": 9,
@@ -96,3 +96,4 @@
  "nbformat": 4,
  "nbformat_minor": 5
 }
```

```diff
@@ -2,12 +2,10 @@ import os
 import tempfile
 from typing import List
 
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from langchain.document_loaders.unstructured import UnstructuredBaseLoader
 
 
-class S3FileLoader(BaseLoader):
+class S3FileLoader(UnstructuredBaseLoader):
     """Load from `Amazon AWS S3` file."""
 
     def __init__(self, bucket: str, key: str):
@@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader):
             bucket: The name of the S3 bucket.
             key: The key of the S3 object.
         """
+        super().__init__()
         self.bucket = bucket
         self.key = key
 
-    def load(self) -> List[Document]:
-        """Load documents."""
+    def _get_elements(self) -> List:
+        """Get elements."""
+        from unstructured.partition.auto import partition
+
         try:
             import boto3
         except ImportError:
@@ -34,5 +35,7 @@
             file_path = f"{temp_dir}/{self.key}"
             os.makedirs(os.path.dirname(file_path), exist_ok=True)
             s3.download_file(self.bucket, self.key, file_path)
-            loader = UnstructuredFileLoader(file_path)
-            return loader.load()
+            return partition(filename=file_path)
+
+    def _get_metadata(self) -> dict:
+        return {"source": f"s3://{self.bucket}/{self.key}"}
```