Add bucket and object key to metadata in S3 loader (#9317)

- Description: this PR adds `s3_object_key` and `s3_bucket` to the doc metadata when loading an S3 file. This is particularly useful when using `S3DirectoryLoader` to remove the files from the directory once they have been processed (getting the object keys from the metadata `source` field seems brittle).
- Dependencies: N/A
- Tag maintainer: ?
- Twitter handle: _cbornet

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2024-11-06 03:20:49 +00:00 · 2023-08-30 17:03:24 +02:00 · 2023-08-30 17:03:24 +02:00 · 9870bfb9cd
commit 9870bfb9cd
parent 24c0b01c38
3 changed files with 14 additions and 10 deletions
--- a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb
+++ b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb
@ -90,7 +90,7 @@
    {
     "data": {
      "text/plain": [
-       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
      ]
     },
     "execution_count": 6,
--- a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb
+++ b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb
@ -53,7 +53,7 @@
    {
     "data": {
      "text/plain": [
-       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
      ]
     },
     "execution_count": 9,
@ -96,3 +96,4 @@
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/libs/langchain/langchain/document_loaders/s3_file.py
+++ b/libs/langchain/langchain/document_loaders/s3_file.py
@ -2,12 +2,10 @@ import os
 import tempfile
 from typing import List
-from langchain.docstore.document import Document
+from langchain.document_loaders.unstructured import UnstructuredBaseLoader
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
-class S3FileLoader(BaseLoader):
+class S3FileLoader(UnstructuredBaseLoader):
    """Load from `Amazon AWS S3` file."""
    def __init__(self, bucket: str, key: str):
@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader):
            bucket: The name of the S3 bucket.
            key: The key of the S3 object.
        """
        super().__init__()
        self.bucket = bucket
        self.key = key
-    def load(self) -> List[Document]:
+    def _get_elements(self) -> List:
-        """Load documents."""
+        """Get elements."""
        from unstructured.partition.auto import partition
        try:
            import boto3
        except ImportError:
@ -34,5 +35,7 @@ class S3FileLoader(BaseLoader):
            file_path = f"{temp_dir}/{self.key}"
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            s3.download_file(self.bucket, self.key, file_path)
-            loader = UnstructuredFileLoader(file_path)
+            return partition(filename=file_path)
-            return loader.load()
+
    def _get_metadata(self) -> dict:
        return {"source": f"s3://{self.bucket}/{self.key}"}