mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add bucket and object key to metadata in S3 loader (#9317)
- Description: this PR adds `s3_object_key` and `s3_bucket` to the doc metadata when loading an S3 file. This is particularly useful when using `S3DirectoryLoader` to remove the files from the directory once they have been processed (getting the object keys from the metadata `source` field seems brittle).
- Dependencies: N/A
- Tag maintainer: ?
- Twitter handle: _cbornet

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
24c0b01c38
commit
9870bfb9cd
@ -90,7 +90,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
|
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
|
@ -53,7 +53,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]"
|
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 9,
|
||||||
@ -96,3 +96,4 @@
|
|||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2,12 +2,10 @@ import os
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.unstructured import UnstructuredBaseLoader
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|
||||||
|
|
||||||
|
|
||||||
class S3FileLoader(BaseLoader):
|
class S3FileLoader(UnstructuredBaseLoader):
|
||||||
"""Load from `Amazon AWS S3` file."""
|
"""Load from `Amazon AWS S3` file."""
|
||||||
|
|
||||||
def __init__(self, bucket: str, key: str):
|
def __init__(self, bucket: str, key: str):
|
||||||
@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader):
|
|||||||
bucket: The name of the S3 bucket.
|
bucket: The name of the S3 bucket.
|
||||||
key: The key of the S3 object.
|
key: The key of the S3 object.
|
||||||
"""
|
"""
|
||||||
|
super().__init__()
|
||||||
self.bucket = bucket
|
self.bucket = bucket
|
||||||
self.key = key
|
self.key = key
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def _get_elements(self) -> List:
|
||||||
"""Load documents."""
|
"""Get elements."""
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import boto3
|
import boto3
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -34,5 +35,7 @@ class S3FileLoader(BaseLoader):
|
|||||||
file_path = f"{temp_dir}/{self.key}"
|
file_path = f"{temp_dir}/{self.key}"
|
||||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||||
s3.download_file(self.bucket, self.key, file_path)
|
s3.download_file(self.bucket, self.key, file_path)
|
||||||
loader = UnstructuredFileLoader(file_path)
|
return partition(filename=file_path)
|
||||||
return loader.load()
|
|
||||||
|
def _get_metadata(self) -> dict:
|
||||||
|
return {"source": f"s3://{self.bucket}/{self.key}"}
|
||||||
|
Loading…
Reference in New Issue
Block a user