mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Add the possibility to configure boto3 in the S3 loaders (#9304)
- Description: this PR adds the possibility to configure boto3 in the S3 loaders. Any named argument you add will be used to create the Boto3 session. This is useful when the AWS credentials can't be passed as env variables or can't be read from the credentials file. - Issue: N/A - Dependencies: N/A - Tag maintainer: ? - Twitter handle: cbornet_ --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
03174c91d0
commit
803d0d9656
@ -102,13 +102,34 @@
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Configuring the AWS Boto3 client\n",
|
||||
"You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
|
||||
"named arguments when creating the S3DirectoryLoader.\n",
|
||||
"This is useful for instance when AWS credentials can't be set as environment variables.\n",
|
||||
"See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "885dc280",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
],
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -66,12 +66,34 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"cell_type": "markdown",
|
||||
"id": "93689594",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring the AWS Boto3 client\n",
|
||||
"You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
|
||||
"named arguments when creating the S3FileLoader.\n",
|
||||
"This is useful for instance when AWS credentials can't be set as environment variables.\n",
|
||||
"See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
],
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -1,22 +1,99 @@
|
||||
from typing import List
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.s3_file import S3FileLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import botocore
|
||||
|
||||
|
||||
class S3DirectoryLoader(BaseLoader):
|
||||
"""Load from `Amazon AWS S3` directory."""
|
||||
|
||||
def __init__(self, bucket: str, prefix: str = ""):
|
||||
def __init__(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
*,
|
||||
region_name: Optional[str] = None,
|
||||
api_version: Optional[str] = None,
|
||||
use_ssl: Optional[bool] = True,
|
||||
verify: Union[str, bool, None] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_secret_access_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
boto_config: Optional[botocore.client.Config] = None,
|
||||
):
|
||||
"""Initialize with bucket and key prefix.
|
||||
|
||||
Args:
|
||||
bucket: The name of the S3 bucket.
|
||||
prefix: The prefix of the S3 key. Defaults to "".
|
||||
:param bucket: The name of the S3 bucket.
|
||||
:param prefix: The prefix of the S3 key. Defaults to "".
|
||||
|
||||
:param region_name: The name of the region associated with the client.
|
||||
A client is associated with a single region.
|
||||
|
||||
:param api_version: The API version to use. By default, botocore will
|
||||
use the latest API version when creating a client. You only need
|
||||
to specify this parameter if you want to use a previous API version
|
||||
of the client.
|
||||
|
||||
:param use_ssl: Whether to use SSL. By default, SSL is used.
|
||||
Note that not all services support non-ssl connections.
|
||||
|
||||
:param verify: Whether to verify SSL certificates.
|
||||
By default SSL certificates are verified. You can provide the
|
||||
following values:
|
||||
|
||||
* False - do not validate SSL certificates. SSL will still be
|
||||
used (unless use_ssl is False), but SSL certificates
|
||||
will not be verified.
|
||||
* path/to/cert/bundle.pem - A filename of the CA cert bundle to
|
||||
use. You can specify this argument if you want to use a
|
||||
different CA cert bundle than the one used by botocore.
|
||||
|
||||
:param endpoint_url: The complete URL to use for the constructed
|
||||
client. Normally, botocore will automatically construct the
|
||||
appropriate URL to use when communicating with a service. You can
|
||||
specify a complete URL (including the "http/https" scheme) to
|
||||
override this behavior. If this value is provided, then
|
||||
``use_ssl`` is ignored.
|
||||
|
||||
:param aws_access_key_id: The access key to use when creating
|
||||
the client. This is entirely optional, and if not provided,
|
||||
the credentials configured for the session will automatically
|
||||
be used. You only need to provide this argument if you want
|
||||
to override the credentials used for this specific client.
|
||||
|
||||
:param aws_secret_access_key: The secret key to use when creating
|
||||
the client. Same semantics as aws_access_key_id above.
|
||||
|
||||
:param aws_session_token: The session token to use when creating
|
||||
the client. Same semantics as aws_access_key_id above.
|
||||
|
||||
:type boto_config: botocore.client.Config
|
||||
:param boto_config: Advanced boto3 client configuration options. If a value
|
||||
is specified in the client config, its value will take precedence
|
||||
over environment variables and configuration values, but not over
|
||||
a value passed explicitly to the method. If a default config
|
||||
object is set on the session, the config object used when creating
|
||||
the client will be the result of calling ``merge()`` on the
|
||||
default config with the config provided to this call.
|
||||
"""
|
||||
self.bucket = bucket
|
||||
self.prefix = prefix
|
||||
self.region_name = region_name
|
||||
self.api_version = api_version
|
||||
self.use_ssl = use_ssl
|
||||
self.verify = verify
|
||||
self.endpoint_url = endpoint_url
|
||||
self.aws_access_key_id = aws_access_key_id
|
||||
self.aws_secret_access_key = aws_secret_access_key
|
||||
self.aws_session_token = aws_session_token
|
||||
self.boto_config = boto_config
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
@ -27,10 +104,33 @@ class S3DirectoryLoader(BaseLoader):
|
||||
"Could not import boto3 python package. "
|
||||
"Please install it with `pip install boto3`."
|
||||
)
|
||||
s3 = boto3.resource("s3")
|
||||
s3 = boto3.resource(
|
||||
"s3",
|
||||
region_name=self.region_name,
|
||||
api_version=self.api_version,
|
||||
use_ssl=self.use_ssl,
|
||||
verify=self.verify,
|
||||
endpoint_url=self.endpoint_url,
|
||||
aws_access_key_id=self.aws_access_key_id,
|
||||
aws_secret_access_key=self.aws_secret_access_key,
|
||||
aws_session_token=self.aws_session_token,
|
||||
config=self.boto_config,
|
||||
)
|
||||
bucket = s3.Bucket(self.bucket)
|
||||
docs = []
|
||||
for obj in bucket.objects.filter(Prefix=self.prefix):
|
||||
loader = S3FileLoader(self.bucket, obj.key)
|
||||
loader = S3FileLoader(
|
||||
self.bucket,
|
||||
obj.key,
|
||||
region_name=self.region_name,
|
||||
api_version=self.api_version,
|
||||
use_ssl=self.use_ssl,
|
||||
verify=self.verify,
|
||||
endpoint_url=self.endpoint_url,
|
||||
aws_access_key_id=self.aws_access_key_id,
|
||||
aws_secret_access_key=self.aws_secret_access_key,
|
||||
aws_session_token=self.aws_session_token,
|
||||
boto_config=self.boto_config,
|
||||
)
|
||||
docs.extend(loader.load())
|
||||
return docs
|
||||
|
@ -1,23 +1,100 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from typing import List
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
from langchain.document_loaders.unstructured import UnstructuredBaseLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import botocore
|
||||
|
||||
|
||||
class S3FileLoader(UnstructuredBaseLoader):
|
||||
"""Load from `Amazon AWS S3` file."""
|
||||
|
||||
def __init__(self, bucket: str, key: str):
|
||||
def __init__(
|
||||
self,
|
||||
bucket: str,
|
||||
key: str,
|
||||
*,
|
||||
region_name: Optional[str] = None,
|
||||
api_version: Optional[str] = None,
|
||||
use_ssl: Optional[bool] = True,
|
||||
verify: Union[str, bool, None] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
aws_access_key_id: Optional[str] = None,
|
||||
aws_secret_access_key: Optional[str] = None,
|
||||
aws_session_token: Optional[str] = None,
|
||||
boto_config: Optional[botocore.client.Config] = None,
|
||||
):
|
||||
"""Initialize with bucket and key name.
|
||||
|
||||
Args:
|
||||
bucket: The name of the S3 bucket.
|
||||
key: The key of the S3 object.
|
||||
:param bucket: The name of the S3 bucket.
|
||||
:param key: The key of the S3 object.
|
||||
|
||||
:param region_name: The name of the region associated with the client.
|
||||
A client is associated with a single region.
|
||||
|
||||
:param api_version: The API version to use. By default, botocore will
|
||||
use the latest API version when creating a client. You only need
|
||||
to specify this parameter if you want to use a previous API version
|
||||
of the client.
|
||||
|
||||
:param use_ssl: Whether or not to use SSL. By default, SSL is used.
|
||||
Note that not all services support non-ssl connections.
|
||||
|
||||
:param verify: Whether or not to verify SSL certificates.
|
||||
By default SSL certificates are verified. You can provide the
|
||||
following values:
|
||||
|
||||
* False - do not validate SSL certificates. SSL will still be
|
||||
used (unless use_ssl is False), but SSL certificates
|
||||
will not be verified.
|
||||
* path/to/cert/bundle.pem - A filename of the CA cert bundle to
|
||||
use. You can specify this argument if you want to use a
|
||||
different CA cert bundle than the one used by botocore.
|
||||
|
||||
:param endpoint_url: The complete URL to use for the constructed
|
||||
client. Normally, botocore will automatically construct the
|
||||
appropriate URL to use when communicating with a service. You can
|
||||
specify a complete URL (including the "http/https" scheme) to
|
||||
override this behavior. If this value is provided, then
|
||||
``use_ssl`` is ignored.
|
||||
|
||||
:param aws_access_key_id: The access key to use when creating
|
||||
the client. This is entirely optional, and if not provided,
|
||||
the credentials configured for the session will automatically
|
||||
be used. You only need to provide this argument if you want
|
||||
to override the credentials used for this specific client.
|
||||
|
||||
:param aws_secret_access_key: The secret key to use when creating
|
||||
the client. Same semantics as aws_access_key_id above.
|
||||
|
||||
:param aws_session_token: The session token to use when creating
|
||||
the client. Same semantics as aws_access_key_id above.
|
||||
|
||||
:type boto_config: botocore.client.Config
|
||||
:param boto_config: Advanced boto3 client configuration options. If a value
|
||||
is specified in the client config, its value will take precedence
|
||||
over environment variables and configuration values, but not over
|
||||
a value passed explicitly to the method. If a default config
|
||||
object is set on the session, the config object used when creating
|
||||
the client will be the result of calling ``merge()`` on the
|
||||
default config with the config provided to this call.
|
||||
"""
|
||||
super().__init__()
|
||||
self.bucket = bucket
|
||||
self.key = key
|
||||
self.region_name = region_name
|
||||
self.api_version = api_version
|
||||
self.use_ssl = use_ssl
|
||||
self.verify = verify
|
||||
self.endpoint_url = endpoint_url
|
||||
self.aws_access_key_id = aws_access_key_id
|
||||
self.aws_secret_access_key = aws_secret_access_key
|
||||
self.aws_session_token = aws_session_token
|
||||
self.boto_config = boto_config
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
"""Get elements."""
|
||||
@ -30,7 +107,18 @@ class S3FileLoader(UnstructuredBaseLoader):
|
||||
"Could not import `boto3` python package. "
|
||||
"Please install it with `pip install boto3`."
|
||||
)
|
||||
s3 = boto3.client("s3")
|
||||
s3 = boto3.client(
|
||||
"s3",
|
||||
region_name=self.region_name,
|
||||
api_version=self.api_version,
|
||||
use_ssl=self.use_ssl,
|
||||
verify=self.verify,
|
||||
endpoint_url=self.endpoint_url,
|
||||
aws_access_key_id=self.aws_access_key_id,
|
||||
aws_secret_access_key=self.aws_secret_access_key,
|
||||
aws_session_token=self.aws_session_token,
|
||||
config=self.boto_config,
|
||||
)
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
file_path = f"{temp_dir}/{self.key}"
|
||||
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||
|
Loading…
Reference in New Issue
Block a user