From 7df2dfc4c29d3dcd37d0a8024f8f15bc3be11bb6 Mon Sep 17 00:00:00 2001 From: mpb159753 Date: Wed, 2 Aug 2023 00:30:30 +0800 Subject: [PATCH] Add Support for Loading Documents from Huawei OBS (#8573) Description: This PR adds support for loading documents from Huawei OBS (Object Storage Service) in Langchain. OBS is a cloud-based object storage service provided by Huawei Cloud. With this enhancement, Langchain users can now easily access and load documents stored in Huawei OBS directly into the system. Key Changes: - Added a new document loader module specifically for Huawei OBS integration. - Implemented the necessary logic to authenticate and connect to Huawei OBS using access credentials. - Enabled the loading of individual documents from a specified bucket and object key in Huawei OBS. - Provided the option to specify custom authentication information or obtain security tokens from Huawei Cloud ECS for easy access. How to Test: 1. Ensure the required package "esdk-obs-python" is installed. 2. Configure the endpoint, access key, secret key, and bucket details for Huawei OBS in the Langchain settings. 3. Load documents from Huawei OBS using the updated document loader module. 4. Verify that documents are successfully retrieved and loaded into Langchain for further processing. Please review this PR and let us know if any further improvements are needed. Your feedback is highly appreciated! 
@rlancemartin, @eyurtsev --------- Co-authored-by: Bagatur --- .../huawei_obs_directory.ipynb | 178 +++++++++++++++++ .../document_loaders/huawei_obs_file.ipynb | 180 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 4 + .../document_loaders/obs_directory.py | 82 ++++++++ .../langchain/document_loaders/obs_file.py | 104 ++++++++++ 5 files changed, 548 insertions(+) create mode 100644 docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb create mode 100644 docs/extras/integrations/document_loaders/huawei_obs_file.ipynb create mode 100644 libs/langchain/langchain/document_loaders/obs_directory.py create mode 100644 libs/langchain/langchain/document_loaders/obs_file.py diff --git a/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb b/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb new file mode 100644 index 0000000000..e2cbeef268 --- /dev/null +++ b/docs/extras/integrations/document_loaders/huawei_obs_directory.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c83b6a4c", + "metadata": {}, + "source": [ + "# Huawei OBS Directory\n", + "The following code demonstrates how to load objects from the Huawei OBS (Object Storage Service) as documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2191935", + "metadata": {}, + "outputs": [], + "source": [ + "# Install the required package\n", + "# pip install esdk-obs-python" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "55fca3b4", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import OBSDirectoryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c3ed419f", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = \"your-endpoint\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3428fd4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your access credentials\n", + "config = {\n", + " \"ak\": \"your-access-key\",\n", + " \"sk\": \"your-secret-key\"\n", + "}\n", + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9beede9f", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "1e20a839", + "metadata": {}, + "source": [ + "## Specify a Prefix for Loading\n", + "If you want to load objects with a specific prefix from the bucket, you can use the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "125f311d", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config, prefix=\"test_prefix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3488037", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "84c82c0a", + "metadata": {}, + "source": [ + "## Get Authentication Information from ECS\n", + "If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), 
the loader can directly get the security token from ECS without needing access key and secret key. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1db99969", + "metadata": {}, + "outputs": [], + "source": [ + "config = {\"get_token_from_ecs\": True}\n", + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57dd9f35", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "30205d25", + "metadata": {}, + "source": [ + "## Use a Public Bucket\n", + "If your bucket's bucket policy allows anonymous access (anonymous users have `listBucket` and `GetObject` permissions), you can directly load the objects without configuring the `config` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4dfa2ef0", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OBSDirectoryLoader(\"your-bucket-name\", endpoint=endpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67d4c1d0", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb b/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb new file mode 100644 index 0000000000..5617f673cd --- /dev/null +++ b/docs/extras/integrations/document_loaders/huawei_obs_file.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4394a872", + 
"metadata": {}, + "source": [ + "# Huawei OBS File\n", + "The following code demonstrates how to load an object from the Huawei OBS (Object Storage Service) as document." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c43d811b", + "metadata": {}, + "outputs": [], + "source": [ + "# Install the required package\n", + "# pip install esdk-obs-python" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5e16bae6", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.obs_file import OBSFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "75cc7e7c", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = \"your-endpoint\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f9816984", + "metadata": {}, + "outputs": [], + "source": [ + "from obs import ObsClient\n", + "obs_client = ObsClient(access_key_id=\"your-access-key\", secret_access_key=\"your-secret-key\", server=endpoint)\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", client=obs_client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6143b39b", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "633e05ca", + "metadata": {}, + "source": [ + "## Each Loader with Separate Authentication Information\n", + "If you don't need to reuse OBS connections between different loaders, you can directly configure the `config`. The loader will use the config information to initialize its own OBS client." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a5dd6a5d", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure your access credentials\n", + "config = {\n", + " \"ak\": \"your-access-key\",\n", + " \"sk\": \"your-secret-key\"\n", + "}\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\",endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a741f1c", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "1e2e611c", + "metadata": {}, + "source": [ + "## Get Authentication Information from ECS\n", + "If your langchain is deployed on Huawei Cloud ECS and [Agency is set up](https://support.huaweicloud.com/intl/en-us/usermanual-ecs/ecs_03_0166.html#section7), the loader can get the security token directly from ECS without needing an access key and secret key." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "338fafef", + "metadata": {}, + "outputs": [], + "source": [ + "config = {\"get_token_from_ecs\": True}\n", + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint, config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73976c55", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "b77aa18c", + "metadata": {}, + "source": [ + "## Access a Publicly Accessible Object\n", + "If the object you want to access allows anonymous user access (anonymous users have `GetObject` permission), you can directly load the object without configuring the `config` parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "df83d121", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OBSFileLoader(\"your-bucket-name\", \"your-object-key\", endpoint=endpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a844ba", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 34446a33a4..4813021f77 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -98,6 +98,8 @@ from langchain.document_loaders.modern_treasury import ModernTreasuryLoader from langchain.document_loaders.notebook import NotebookLoader from langchain.document_loaders.notion import NotionDirectoryLoader from langchain.document_loaders.notiondb import NotionDBLoader +from langchain.document_loaders.obs_directory import OBSDirectoryLoader +from langchain.document_loaders.obs_file import OBSFileLoader from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.odt import UnstructuredODTLoader from langchain.document_loaders.onedrive import OneDriveLoader @@ -251,6 +253,8 @@ __all__ = [ "NotebookLoader", "NotionDBLoader", "NotionDirectoryLoader", + "OBSDirectoryLoader", + "OBSFileLoader", "ObsidianLoader", "OneDriveFileLoader", "OneDriveLoader", diff --git a/libs/langchain/langchain/document_loaders/obs_directory.py 
b/libs/langchain/langchain/document_loaders/obs_directory.py new file mode 100644 index 0000000000..4c81c5ff11 --- /dev/null +++ b/libs/langchain/langchain/document_loaders/obs_directory.py @@ -0,0 +1,82 @@ +# coding:utf-8 +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.obs_file import OBSFileLoader + + +class OBSDirectoryLoader(BaseLoader): + """Loader that loads every object in a Huawei OBS bucket as documents.""" + + def __init__( + self, + bucket: str, + endpoint: str, + config: Optional[dict] = None, + prefix: str = "", + ): + """Initialize the OBSDirectoryLoader with the specified settings. + + Args: + bucket (str): The name of the OBS bucket to be used. + endpoint (str): The endpoint URL of your OBS bucket. + config (dict, optional): The parameters for connecting to OBS, provided as a dictionary. The dictionary could have the following keys: + - "ak" (str, optional): Your OBS access key (required if `get_token_from_ecs` is False and bucket policy is not public read). + - "sk" (str, optional): Your OBS secret key (required if `get_token_from_ecs` is False and bucket policy is not public read). + - "token" (str, optional): Your security token (required if using temporary credentials). + - "get_token_from_ecs" (bool, optional): Whether to retrieve the security token from ECS. Defaults to False if not provided. If set to True, `ak`, `sk`, and `token` will be ignored. + prefix (str, optional): Only objects whose keys start with this prefix are listed and loaded. Defaults to "". + + Note: + Before using this class, make sure you have registered with OBS and have the necessary credentials. `endpoint` is always required. The `ak` and `sk` values are mandatory unless `get_token_from_ecs` is True or the bucket policy is public read. `token` is required when using temporary credentials. 
+ Example: + To create a new OBSDirectoryLoader: + ``` + config = { + "ak": "your-access-key", + "sk": "your-secret-key" + } + directory_loader = OBSDirectoryLoader("your-bucket-name", "your-endpoint", config, "your-prefix") + ``` + """ # noqa: E501 + try: + from obs import ObsClient + except ImportError: + raise ValueError( + "Could not import esdk-obs-python python package. " + "Please install it with `pip install esdk-obs-python`." + ) + if not config: + config = dict() + if config.get("get_token_from_ecs"): + self.client = ObsClient(server=endpoint, security_provider_policy="ECS") + else: + self.client = ObsClient( + access_key_id=config.get("ak"), + secret_access_key=config.get("sk"), + security_token=config.get("token"), + server=endpoint, + ) + + self.bucket = bucket + self.prefix = prefix + + def load(self) -> List[Document]: + """Load documents.""" + max_num = 1000 + mark = None + docs = [] + while True: + resp = self.client.listObjects( + self.bucket, prefix=self.prefix, marker=mark, max_keys=max_num + ) + if resp.status < 300: + for content in resp.body.contents: + loader = OBSFileLoader(self.bucket, content.key, client=self.client) + docs.extend(loader.load()) + if resp.body.is_truncated is True: + mark = resp.body.next_marker + else: + break + return docs diff --git a/libs/langchain/langchain/document_loaders/obs_file.py b/libs/langchain/langchain/document_loaders/obs_file.py new file mode 100644 index 0000000000..0e5cdabcdd --- /dev/null +++ b/libs/langchain/langchain/document_loaders/obs_file.py @@ -0,0 +1,104 @@ +# coding:utf-8 + +import os +import tempfile +from typing import Any, List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import UnstructuredFileLoader + + +class OBSFileLoader(BaseLoader): + """Loader for Huawei OBS file.""" + + def __init__( + self, + bucket: str, + key: str, + client: Any = None, + endpoint: str = "", 
+ config: Optional[dict] = None, + ) -> None: + """Initialize the OBSFileLoader with the specified settings. + + Args: + bucket (str): The name of the OBS bucket to be used. + key (str): The name of the object in the OBS bucket. + client (ObsClient, optional): An instance of the ObsClient to connect to OBS. + endpoint (str, optional): The endpoint URL of your OBS bucket. This parameter is mandatory if `client` is not provided. + config (dict, optional): The parameters for connecting to OBS, provided as a dictionary. This parameter is ignored if `client` is provided. The dictionary could have the following keys: + - "ak" (str, optional): Your OBS access key (required if `get_token_from_ecs` is False and bucket policy is not public read). + - "sk" (str, optional): Your OBS secret key (required if `get_token_from_ecs` is False and bucket policy is not public read). + - "token" (str, optional): Your security token (required if using temporary credentials). + - "get_token_from_ecs" (bool, optional): Whether to retrieve the security token from ECS. Defaults to False if not provided. If set to True, `ak`, `sk`, and `token` will be ignored. + + Raises: + ValueError: If the `esdk-obs-python` package is not installed. + TypeError: If the provided `client` is not an instance of ObsClient. + ValueError: If `client` is not provided, but `endpoint` is missing. + + Note: + Before using this class, make sure you have registered with OBS and have the necessary credentials. The `ak`, `sk`, and `endpoint` values are mandatory unless `get_token_from_ecs` is True or the bucket policy is public read. `token` is required when using temporary credentials. 
+ + Example: + To create a new OBSFileLoader with a new client: + ``` + config = { + "ak": "your-access-key", + "sk": "your-secret-key" + } + obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", endpoint="your-endpoint-url", config=config) + ``` + + To create a new OBSFileLoader with an existing client: + ``` + from obs import ObsClient + + # Assuming you have an existing ObsClient object 'obs_client' + obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", client=obs_client) + ``` + + To create a new OBSFileLoader for a publicly readable object (no client and no credentials): + ``` + obs_loader = OBSFileLoader("your-bucket-name", "your-object-key", endpoint="your-endpoint-url") + ``` + """ # noqa: E501 + try: + from obs import ObsClient + except ImportError: + raise ValueError( + "Could not import esdk-obs-python python package. " + "Please install it with `pip install esdk-obs-python`." + ) + if not client: + if not endpoint: + raise ValueError("Either OBSClient or endpoint must be provided.") + if not config: + config = dict() + if config.get("get_token_from_ecs"): + client = ObsClient(server=endpoint, security_provider_policy="ECS") + else: + client = ObsClient( + access_key_id=config.get("ak"), + secret_access_key=config.get("sk"), + security_token=config.get("token"), + server=endpoint, + ) + if not isinstance(client, ObsClient): + raise TypeError("Client must be ObsClient type") + self.client = client + self.bucket = bucket + self.key = key + + def load(self) -> List[Document]: + """Load documents.""" + with tempfile.TemporaryDirectory() as temp_dir: + file_path = f"{temp_dir}/{self.bucket}/{self.key}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) + # Download the file to a destination + self.client.downloadFile( + bucketName=self.bucket, objectKey=self.key, downloadFile=file_path + ) + loader = UnstructuredFileLoader(file_path) + return loader.load()