diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_directory.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_directory.ipynb new file mode 100644 index 0000000000..c2189b7ccd --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_directory.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a634365e", + "metadata": {}, + "source": [ + "# Tencent COS Directory\n", + "\n", + "This covers how to load document objects from a `Tencent COS Directory`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85e97267", + "metadata": {}, + "outputs": [], + "source": [ + "#! pip install cos-python-sdk-v5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f0cd6a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TencentCOSDirectoryLoader\n", + "from qcloud_cos import CosConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "321cc7f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conf = CosConfig(\n", + " Region=\"your cos region\",\n", + " SecretId=\"your cos secret_id\",\n", + " SecretKey=\"your cos secret_key\",\n", + " )\n", + "loader = TencentCOSDirectoryLoader(conf=conf, bucket=\"you_cos_bucket\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c50d2c7", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "0690c40a", + "metadata": {}, + "source": [ + "## Specifying a prefix\n", + "You can also specify a prefix for more fine-grained control over which files to load." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72d44781", + "metadata": {}, + "outputs": [], + "source": [ + "loader = TencentCOSDirectoryLoader(conf=conf, bucket=\"you_cos_bucket\", prefix=\"fake\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d3c32db", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_file.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_file.ipynb new file mode 100644 index 0000000000..a8c11cf2b5 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/tencent_cos_file.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a634365e", + "metadata": {}, + "source": [ + "# Tencent COS File\n", + "\n", + "This covers how to load a document object from a `Tencent COS File`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85e97267", + "metadata": {}, + "outputs": [], + "source": [ + "#! 
pip install cos-python-sdk-v5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f0cd6a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TencentCOSFileLoader\n", + "from qcloud_cos import CosConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "321cc7f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conf = CosConfig(\n", + " Region=\"your cos region\",\n", + " SecretId=\"your cos secret_id\",\n", + " SecretKey=\"your cos secret_key\",\n", + " )\n", + "loader = TencentCOSFileLoader(conf=conf, bucket=\"you_cos_bucket\", key=\"fake.docx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c50d2c7", + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "0690c40a", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 6bb6b7b0b8..cf3e243a78 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -113,6 +113,8 @@ from langchain.document_loaders.telegram import ( TelegramChatApiLoader, TelegramChatFileLoader, ) +from langchain.document_loaders.tencent_cos_directory import TencentCOSDirectoryLoader +from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader from langchain.document_loaders.text import TextLoader from langchain.document_loaders.tomarkdown import ToMarkdownLoader from 
langchain.document_loaders.toml import TomlLoader
@@ -243,6 +245,8 @@ __all__ = [
     "SnowflakeLoader",
     "SpreedlyLoader",
     "StripeLoader",
+    "TencentCOSDirectoryLoader",
+    "TencentCOSFileLoader",
     "TelegramChatApiLoader",
     "TelegramChatFileLoader",
     "TelegramChatLoader",
diff --git a/langchain/document_loaders/tencent_cos_directory.py b/langchain/document_loaders/tencent_cos_directory.py
new file mode 100644
index 0000000000..b51f60ce93
--- /dev/null
+++ b/langchain/document_loaders/tencent_cos_directory.py
@@ -0,0 +1,60 @@
+"""Loading logic for loading documents from Tencent Cloud COS directory."""
+from typing import Any, Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader
+
+
+class TencentCOSDirectoryLoader(BaseLoader):
+    """Load documents from every object under a prefix in a Tencent COS bucket."""
+
+    def __init__(self, conf: Any, bucket: str, prefix: str = ""):
+        """Initialize with COS config, bucket and prefix.
+
+        :param conf(CosConfig): COS config.
+        :param bucket(str): COS bucket.
+        :param prefix(str): key prefix passed to the object listing call.
+        """
+        self.conf = conf
+        self.bucket = bucket
+        self.prefix = prefix
+
+    def load(self) -> List[Document]:
+        """Load all documents eagerly by exhausting ``lazy_load``."""
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load documents one listing page at a time.
+
+        Objects are listed in pages of up to 1000 keys and each page is
+        downloaded and parsed before the next page is requested, so the first
+        document is available without listing the whole bucket.
+
+        :raises ValueError: if ``qcloud_cos`` cannot be imported.
+        """
+        try:
+            from qcloud_cos import CosS3Client
+        except ImportError:
+            raise ValueError(
+                "Could not import cos-python-sdk-v5 python package. "
+                "Please install it with `pip install cos-python-sdk-v5`."
+            )
+        client = CosS3Client(self.conf)
+        marker = ""
+        while True:
+            response = client.list_objects(
+                Bucket=self.bucket, Prefix=self.prefix, Marker=marker, MaxKeys=1000
+            )
+            for content in response.get("Contents", []):
+                key = content["Key"]
+                # Keys ending in "/" are skipped (presumably folder placeholders).
+                if key.endswith("/"):
+                    continue
+                loader = TencentCOSFileLoader(self.conf, self.bucket, key)
+                # Yield everything the file loader produced rather than indexing
+                # [0], which raises IndexError if a file yields no documents.
+                yield from loader.load()
+            if response["IsTruncated"] == "false":
+                break
+            marker = response["NextMarker"]
diff --git a/langchain/document_loaders/tencent_cos_file.py b/langchain/document_loaders/tencent_cos_file.py
new file mode 100644
index 0000000000..a64220eaf9
--- /dev/null
+++ b/langchain/document_loaders/tencent_cos_file.py
@@ -0,0 +1,54 @@
+"""Loading logic for loading documents from Tencent Cloud COS file."""
+import os
+import tempfile
+from typing import Any, Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class TencentCOSFileLoader(BaseLoader):
+    """Load documents from a single object in a Tencent COS bucket."""
+
+    def __init__(self, conf: Any, bucket: str, key: str):
+        """Initialize with COS config, bucket and key name.
+
+        :param conf(CosConfig): COS config.
+        :param bucket(str): COS bucket.
+        :param key(str): COS file key.
+        """
+        self.conf = conf
+        self.bucket = bucket
+        self.key = key
+
+    def load(self) -> List[Document]:
+        """Load all documents eagerly by exhausting ``lazy_load``."""
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Download the object to a temporary directory and parse it.
+
+        :raises ValueError: if ``qcloud_cos`` cannot be imported.
+        """
+        try:
+            from qcloud_cos import CosS3Client
+        except ImportError:
+            raise ValueError(
+                "Could not import cos-python-sdk-v5 python package. "
+                "Please install it with `pip install cos-python-sdk-v5`."
+            )
+
+        # Initialise a client
+        client = CosS3Client(self.conf)
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_path = f"{temp_dir}/{self.bucket}/{self.key}"
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            # Download the file to a destination
+            client.download_file(
+                Bucket=self.bucket, Key=self.key, DestFilePath=file_path
+            )
+            loader = UnstructuredFileLoader(file_path)
+            # UnstructuredFileLoader does not implement lazy_load yet, so load
+            # eagerly while the temporary file still exists.
+            return iter(loader.load())