mirror of https://github.com/hwchase17/langchain
feat(document_loaders): add tencent cos directory and file loader (#6401)
<!-- Thank you for contributing to LangChain! Your PR will appear in our release under the title you set. Please make sure it highlights your valuable contribution. Replace this with a description of the change, the issue it fixes (if applicable), and relevant context. List any dependencies required for this change. After you're done, someone will review your PR. They may suggest improvements. If no one reviews your PR within a few days, feel free to @-mention the same people again, as notifications can get lost. Finally, we'd love to show appreciation for your contribution - if you'd like us to shout you out on Twitter, please also include your handle! --> <!-- Remove if not applicable --> - add tencent cos directory and file support for document-loader #### Before submitting <!-- If you're adding a new integration, please include: 1. a test for the integration - favor unit tests that does not rely on network access. 2. an example notebook showing its use See contribution guidelines for more information on how to write tests, lint etc: https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md --> #### Who can review? @eyurtsevpull/6857/head
parent
d6cd0deaef
commit
a435a436c1
@ -0,0 +1,116 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a634365e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tencent COS Directory\n",
|
||||
"\n",
|
||||
"This covers how to load document objects from a `Tencent COS Directory`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "85e97267",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#! pip install cos-python-sdk-v5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2f0cd6a5",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TencentCOSDirectoryLoader\n",
|
||||
"from qcloud_cos import CosConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "321cc7f1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"conf = CosConfig(\n",
|
||||
" Region=\"your cos region\",\n",
|
||||
" SecretId=\"your cos secret_id\",\n",
|
||||
" SecretKey=\"your cos secret_key\",\n",
|
||||
" )\n",
|
||||
"loader = TencentCOSDirectoryLoader(conf=conf, bucket=\"you_cos_bucket\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c50d2c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0690c40a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Specifying a prefix\n",
|
||||
"You can also specify a prefix for more fine-grained control over what files to load."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "72d44781",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TencentCOSDirectoryLoader(conf=conf, bucket=\"you_cos_bucket\", prefix=\"fake\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2d3c32db",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,91 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a634365e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Tencent COS File\n",
|
||||
"\n",
|
||||
"This covers how to load a document object from a `Tencent COS File`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "85e97267",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#! pip install cos-python-sdk-v5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2f0cd6a5",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TencentCOSFileLoader\n",
|
||||
"from qcloud_cos import CosConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "321cc7f1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"conf = CosConfig(\n",
|
||||
" Region=\"your cos region\",\n",
|
||||
" SecretId=\"your cos secret_id\",\n",
|
||||
" SecretKey=\"your cos secret_key\",\n",
|
||||
" )\n",
|
||||
"loader = TencentCOSFileLoader(conf=conf, bucket=\"you_cos_bucket\", key=\"fake.docx\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c50d2c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0690c40a",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
"""Loading logic for loading documents from Tencent Cloud COS directory."""
|
||||
from typing import Any, Iterator, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader
|
||||
|
||||
|
||||
class TencentCOSDirectoryLoader(BaseLoader):
    """Load documents from the objects of a Tencent Cloud COS bucket.

    Every object whose key starts with ``prefix`` is handed to a
    ``TencentCOSFileLoader``; keys ending in ``/`` are skipped.
    """

    def __init__(self, conf: Any, bucket: str, prefix: str = ""):
        """Initialize with COS config, bucket and prefix.

        :param conf(CosConfig): COS config.
        :param bucket(str): COS bucket.
        :param prefix(str): prefix.
        """
        self.conf = conf
        self.bucket = bucket
        self.prefix = prefix

    def load(self) -> List[Document]:
        """Load all documents eagerly and return them as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Load documents one object at a time.

        Keys are consumed as each listing page arrives, so documents are
        yielded without first collecting the complete bucket listing.

        :raises ValueError: if the ``qcloud_cos`` package cannot be imported.
        """
        try:
            from qcloud_cos import CosS3Client
        except ImportError as err:
            raise ValueError(
                "Could not import cos-python-sdk-v5 python package. "
                "Please install it with `pip install cos-python-sdk-v5`."
            ) from err
        client = CosS3Client(self.conf)
        for key in self._iter_keys(client):
            # Keys with a trailing slash are skipped rather than downloaded.
            if key.endswith("/"):
                continue
            loader = TencentCOSFileLoader(self.conf, self.bucket, key)
            # Yield every document the file loader returns instead of
            # indexing [0], which would drop extra documents and raise
            # IndexError on an empty result.
            yield from loader.load()

    def _iter_keys(self, client: Any) -> Iterator[str]:
        """Yield object keys under ``self.prefix``, one listing page at a time.

        :param client: object exposing ``list_objects``; built from
            ``CosS3Client`` by ``lazy_load``.
        """
        marker = ""
        while True:
            response = client.list_objects(
                Bucket=self.bucket, Prefix=self.prefix, Marker=marker, MaxKeys=1000
            )
            # A page without a "Contents" entry contributes no keys.
            for content in response.get("Contents", []):
                yield content["Key"]
            # The truncation flag is compared as the string "false".
            if response["IsTruncated"] == "false":
                break
            marker = response["NextMarker"]
|
@ -0,0 +1,48 @@
|
||||
"""Loading logic for loading documents from Tencent Cloud COS file."""
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Any, Iterator, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class TencentCOSFileLoader(BaseLoader):
    """Load documents from a single object stored in Tencent Cloud COS."""

    def __init__(self, conf: Any, bucket: str, key: str):
        """Store the COS config, bucket name and object key.

        :param conf(CosConfig): COS config.
        :param bucket(str): COS bucket.
        :param key(str): COS file key.
        """
        self.conf, self.bucket, self.key = conf, bucket, key

    def load(self) -> List[Document]:
        """Return every document produced by ``lazy_load`` as a list."""
        return [document for document in self.lazy_load()]

    def lazy_load(self) -> Iterator[Document]:
        """Download the object to a temporary directory and parse it.

        The download and the ``UnstructuredFileLoader.load()`` call both run
        inside the temporary-directory context, so parsing is finished before
        the downloaded file is removed. The result is an iterator over the
        already-loaded documents.

        :raises ValueError: if the ``qcloud_cos`` package cannot be imported.
        """
        try:
            from qcloud_cos import CosS3Client
        except ImportError:
            raise ValueError(
                "Could not import cos-python-sdk-v5 python package. "
                "Please install it with `pip install cos-python-sdk-v5`."
            )

        cos_client = CosS3Client(self.conf)
        with tempfile.TemporaryDirectory() as workdir:
            # Mirror bucket/key beneath the temporary directory.
            local_path = f"{workdir}/{self.bucket}/{self.key}"
            parent_dir = os.path.dirname(local_path)
            os.makedirs(parent_dir, exist_ok=True)
            # Fetch the object into the mirrored location.
            cos_client.download_file(
                Bucket=self.bucket, Key=self.key, DestFilePath=local_path
            )
            # UnstructuredFileLoader does not implement lazy_load yet, so
            # load eagerly and wrap the resulting list in an iterator.
            documents = UnstructuredFileLoader(local_path).load()
            return iter(documents)
|
Loading…
Reference in New Issue