Mirror of https://github.com/hwchase17/langchain (synced 2024-11-06 03:20:49 +00:00)
feat (documents): add LarkSuite document loader (#6420)
### Summary

This PR adds a LarkSuite (FeiShu) document loader.

> [LarkSuite](https://www.larksuite.com/) is an enterprise collaboration platform developed by ByteDance.

### Tests

- An integration test case is added.
- An example notebook showing usage is added. [Notebook preview](https://github.com/yaohui-wyh/langchain/blob/master/docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb)

### Who can review?

- PTAL @eyurtsev @hwchase17

---------

Co-authored-by: Yaohui Wang <wangyaohui.01@bytedance.com>
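For context, a minimal usage sketch of the loader introduced in this PR. The domain URL and document id below are placeholder assumptions for illustration, not values mandated by the loader:

```python
from getpass import getpass

from langchain.document_loaders.larksuite import LarkSuiteDocLoader

# Placeholder / assumed values for illustration only.
DOMAIN = "https://open.larksuite.com"  # assumed LarkSuite open-platform base URL
ACCESS_TOKEN = getpass("tenant_access_token or user_access_token: ")
DOCUMENT_ID = "<your document id>"

loader = LarkSuiteDocLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)
docs = loader.load()  # one Document containing the document's raw text content
```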
This commit is contained in:
parent
a435a436c1
commit
9d1bd18596
103  docs/extras/modules/data_connection/document_loaders/integrations/larksuite.ipynb  Normal file

@@ -0,0 +1,103 @@

{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "33205b12",
   "metadata": {},
   "source": [
    "# LarkSuite (FeiShu)\n",
    "\n",
    ">[LarkSuite](https://www.larksuite.com/) is an enterprise collaboration platform developed by ByteDance.\n",
    "\n",
    "This notebook covers how to load data from the `LarkSuite` REST API into a format that can be ingested into LangChain, along with example usage for text summarization.\n",
    "\n",
    "The LarkSuite API requires an access token (tenant_access_token or user_access_token); check out the [LarkSuite open platform documentation](https://open.larksuite.com/document) for API details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "90b69c94",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-06-19T10:05:03.645161Z",
     "start_time": "2023-06-19T10:04:49.541968Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from getpass import getpass\n",
    "from langchain.document_loaders.larksuite import LarkSuiteDocLoader\n",
    "\n",
    "DOMAIN = input(\"larksuite domain\")\n",
    "ACCESS_TOKEN = getpass(\"larksuite tenant_access_token or user_access_token\")\n",
    "DOCUMENT_ID = input(\"larksuite document id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13deb0f5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-06-19T10:05:36.016495Z",
     "start_time": "2023-06-19T10:05:35.360884Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='Test Doc\\nThis is a Test Doc\\n\\n1\\n2\\n3\\n\\n', metadata={'document_id': 'V76kdbd2HoBbYJxdiNNccajunPf', 'revision_id': 11, 'title': 'Test Doc'})]\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "larksuite_loader = LarkSuiteDocLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)\n",
    "docs = larksuite_loader.load()\n",
    "\n",
    "pprint(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ccc1e2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# see https://python.langchain.com/docs/use_cases/summarization for more details\n",
    "from langchain.chains.summarize import load_summarize_chain\n",
    "\n",
    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
    "chain.run(docs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
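Note that the last notebook cell calls `load_summarize_chain(llm, chain_type="map_reduce")` without defining `llm`. One way to supply it, as a minimal sketch assuming the OpenAI integration is installed and `OPENAI_API_KEY` is set:

```python
# Assumption: any LangChain LLM works here; OpenAI is used purely as an example.
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain

llm = OpenAI(temperature=0)
chain = load_summarize_chain(llm, chain_type="map_reduce")
summary = chain.run(docs)  # `docs` comes from larksuite_loader.load() above
```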
langchain/document_loaders/__init__.py

@@ -63,6 +63,7 @@ from langchain.document_loaders.imsdb import IMSDbLoader
 from langchain.document_loaders.iugu import IuguLoader
 from langchain.document_loaders.joplin import JoplinLoader
 from langchain.document_loaders.json_loader import JSONLoader
+from langchain.document_loaders.larksuite import LarkSuiteDocLoader
 from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
 from langchain.document_loaders.mastodon import MastodonTootsLoader
 from langchain.document_loaders.max_compute import MaxComputeLoader

@@ -204,6 +205,7 @@ __all__ = [
     "IuguLoader",
     "JSONLoader",
     "JoplinLoader",
+    "LarkSuiteDocLoader",
     "MWDumpLoader",
     "MastodonTootsLoader",
     "MathpixPDFLoader",
46  langchain/document_loaders/larksuite.py  Normal file

@@ -0,0 +1,46 @@
"""Loader that loads LarkSuite (FeiShu) document json dump."""
import json
import urllib.request
from typing import Any, Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class LarkSuiteDocLoader(BaseLoader):
    """Loader that loads LarkSuite (FeiShu) document."""

    def __init__(self, domain: str, access_token: str, document_id: str):
        """Initialize with domain, access_token (tenant / user), and document_id."""
        self.domain = domain
        self.access_token = access_token
        self.document_id = document_id

    def _get_larksuite_api_json_data(self, api_url: str) -> Any:
        """Get LarkSuite (FeiShu) API response json data."""
        headers = {"Authorization": f"Bearer {self.access_token}"}
        request = urllib.request.Request(api_url, headers=headers)
        with urllib.request.urlopen(request) as response:
            json_data = json.loads(response.read().decode())
            return json_data

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load LarkSuite (FeiShu) document."""
        api_url_prefix = f"{self.domain}/open-apis/docx/v1/documents"
        metadata_json = self._get_larksuite_api_json_data(
            f"{api_url_prefix}/{self.document_id}"
        )
        raw_content_json = self._get_larksuite_api_json_data(
            f"{api_url_prefix}/{self.document_id}/raw_content"
        )
        text = raw_content_json["data"]["content"]
        metadata = {
            "document_id": self.document_id,
            "revision_id": metadata_json["data"]["document"]["revision_id"],
            "title": metadata_json["data"]["document"]["title"],
        }
        yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load LarkSuite (FeiShu) document."""
        return list(self.lazy_load())
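Since the loader implements `lazy_load`, documents can also be consumed as an iterator rather than a list. A small sketch with placeholder credentials; the domain value is an assumption, and the metadata keys match those built in `lazy_load` above:

```python
# Placeholder credentials for illustration only.
loader = LarkSuiteDocLoader(
    domain="https://open.larksuite.com",  # assumed LarkSuite open-platform base URL
    access_token="<tenant_access_token or user_access_token>",
    document_id="<document id>",
)

for doc in loader.lazy_load():
    # Each Document carries document_id, revision_id, and title in its metadata.
    print(doc.metadata["title"], doc.metadata["revision_id"])
    print(doc.page_content[:200])
```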
14  tests/integration_tests/document_loaders/test_larksuite.py  Normal file

@@ -0,0 +1,14 @@
from langchain.document_loaders.larksuite import LarkSuiteDocLoader

DOMAIN = ""
ACCESS_TOKEN = ""
DOCUMENT_ID = ""


def test_larksuite_doc_loader() -> None:
    """Test LarkSuite (FeiShu) document loader."""
    loader = LarkSuiteDocLoader(DOMAIN, ACCESS_TOKEN, DOCUMENT_ID)
    docs = loader.load()

    assert len(docs) == 1
    assert docs[0].page_content is not None