community[minor]: add Yuque document loader (#17924)

This pull request adds support for loading documents from Yuque with LangChain. Yuque is a professional cloud-based knowledge base for team collaboration in documentation. Website: https://www.yuque.com OpenAPI: https://www.yuque.com/yuque/developer/openapi
3 months ago · ad48f55357
parent 60c5d964a8
commit ad48f55357
6 changed files with 174 additions and 0 deletions
--- a/docs/docs/integrations/document_loaders/yuque.ipynb
+++ b/docs/docs/integrations/document_loaders/yuque.ipynb
@ -0,0 +1,77 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "66a7777e",
+   "metadata": {},
+   "source": [
+    "# Yuque\n",
+    "\n",
+    ">[Yuque](https://www.yuque.com/) is a professional cloud-based knowledge base for team collaboration in documentation.\n",
+    "\n",
+    "This notebook covers how to load documents from `Yuque`.\n",
+    "\n",
+    "You can obtain a personal access token from the [Personal Settings](https://www.yuque.com/settings/tokens) page, which you can reach by clicking on your avatar."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ec8a3b3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import YuqueLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "loader = YuqueLoader(access_token=\"<your_personal_access_token>\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "2ea958f0327ed6e8"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3470dadf",
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@ -234,6 +234,7 @@ from langchain_community.document_loaders.youtube import (
    GoogleApiYoutubeLoader,
    YoutubeLoader,
 )
+from langchain_community.document_loaders.yuque import YuqueLoader

 # Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
@ -421,4 +422,5 @@ __all__ = [
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
+    "YuqueLoader",
 ]
--- a/libs/community/langchain_community/document_loaders/yuque.py
+++ b/libs/community/langchain_community/document_loaders/yuque.py
@ -0,0 +1,92 @@
+import re
+from typing import Dict, Iterator, List
+
+import requests
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+
+class YuqueLoader(BaseLoader):
+    """Load documents from `Yuque`."""
+
+    def __init__(self, access_token: str, api_url: str = "https://www.yuque.com"):
+        """Initialize with Yuque access_token and api_url.
+
+        Args:
+            access_token: Personal access token - see https://www.yuque.com/settings/tokens.
+            api_url: Yuque API url.
+        """
+        self.access_token = access_token
+        self.api_url = api_url
+
+    @property
+    def headers(self) -> Dict[str, str]:
+        return {
+            "Content-Type": "application/json",
+            "X-Auth-Token": self.access_token,
+        }
+
+    def get_user_id(self) -> int:
+        url = f"{self.api_url}/api/v2/user"
+        response = self.http_get(url=url)
+
+        return response["data"]["id"]
+
+    def get_books(self, user_id: int) -> List[Dict]:
+        url = f"{self.api_url}/api/v2/users/{user_id}/repos"
+        response = self.http_get(url=url)
+
+        return response["data"]
+
+    def get_document_ids(self, book_id: int) -> List[int]:
+        url = f"{self.api_url}/api/v2/repos/{book_id}/docs"
+        response = self.http_get(url=url)
+
+        return [document["id"] for document in response["data"]]
+
+    def get_document(self, book_id: int, document_id: int) -> Dict:
+        url = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
+        response = self.http_get(url=url)
+
+        return response["data"]
+
+    def parse_document(self, document: Dict) -> Document:
+        content = self.parse_document_body(document["body"])
+        metadata = {
+            "title": document["title"],
+            "description": document["description"],
+            "created_at": document["created_at"],
+            "updated_at": document["updated_at"],
+        }
+
+        return Document(page_content=content, metadata=metadata)
+
+    @staticmethod
+    def parse_document_body(body: str) -> str:
+        result = re.sub(r'<a name="(.*)"></a>', "", body)
+        result = re.sub(r"<br\s*/?>", "", result)
+
+        return result
+
+    def http_get(self, url: str) -> Dict:
+        response = requests.get(url, headers=self.headers)
+        response.raise_for_status()
+
+        return response.json()
+
+    def get_documents(self) -> Iterator[Document]:
+        user_id = self.get_user_id()
+        books = self.get_books(user_id)
+
+        for book in books:
+            book_id = book["id"]
+            document_ids = self.get_document_ids(book_id)
+            for document_id in document_ids:
+                document = self.get_document(book_id, document_id)
+                parsed_document = self.parse_document(document)
+                yield parsed_document
+
+    def load(self) -> List[Document]:
+        """Load documents from `Yuque`."""
+        return list(self.get_documents())
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@ -180,6 +180,7 @@ EXPECTED_ALL = [
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
+    "YuqueLoader",
 ]


--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@ -220,4 +220,5 @@ __all__ = [
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
+    "YuqueLoader",
 ]
--- a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
@ -170,6 +170,7 @@ EXPECTED_ALL = [
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
+    "YuqueLoader",
 ]