From ad48f55357ad92e995b29bf24b7b2290659806e4 Mon Sep 17 00:00:00 2001 From: Dounx Date: Wed, 6 Mar 2024 07:54:07 +0800 Subject: [PATCH] community[minor]: add Yuque document loader (#17924) This pull request supports loading documents from Yuque with Langchain. Yuque is a professional cloud-based knowledge base for team collaboration in documentation. Website: https://www.yuque.com OpenAPI: https://www.yuque.com/yuque/developer/openapi --- .../integrations/document_loaders/yuque.ipynb | 77 ++++++++++++++++ .../document_loaders/__init__.py | 2 + .../document_loaders/yuque.py | 92 +++++++++++++++++++ .../document_loaders/test_imports.py | 1 + .../langchain/document_loaders/__init__.py | 1 + .../document_loaders/test_imports.py | 1 + 6 files changed, 174 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/yuque.ipynb create mode 100644 libs/community/langchain_community/document_loaders/yuque.py diff --git a/docs/docs/integrations/document_loaders/yuque.ipynb b/docs/docs/integrations/document_loaders/yuque.ipynb new file mode 100644 index 0000000000..658aae5cf9 --- /dev/null +++ b/docs/docs/integrations/document_loaders/yuque.ipynb @@ -0,0 +1,77 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# Yuque\n", + "\n", + ">[Yuque](https://www.yuque.com/) is a professional cloud-based knowledge base for team collaboration in documentation.\n", + "\n", + "This notebook covers how to load documents from `Yuque`.\n", + "\n", + "You can obtain the personal access token by clicking on your personal avatar in the [Personal Settings](https://www.yuque.com/settings/tokens) page."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ec8a3b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import YuqueLoader" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "loader = YuqueLoader(access_token=\"\")" + ], + "metadata": { + "collapsed": false + }, + "id": "2ea958f0327ed6e8" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3470dadf", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 4d708a89ad..21d9f39d68 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -234,6 +234,7 @@ from langchain_community.document_loaders.youtube import ( GoogleApiYoutubeLoader, YoutubeLoader, ) +from langchain_community.document_loaders.yuque import YuqueLoader # Legacy: only for backwards compatibility. 
Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
@@ -421,4 +422,5 @@ __all__ = [
     "XorbitsLoader",
     "YoutubeAudioLoader",
     "YoutubeLoader",
+    "YuqueLoader",
 ]
diff --git a/libs/community/langchain_community/document_loaders/yuque.py b/libs/community/langchain_community/document_loaders/yuque.py
new file mode 100644
index 0000000000..9947c948a1
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/yuque.py
@@ -0,0 +1,113 @@
+import re
+from typing import Dict, Iterator, List, Optional
+
+import requests
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+
+class YuqueLoader(BaseLoader):
+    """Load documents from `Yuque`."""
+
+    def __init__(
+        self,
+        access_token: str,
+        api_url: str = "https://www.yuque.com",
+        timeout: Optional[float] = 30.0,
+    ):
+        """Initialize with Yuque access_token and api_url.
+
+        Args:
+            access_token: Personal access token - see https://www.yuque.com/settings/tokens.
+            api_url: Yuque API url.
+            timeout: Seconds to wait on each HTTP request before giving up;
+                ``None`` disables the timeout.
+        """
+        self.access_token = access_token
+        self.api_url = api_url
+        self.timeout = timeout
+
+    @property
+    def headers(self) -> Dict[str, str]:
+        """Return the HTTP headers sent with every request."""
+        return {
+            "Content-Type": "application/json",
+            "X-Auth-Token": self.access_token,
+        }
+
+    def get_user_id(self) -> int:
+        """Return the `data.id` field of the `/api/v2/user` response."""
+        url = f"{self.api_url}/api/v2/user"
+        response = self.http_get(url=url)
+
+        return response["data"]["id"]
+
+    def get_books(self, user_id: int) -> List[Dict]:
+        """Return the `data` list from `/api/v2/users/{user_id}/repos`."""
+        url = f"{self.api_url}/api/v2/users/{user_id}/repos"
+        response = self.http_get(url=url)
+
+        return response["data"]
+
+    def get_document_ids(self, book_id: int) -> List[int]:
+        """Return the id of every entry listed by `/api/v2/repos/{book_id}/docs`."""
+        url = f"{self.api_url}/api/v2/repos/{book_id}/docs"
+        response = self.http_get(url=url)
+
+        return [document["id"] for document in response["data"]]
+
+    def get_document(self, book_id: int, document_id: int) -> Dict:
+        """Return the `data` payload of a single document in a book."""
+        url = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
+        response = self.http_get(url=url)
+
+        return response["data"]
+
+    def parse_document(self, document: Dict) -> Document:
+        """Build a `Document` from a raw Yuque document payload."""
+        content = self.parse_document_body(document["body"])
+        metadata = {
+            "title": document["title"],
+            "description": document["description"],
+            "created_at": document["created_at"],
+            "updated_at": document["updated_at"],
+        }
+
+        return Document(page_content=content, metadata=metadata)
+
+    @staticmethod
+    def parse_document_body(body: str) -> str:
+        """Remove `lake-fontsize` opening spans and all closing span tags."""
+        result = re.sub(r'<span class="lake-fontsize-[\w]*">', "", body)
+        result = re.sub(r"</span>", "", result)
+
+        return result
+
+    def http_get(self, url: str) -> Dict:
+        """GET `url` with the auth headers and return the decoded JSON body.
+
+        Raises:
+            requests.HTTPError: If the response status is 4xx or 5xx.
+        """
+        response = requests.get(url, headers=self.headers, timeout=self.timeout)
+        response.raise_for_status()
+
+        return response.json()
+
+    def get_documents(self) -> Iterator[Document]:
+        """Yield every document of every book returned by `get_books`."""
+        user_id = self.get_user_id()
+        books = self.get_books(user_id)
+
+        for book in books:
+            book_id = book["id"]
+            document_ids = self.get_document_ids(book_id)
+            for document_id in document_ids:
+                document = self.get_document(book_id, document_id)
+                parsed_document = self.parse_document(document)
+                yield parsed_document
+
+    def load(self) -> List[Document]:
+        """Load documents from `Yuque`."""
+        return list(self.get_documents())
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index 27f5a54d43..387bf60a0f 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -180,6 +180,7 @@ EXPECTED_ALL = [
     "XorbitsLoader",
     "YoutubeAudioLoader",
     "YoutubeLoader",
+    "YuqueLoader",
 ]
 
 
diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py
index 5a5aec095e..4e816f7d35 100644
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@@ -220,4 +220,5 @@ __all__ = [
     "XorbitsLoader",
     "YoutubeAudioLoader",
     "YoutubeLoader",
+    "YuqueLoader",
 ]
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py index ad1b5a7ea3..377b43691f 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py @@ -170,6 +170,7 @@ EXPECTED_ALL = [ "XorbitsLoader", "YoutubeAudioLoader", "YoutubeLoader", + "YuqueLoader", ]