mirror of https://github.com/hwchase17/langchain
community[minor]: add Yuque document loader (#17924)
This pull request adds support for loading documents from Yuque with LangChain. Yuque is a professional cloud-based knowledge base for team collaboration in documentation. Website: https://www.yuque.com OpenAPI: https://www.yuque.com/yuque/developer/openapi
pull/18663/head
parent
60c5d964a8
commit
ad48f55357
@ -0,0 +1,77 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "66a7777e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Yuque\n",
|
||||
"\n",
|
||||
">[Yuque](https://www.yuque.com/) is a professional cloud-based knowledge base for team collaboration in documentation.\n",
|
||||
"\n",
|
||||
"This notebook covers how to load documents from `Yuque`.\n",
|
||||
"\n",
|
||||
"You can obtain a personal access token from the [Personal Settings](https://www.yuque.com/settings/tokens) page, which you can reach by clicking on your avatar."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ec8a3b3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import YuqueLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = YuqueLoader(access_token=\"<your_personal_access_token>\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "2ea958f0327ed6e8"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3470dadf",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,92 @@
|
||||
import re
|
||||
from typing import Dict, Iterator, List
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class YuqueLoader(BaseLoader):
    """Load documents from `Yuque`.

    The loader resolves the user that owns the access token, lists that
    user's books (repos), lists the documents of every book, fetches each
    document and converts it into a ``Document``.
    """

    # Fallback used when ``__init__`` is bypassed (e.g. by a subclass that
    # does not call ``super().__init__``), so ``http_get`` always has a value.
    _DEFAULT_TIMEOUT = 30.0
    timeout: float = _DEFAULT_TIMEOUT

    def __init__(
        self,
        access_token: str,
        api_url: str = "https://www.yuque.com",
        timeout: float = _DEFAULT_TIMEOUT,
    ):
        """Initialize with Yuque access_token and api_url.

        Args:
            access_token: Personal access token - see
                https://www.yuque.com/settings/tokens.
            api_url: Yuque API url.
            timeout: Seconds to wait for the server on each HTTP request
                before ``requests`` raises a timeout error.
        """
        self.access_token = access_token
        self.api_url = api_url
        self.timeout = timeout

    @property
    def headers(self) -> Dict[str, str]:
        """HTTP headers sent with every request, including the auth token."""
        return {
            "Content-Type": "application/json",
            "X-Auth-Token": self.access_token,
        }

    def get_user_id(self) -> int:
        """Return the ``id`` of the user the access token belongs to."""
        url = f"{self.api_url}/api/v2/user"
        response = self.http_get(url=url)

        return response["data"]["id"]

    def get_books(self, user_id: int) -> List[Dict]:
        """Return the ``data`` list of the repos endpoint for ``user_id``."""
        # NOTE(review): only a single response is read here; confirm whether
        # this endpoint paginates for users with many books.
        url = f"{self.api_url}/api/v2/users/{user_id}/repos"
        response = self.http_get(url=url)

        return response["data"]

    def get_document_ids(self, book_id: int) -> List[int]:
        """Return the ``id`` of every document listed for ``book_id``."""
        # NOTE(review): only a single response is read here; confirm whether
        # this endpoint paginates for books with many documents.
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs"
        response = self.http_get(url=url)

        return [document["id"] for document in response["data"]]

    def get_document(self, book_id: int, document_id: int) -> Dict:
        """Return the ``data`` payload of one document of a book."""
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
        response = self.http_get(url=url)

        return response["data"]

    def parse_document(self, document: Dict) -> Document:
        """Convert a raw document payload into a ``Document``.

        The cleaned ``body`` becomes the page content; ``title``,
        ``description``, ``created_at`` and ``updated_at`` are copied into
        the metadata. A ``KeyError`` is raised if any of these keys is
        missing from ``document``.
        """
        content = self.parse_document_body(document["body"])
        metadata = {
            "title": document["title"],
            "description": document["description"],
            "created_at": document["created_at"],
            "updated_at": document["updated_at"],
        }

        return Document(page_content=content, metadata=metadata)

    @staticmethod
    def parse_document_body(body: str) -> str:
        """Strip empty ``<a name="..."></a>`` anchors and ``<br>`` tags.

        Args:
            body: Raw document body.

        Returns:
            The body with the anchor and line-break tags removed.
        """
        # ``[^"]*`` keeps the match inside one attribute value. A greedy
        # ``.*`` would span from the first anchor on a line to the last one
        # and delete the real text between them.
        result = re.sub(r'<a name="[^"]*"></a>', "", body)
        result = re.sub(r"<br\s*/?>", "", result)

        return result

    def http_get(self, url: str) -> Dict:
        """GET ``url`` with the auth headers and return the decoded JSON.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within
                ``self.timeout`` seconds.
        """
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()

        return response.json()

    def get_documents(self) -> Iterator[Document]:
        """Yield every document of every book of the token's user.

        Documents are fetched one at a time as the iterator is consumed.
        """
        user_id = self.get_user_id()
        books = self.get_books(user_id)

        for book in books:
            book_id = book["id"]
            document_ids = self.get_document_ids(book_id)
            for document_id in document_ids:
                document = self.get_document(book_id, document_id)
                parsed_document = self.parse_document(document)
                yield parsed_document

    def load(self) -> List[Document]:
        """Load documents from `Yuque`."""
        return list(self.get_documents())
|
Loading…
Reference in New Issue