forked from Archives/langchain
Add Joplin document loader (#5153)
# Add Joplin document loader [Joplin](https://joplinapp.org/) is an open source note-taking app. Joplin has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. The proposed `JoplinLoader` uses the API to retrieve all notes in the database and their metadata. Joplin needs to be installed and running locally, and an access token is required. - The PR includes an integration test. - The PR includes an example notebook. --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
parent
f10be072ff
commit
44abe925df
@ -123,6 +123,7 @@ We need access tokens and sometime other parameters to get access to these datas
|
|||||||
./document_loaders/examples/google_cloud_storage_file.ipynb
|
./document_loaders/examples/google_cloud_storage_file.ipynb
|
||||||
./document_loaders/examples/google_drive.ipynb
|
./document_loaders/examples/google_drive.ipynb
|
||||||
./document_loaders/examples/image_captions.ipynb
|
./document_loaders/examples/image_captions.ipynb
|
||||||
|
./document_loaders/examples/joplin.ipynb
|
||||||
./document_loaders/examples/microsoft_onedrive.ipynb
|
./document_loaders/examples/microsoft_onedrive.ipynb
|
||||||
./document_loaders/examples/modern_treasury.ipynb
|
./document_loaders/examples/modern_treasury.ipynb
|
||||||
./document_loaders/examples/notiondb.ipynb
|
./document_loaders/examples/notiondb.ipynb
|
||||||
|
89
docs/modules/indexes/document_loaders/examples/joplin.ipynb
Normal file
89
docs/modules/indexes/document_loaders/examples/joplin.ipynb
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1dc7df1d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Joplin\n",
|
||||||
|
"\n",
|
||||||
|
">[Joplin](https://joplinapp.org/) is an open source note-taking app. Capture your thoughts and securely access them from any device.\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook covers how to load documents from a `Joplin` database.\n",
|
||||||
|
"\n",
|
||||||
|
"`Joplin` has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. This loader uses the API to retrieve all notes in the database and their metadata. This requires an access token that can be obtained from the app by following these steps:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Open the `Joplin` app. The app must stay open while the documents are being loaded.\n",
|
||||||
|
"2. Go to settings / options and select \"Web Clipper\".\n",
|
||||||
|
"3. Make sure that the Web Clipper service is enabled.\n",
|
||||||
|
"4. Under \"Advanced Options\", copy the authorization token.\n",
|
||||||
|
"\n",
|
||||||
|
"You may either initialize the loader directly with the access token, or store it in the environment variable JOPLIN_ACCESS_TOKEN.\n",
|
||||||
|
"\n",
|
||||||
|
"An alternative to this approach is to export `Joplin`'s note database to Markdown files (optionally, with Front Matter metadata) and use a Markdown loader, such as ObsidianLoader, to load them."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "007c5cbf",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import JoplinLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "a1caec59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = JoplinLoader(access_token=\"<access-token>\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "b1c30ff7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "fa93b965",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.11"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -46,6 +46,7 @@ from langchain.document_loaders.ifixit import IFixitLoader
|
|||||||
from langchain.document_loaders.image import UnstructuredImageLoader
|
from langchain.document_loaders.image import UnstructuredImageLoader
|
||||||
from langchain.document_loaders.image_captions import ImageCaptionLoader
|
from langchain.document_loaders.image_captions import ImageCaptionLoader
|
||||||
from langchain.document_loaders.imsdb import IMSDbLoader
|
from langchain.document_loaders.imsdb import IMSDbLoader
|
||||||
|
from langchain.document_loaders.joplin import JoplinLoader
|
||||||
from langchain.document_loaders.json_loader import JSONLoader
|
from langchain.document_loaders.json_loader import JSONLoader
|
||||||
from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
|
from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
|
||||||
from langchain.document_loaders.mastodon import MastodonTootsLoader
|
from langchain.document_loaders.mastodon import MastodonTootsLoader
|
||||||
@ -160,6 +161,7 @@ __all__ = [
|
|||||||
"IFixitLoader",
|
"IFixitLoader",
|
||||||
"IMSDbLoader",
|
"IMSDbLoader",
|
||||||
"ImageCaptionLoader",
|
"ImageCaptionLoader",
|
||||||
|
"JoplinLoader",
|
||||||
"JSONLoader",
|
"JSONLoader",
|
||||||
"MWDumpLoader",
|
"MWDumpLoader",
|
||||||
"MastodonTootsLoader",
|
"MastodonTootsLoader",
|
||||||
|
88
langchain/document_loaders/joplin.py
Normal file
88
langchain/document_loaders/joplin.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
import json
import urllib
import urllib.parse
import urllib.request
from datetime import datetime
from typing import Dict, Iterator, List, Optional

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
from langchain.utils import get_from_env
|
||||||
|
|
||||||
|
# URL template stored as the "source" metadata of every loaded note;
# "{id}" is filled in with the note's id via str.format().
LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}"
|
||||||
|
|
||||||
|
|
||||||
|
class JoplinLoader(BaseLoader):
    """
    Loader that fetches notes from Joplin.

    In order to use this loader, you need to have Joplin running with the
    Web Clipper enabled (look for "Web Clipper" in the app settings).

    To get the access token, you need to go to the Web Clipper options and
    under "Advanced Options" you will find the access token.

    You can find more information about the Web Clipper service here:
    https://joplinapp.org/clipper/

    Every note is returned as one ``Document`` whose ``page_content`` is the
    note body and whose metadata contains the keys ``source``, ``folder``,
    ``tags``, ``title``, ``created_time`` and ``updated_time``.
    """

    def __init__(
        self,
        access_token: Optional[str] = None,
        port: int = 41184,
        host: str = "localhost",
    ) -> None:
        """Build the request URL templates for the Joplin REST API.

        Args:
            access_token: Web Clipper authorization token. When falsy, the
                value is looked up through ``get_from_env`` using the
                ``JOPLIN_ACCESS_TOKEN`` environment variable.
            port: Port the Joplin Web Clipper service listens on.
            host: Host name of the machine running Joplin.

        NOTE(review): ``get_from_env`` presumably raises when the variable
        is not set either -- confirm against its implementation.
        """
        access_token = access_token or get_from_env(
            "access_token", "JOPLIN_ACCESS_TOKEN"
        )
        # Percent-encode the token so reserved characters cannot corrupt the
        # query string. This also escapes "{" and "}", which would otherwise
        # collide with the str.format() placeholders kept in the templates.
        token = urllib.parse.quote(access_token, safe="")
        base_url = f"http://{host}:{port}"

        # The "{page}" placeholder lives in a plain (non-f) literal, so it
        # must use SINGLE braces: doubled braces here would survive until
        # .format() and come out as the literal text "{page}" instead of the
        # page number, breaking pagination.
        self._get_note_url = (
            f"{base_url}/notes?token={token}"
            "&fields=id,parent_id,title,body,created_time,updated_time"
            "&page={page}"
        )
        # Inside f-strings the doubled braces collapse to a single-brace
        # "{id}" placeholder that is filled in later with str.format().
        self._get_folder_url = f"{base_url}/folders/{{id}}?token={token}&fields=title"
        self._get_tag_url = f"{base_url}/notes/{{id}}/tags?token={token}&fields=title"

    @staticmethod
    def _fetch_json(url: str) -> dict:
        """Send a GET request to ``url`` and return the decoded JSON body."""
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode())

    def _get_notes(self) -> Iterator[Document]:
        """Yield one ``Document`` per note, walking every result page.

        Pages are requested starting at 1 until the API reports
        ``has_more`` as false.
        """
        # Notes frequently share a folder; remember titles for the duration
        # of this load so each folder is requested only once.
        folder_titles: Dict[str, str] = {}
        has_more = True
        page = 1
        while has_more:
            json_data = self._fetch_json(self._get_note_url.format(page=page))
            for note in json_data["items"]:
                folder_id = note["parent_id"]
                if folder_id not in folder_titles:
                    folder_titles[folder_id] = self._get_folder(folder_id)
                metadata = {
                    "source": LINK_NOTE_TEMPLATE.format(id=note["id"]),
                    "folder": folder_titles[folder_id],
                    "tags": self._get_tags(note["id"]),
                    "title": note["title"],
                    "created_time": self._convert_date(note["created_time"]),
                    "updated_time": self._convert_date(note["updated_time"]),
                }
                yield Document(page_content=note["body"], metadata=metadata)

            has_more = json_data["has_more"]
            page += 1

    def _get_folder(self, folder_id: str) -> str:
        """Return the title of the folder identified by ``folder_id``."""
        json_data = self._fetch_json(self._get_folder_url.format(id=folder_id))
        return json_data["title"]

    def _get_tags(self, note_id: str) -> List[str]:
        """Return the titles of the tags attached to the note ``note_id``."""
        json_data = self._fetch_json(self._get_tag_url.format(id=note_id))
        return [tag["title"] for tag in json_data["items"]]

    def _convert_date(self, date: int) -> str:
        """Format a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

        The value is divided by 1000 to obtain seconds and rendered in the
        local timezone of the running process (naive
        ``datetime.fromtimestamp``).
        """
        return datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S")

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield the notes as ``Document`` objects."""
        yield from self._get_notes()

    def load(self) -> List[Document]:
        """Load all notes eagerly and return them as a list."""
        return list(self.lazy_load())
|
11
tests/integration_tests/document_loaders/test_joplin.py
Normal file
11
tests/integration_tests/document_loaders/test_joplin.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from langchain.document_loaders.joplin import JoplinLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_joplin_loader() -> None:
    """Integration test: load notes through ``JoplinLoader``.

    The loader is created without an explicit token, so the access token
    must be resolvable from the environment, and the notes database must
    contain at least one note for the per-document checks to run.
    """
    loader = JoplinLoader()
    docs = loader.load()

    assert isinstance(docs, list)
    # Fail with a readable message instead of an IndexError on docs[0].
    assert docs, "expected at least one note in the Joplin database"

    first = docs[0]
    assert isinstance(first.page_content, str)
    assert isinstance(first.metadata["source"], str)
    assert isinstance(first.metadata["title"], str)
|
Loading…
Reference in New Issue
Block a user