From 44abe925dfbdd25bb7aa89f85d190f955cf722fc Mon Sep 17 00:00:00 2001 From: Alon Diament Date: Wed, 24 May 2023 22:31:55 +0300 Subject: [PATCH] Add Joplin document loader (#5153) # Add Joplin document loader [Joplin](https://joplinapp.org/) is an open source note-taking app. Joplin has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. The proposed `JoplinLoader` uses the API to retrieve all notes in the database and their metadata. Joplin needs to be installed and running locally, and an access token is required. - The PR includes an integration test. - The PR includes an example notebook. --------- Co-authored-by: Dev 2049 --- docs/modules/indexes/document_loaders.rst | 1 + .../document_loaders/examples/joplin.ipynb | 89 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/joplin.py | 88 ++++++++++++++++++ .../document_loaders/test_joplin.py | 11 +++ 5 files changed, 191 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/joplin.ipynb create mode 100644 langchain/document_loaders/joplin.py create mode 100644 tests/integration_tests/document_loaders/test_joplin.py diff --git a/docs/modules/indexes/document_loaders.rst b/docs/modules/indexes/document_loaders.rst index 54ff07cf72..fc9375c7e9 100644 --- a/docs/modules/indexes/document_loaders.rst +++ b/docs/modules/indexes/document_loaders.rst @@ -123,6 +123,7 @@ We need access tokens and sometime other parameters to get access to these datas ./document_loaders/examples/google_cloud_storage_file.ipynb ./document_loaders/examples/google_drive.ipynb ./document_loaders/examples/image_captions.ipynb + ./document_loaders/examples/joplin.ipynb ./document_loaders/examples/microsoft_onedrive.ipynb ./document_loaders/examples/modern_treasury.ipynb ./document_loaders/examples/notiondb.ipynb diff --git a/docs/modules/indexes/document_loaders/examples/joplin.ipynb 
b/docs/modules/indexes/document_loaders/examples/joplin.ipynb new file mode 100644 index 0000000000..78dc59183b --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/joplin.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Joplin\n", + "\n", + ">[Joplin](https://joplinapp.org/) is an open source note-taking app. Capture your thoughts and securely access them from any device.\n", + "\n", + "This notebook covers how to load documents from a `Joplin` database.\n", + "\n", + "`Joplin` has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. This loader uses the API to retrieve all notes in the database and their metadata. This requires an access token that can be obtained from the app by following these steps:\n", + "\n", + "1. Open the `Joplin` app. The app must stay open while the documents are being loaded.\n", + "2. Go to settings / options and select \"Web Clipper\".\n", + "3. Make sure that the Web Clipper service is enabled.\n", + "4. Under \"Advanced Options\", copy the authorization token.\n", + "\n", + "You may either initialize the loader directly with the access token, or store it in the environment variable JOPLIN_ACCESS_TOKEN.\n", + "\n", + "An alternative to this approach is to export `Joplin`'s note database to Markdown files (optionally, with Front Matter metadata) and use a Markdown loader, such as ObsidianLoader, to load them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "007c5cbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import JoplinLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "loader = JoplinLoader(access_token=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa93b965", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 6e693c2132..9693548060 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -46,6 +46,7 @@ from langchain.document_loaders.ifixit import IFixitLoader from langchain.document_loaders.image import UnstructuredImageLoader from langchain.document_loaders.image_captions import ImageCaptionLoader from langchain.document_loaders.imsdb import IMSDbLoader +from langchain.document_loaders.joplin import JoplinLoader from langchain.document_loaders.json_loader import JSONLoader from langchain.document_loaders.markdown import UnstructuredMarkdownLoader from langchain.document_loaders.mastodon import MastodonTootsLoader @@ -160,6 +161,7 @@ __all__ = [ "IFixitLoader", "IMSDbLoader", "ImageCaptionLoader", + "JoplinLoader", "JSONLoader", "MWDumpLoader", 
class JoplinLoader(BaseLoader):
    """
    Loader that fetches notes from Joplin.

    In order to use this loader, you need to have Joplin running with the
    Web Clipper enabled (look for "Web Clipper" in the app settings).

    To get the access token, you need to go to the Web Clipper options and
    under "Advanced Options" you will find the access token.

    You can find more information about the Web Clipper service here:
    https://joplinapp.org/clipper/

    Every note is returned as one ``Document`` whose ``page_content`` is the
    note body and whose metadata holds the keys ``source`` (a ``joplin://``
    link to the note), ``folder``, ``tags``, ``title``, ``created_time`` and
    ``updated_time``.

    Args:
        access_token: Joplin authorization token. When omitted (or empty), the
            value is looked up through ``get_from_env`` using the
            ``JOPLIN_ACCESS_TOKEN`` environment variable.
        port: Port of the Joplin Web Clipper service.
        host: Host name of the Joplin Web Clipper service.
    """

    def __init__(
        self,
        access_token: Optional[str] = None,
        port: int = 41184,
        host: str = "localhost",
    ) -> None:
        # Function-scope import: a bare ``import urllib`` at file level does
        # not guarantee that the ``urllib.parse`` submodule is loaded.
        import urllib.parse

        access_token = access_token or get_from_env(
            "access_token", "JOPLIN_ACCESS_TOKEN"
        )
        # Percent-encode the token so that characters such as "&", "#", "{"
        # or "}" can neither corrupt the query string nor clash with the
        # ``str.format`` placeholders kept in the URL templates below.
        token = urllib.parse.quote(access_token, safe="")
        base_url = f"http://{host}:{port}"

        # All three templates are f-strings, so a doubled brace collapses to a
        # single-brace placeholder that is filled in later via ``.format``.
        self._get_note_url = (
            f"{base_url}/notes?token={token}"
            f"&fields=id,parent_id,title,body,created_time,updated_time"
            f"&page={{page}}"
        )
        self._get_folder_url = f"{base_url}/folders/{{id}}?token={token}&fields=title"
        self._get_tag_url = f"{base_url}/notes/{{id}}/tags?token={token}&fields=title"

    def _fetch_json(self, url: str) -> dict:
        """Issue a GET request to ``url`` and return the decoded JSON body."""
        # Import the submodule explicitly; ``import urllib`` alone only works
        # when some other module happens to have imported ``urllib.request``.
        import urllib.request

        request = urllib.request.Request(url)
        with urllib.request.urlopen(request) as response:
            return json.loads(response.read().decode())

    def _get_notes(self) -> Iterator[Document]:
        """Yield one ``Document`` per note, following the API's pagination."""
        # Folder id -> folder title. Notes commonly share a folder, so each
        # folder is looked up only once per load instead of once per note.
        folder_titles = {}
        page = 1
        while True:
            json_data = self._fetch_json(self._get_note_url.format(page=page))
            for note in json_data["items"]:
                folder_id = note["parent_id"]
                if folder_id not in folder_titles:
                    folder_titles[folder_id] = self._get_folder(folder_id)

                metadata = {
                    "source": LINK_NOTE_TEMPLATE.format(id=note["id"]),
                    "folder": folder_titles[folder_id],
                    "tags": self._get_tags(note["id"]),
                    "title": note["title"],
                    "created_time": self._convert_date(note["created_time"]),
                    "updated_time": self._convert_date(note["updated_time"]),
                }
                yield Document(page_content=note["body"], metadata=metadata)

            # The response reports whether another page exists.
            if not json_data["has_more"]:
                break
            page += 1

    def _get_folder(self, folder_id: str) -> str:
        """Return the title of the folder identified by ``folder_id``."""
        json_data = self._fetch_json(self._get_folder_url.format(id=folder_id))
        return json_data["title"]

    def _get_tags(self, note_id: str) -> List[str]:
        """Return the titles of the tags attached to the note ``note_id``."""
        json_data = self._fetch_json(self._get_tag_url.format(id=note_id))
        return [tag["title"] for tag in json_data["items"]]

    def _convert_date(self, date: int) -> str:
        """Format a millisecond timestamp as ``YYYY-MM-DD HH:MM:SS``.

        The conversion goes through ``datetime.fromtimestamp`` without a
        timezone argument, so the result is expressed in local time.
        """
        return datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S")

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield the notes as documents, requesting pages on demand."""
        yield from self._get_notes()

    def load(self) -> List[Document]:
        """Fetch every note and return them as a list of documents."""
        return list(self.lazy_load())