diff --git a/docs/modules/indexes/document_loaders.rst b/docs/modules/indexes/document_loaders.rst index 54ff07cf..fc9375c7 100644 --- a/docs/modules/indexes/document_loaders.rst +++ b/docs/modules/indexes/document_loaders.rst @@ -123,6 +123,7 @@ We need access tokens and sometime other parameters to get access to these datas ./document_loaders/examples/google_cloud_storage_file.ipynb ./document_loaders/examples/google_drive.ipynb ./document_loaders/examples/image_captions.ipynb + ./document_loaders/examples/joplin.ipynb ./document_loaders/examples/microsoft_onedrive.ipynb ./document_loaders/examples/modern_treasury.ipynb ./document_loaders/examples/notiondb.ipynb diff --git a/docs/modules/indexes/document_loaders/examples/joplin.ipynb b/docs/modules/indexes/document_loaders/examples/joplin.ipynb new file mode 100644 index 00000000..78dc5918 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/joplin.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Joplin\n", + "\n", + ">[Joplin](https://joplinapp.org/) is an open source note-taking app. Capture your thoughts and securely access them from any device.\n", + "\n", + "This notebook covers how to load documents from a `Joplin` database.\n", + "\n", + "`Joplin` has a [REST API](https://joplinapp.org/api/references/rest_api/) for accessing its local database. This loader uses the API to retrieve all notes in the database and their metadata. This requires an access token that can be obtained from the app by following these steps:\n", + "\n", + "1. Open the `Joplin` app. The app must stay open while the documents are being loaded.\n", + "2. Go to settings / options and select \"Web Clipper\".\n", + "3. Make sure that the Web Clipper service is enabled.\n", + "4. 
Under \"Advanced Options\", copy the authorization token.\n", + "\n", + "You may either initialize the loader directly with the access token, or store it in the environment variable JOPLIN_ACCESS_TOKEN.\n", + "\n", + "An alternative to this approach is to export `Joplin`'s note database to Markdown files (optionally, with Front Matter metadata) and use a Markdown loader, such as ObsidianLoader, to load them." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "007c5cbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import JoplinLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "loader = JoplinLoader(access_token=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa93b965", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 6e693c21..96935480 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -46,6 +46,7 @@ from langchain.document_loaders.ifixit import IFixitLoader from langchain.document_loaders.image import UnstructuredImageLoader from langchain.document_loaders.image_captions import ImageCaptionLoader from langchain.document_loaders.imsdb import 
class JoplinLoader(BaseLoader):
    """Loader that fetches notes from Joplin.

    In order to use this loader, you need to have Joplin running with the
    Web Clipper enabled (look for "Web Clipper" in the app settings).

    To get the access token, you need to go to the Web Clipper options and
    under "Advanced Options" you will find the access token.

    You can find more information about the Web Clipper service here:
    https://joplinapp.org/clipper/

    Each note becomes one ``Document`` whose ``page_content`` is the note body
    and whose metadata holds ``source``, ``folder``, ``tags``, ``title``,
    ``created_time`` and ``updated_time``.
    """

    def __init__(
        self,
        access_token: Optional[str] = None,
        port: int = 41184,
        host: str = "localhost",
    ) -> None:
        """Build the URL templates used to query the Joplin service.

        Args:
            access_token: Authorization token of the Web Clipper service. When
                omitted (or empty) it is looked up through ``get_from_env``
                under ``JOPLIN_ACCESS_TOKEN``.
            port: Port the Joplin service listens on.
            host: Host the Joplin service runs on.
        """
        access_token = access_token or get_from_env(
            "access_token", "JOPLIN_ACCESS_TOKEN"
        )
        base_url = f"http://{host}:{port}"
        # Both fragments must be f-strings: adjacent literals are processed
        # independently, so in a plain string "{{page}}" would stay doubled and
        # ``.format(page=...)`` would emit the literal text "{page}" instead of
        # the page number, which breaks pagination.
        self._get_note_url = (
            f"{base_url}/notes?token={access_token}"
            f"&fields=id,parent_id,title,body,created_time,updated_time&page={{page}}"
        )
        self._get_folder_url = (
            f"{base_url}/folders/{{id}}?token={access_token}&fields=title"
        )
        self._get_tag_url = (
            f"{base_url}/notes/{{id}}/tags?token={access_token}&fields=title"
        )

    def _fetch_json(self, url: str) -> dict:
        """Send a GET request to ``url`` and return the decoded JSON payload."""
        # ``import urllib`` alone does not guarantee that the ``request``
        # submodule is loaded, so import it explicitly where it is used.
        import urllib.request

        with urllib.request.urlopen(urllib.request.Request(url)) as response:
            return json.loads(response.read().decode())

    def _get_notes(self) -> Iterator[Document]:
        """Yield one ``Document`` per note, following pages until exhausted."""
        # Many notes share a folder; remember titles already resolved during
        # this run so each folder is requested only once.
        folder_titles: dict = {}
        has_more = True
        page = 1
        while has_more:
            json_data = self._fetch_json(self._get_note_url.format(page=page))
            for note in json_data["items"]:
                folder_id = note["parent_id"]
                if folder_id not in folder_titles:
                    folder_titles[folder_id] = self._get_folder(folder_id)
                metadata = {
                    "source": LINK_NOTE_TEMPLATE.format(id=note["id"]),
                    "folder": folder_titles[folder_id],
                    "tags": self._get_tags(note["id"]),
                    "title": note["title"],
                    "created_time": self._convert_date(note["created_time"]),
                    "updated_time": self._convert_date(note["updated_time"]),
                }
                yield Document(page_content=note["body"], metadata=metadata)

            has_more = json_data["has_more"]
            page += 1

    def _get_folder(self, folder_id: str) -> str:
        """Return the title of the folder identified by ``folder_id``."""
        json_data = self._fetch_json(self._get_folder_url.format(id=folder_id))
        return json_data["title"]

    def _get_tags(self, note_id: str) -> List[str]:
        """Return the tag titles listed in the response for ``note_id``."""
        json_data = self._fetch_json(self._get_tag_url.format(id=note_id))
        return [tag["title"] for tag in json_data["items"]]

    def _convert_date(self, date: int) -> str:
        """Format a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

        The conversion uses ``datetime.fromtimestamp`` without a timezone, so
        the result is expressed in the local time of the running process.
        """
        return datetime.fromtimestamp(date / 1000).strftime("%Y-%m-%d %H:%M:%S")

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield the notes as ``Document`` objects."""
        yield from self._get_notes()

    def load(self) -> List[Document]:
        """Fetch every note and return them as a list of ``Document`` objects."""
        return list(self.lazy_load())