From 5469d898a90b2ab042f6a8e2975600c7e6121f47 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Fri, 10 Feb 2023 08:02:35 -0800 Subject: [PATCH] Harrison/everynote (#974) Co-authored-by: Harrison Chase --- .../document_loaders/examples/everynote.ipynb | 80 ++++++++++++++++++ .../examples/example_data/testing.enex | 16 ++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/everynote.py | 82 +++++++++++++++++++ 4 files changed, 180 insertions(+) create mode 100644 docs/modules/document_loaders/examples/everynote.ipynb create mode 100644 docs/modules/document_loaders/examples/example_data/testing.enex create mode 100644 langchain/document_loaders/everynote.py diff --git a/docs/modules/document_loaders/examples/everynote.ipynb b/docs/modules/document_loaders/examples/everynote.ipynb new file mode 100644 index 00000000..532358b0 --- /dev/null +++ b/docs/modules/document_loaders/examples/everynote.ipynb @@ -0,0 +1,80 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "56ac1584", + "metadata": {}, + "source": [ + "# EveryNote\n", + "\n", + "How to load EveryNote file from disk." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1a53ece0", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install pypandoc\n", + "# import pypandoc\n", + "\n", + "# pypandoc.download_pandoc()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "88df766f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='testing this\\n\\nwhat happens?\\n\\nto the world?\\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.document_loaders import EveryNoteLoader\n", + "\n", + "loader = EveryNoteLoader(\"example_data/testing.enex\")\n", + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1329905", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/example_data/testing.enex b/docs/modules/document_loaders/examples/example_data/testing.enex new file mode 100644 index 00000000..edff3e7a --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/testing.enex @@ -0,0 +1,16 @@ + + + + + testing + 20230209T034746Z + 20230209T035328Z + + Harrison Chase + + + +
testing this
what happens?
to the world?
]]> +
+
+
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 7496bef9..7956cea0 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -5,6 +5,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader +from langchain.document_loaders.everynote import EveryNoteLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.googledrive import GoogleDriveLoader @@ -46,4 +47,5 @@ __all__ = [ "AZLyricsLoader", "CollegeConfidentialLoader", "GutenbergLoader", + "EveryNoteLoader", ] diff --git a/langchain/document_loaders/everynote.py b/langchain/document_loaders/everynote.py new file mode 100644 index 00000000..85666084 --- /dev/null +++ b/langchain/document_loaders/everynote.py @@ -0,0 +1,82 @@ +"""Load documents from Everynote. 
# Reference: https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
import hashlib
from base64 import b64decode
from time import strptime
from typing import Any, Dict, Iterable, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

# Format of the ``created`` / ``updated`` timestamps, e.g. ``20230209T034746Z``.
_TIMESTAMP_FORMAT = "%Y%m%dT%H%M%SZ"


def _parse_content(content: str) -> str:
    """Convert a note's HTML content to Org-mode text via pandoc.

    Args:
        content: Markup taken from a note's ``content`` element.

    Returns:
        The converted text, or an empty string when ``content`` is empty or
        ``None`` (an empty ``content`` element yields ``None`` text).
    """
    if not content:
        return ""

    # Imported lazily so pypandoc is only needed when a note is converted.
    from pypandoc import convert_text

    return convert_text(content, "org", format="html")


def _parse_resource(resource: Iterable[Any]) -> dict:
    """Collect the children of a ``resource`` element into a dictionary.

    Args:
        resource: Iterable of child elements exposing ``tag`` and ``text``.

    Returns:
        A dict mapping each child tag to its text. The ``data`` child is
        base64-decoded to bytes and an extra ``hash`` key holds the MD5 hex
        digest of those bytes.
    """
    rsc_dict: Dict[str, Any] = {}
    for elem in resource:
        if elem.tag == "data":
            # elem.text is sometimes None; treat that as empty binary data.
            payload = b64decode(elem.text) if elem.text else b""
            rsc_dict[elem.tag] = payload
            rsc_dict["hash"] = hashlib.md5(payload).hexdigest()
        else:
            rsc_dict[elem.tag] = elem.text

    return rsc_dict


def _parse_note(note: Iterable[Any]) -> dict:
    """Convert a ``note`` element into a dictionary.

    Args:
        note: Iterable of child elements exposing ``tag`` and ``text``.

    Returns:
        A dict with one entry per child tag. ``content`` holds the converted
        text and ``content-raw`` the original markup; ``created`` and
        ``updated`` are parsed into ``time.struct_time`` (``None`` when the
        element is empty); ``resource`` is always present and holds a list of
        parsed resources. Every other tag maps to its raw text.

    Raises:
        ValueError: If a non-empty timestamp does not match the expected
            ``YYYYMMDDTHHMMSSZ`` format.
    """
    note_dict: Dict[str, Any] = {}
    resources = []
    for elem in note:
        if elem.tag == "content":
            note_dict[elem.tag] = _parse_content(elem.text)
            # Keep a copy of the original, unconverted content.
            note_dict["content-raw"] = elem.text
        elif elem.tag == "resource":
            resources.append(_parse_resource(elem))
        elif elem.tag in ("created", "updated"):
            # An empty timestamp element has text None, which strptime rejects.
            note_dict[elem.tag] = (
                strptime(elem.text, _TIMESTAMP_FORMAT) if elem.text else None
            )
        else:
            note_dict[elem.tag] = elem.text

    note_dict["resource"] = resources

    return note_dict


def _parse_note_xml(xml_file: str) -> str:
    """Parse an everynote xml file and return the text of all its notes.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        The converted content of every ``note`` element, concatenated in
        document order. Notes without content contribute an empty string.
    """
    # Without huge_tree set to True, parser may complain about huge text node.
    # Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined".
    from lxml import etree

    context = etree.iterparse(
        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
    )
    contents: List[str] = []
    for _action, elem in context:
        if elem.tag != "note":
            continue
        # A note may lack a <content> child; fall back to an empty string
        # instead of raising KeyError.
        contents.append(_parse_note(elem).get("content") or "")
        # The note is fully processed; release its subtree so large exports
        # do not accumulate in memory.
        elem.clear()
    return "".join(contents)


class EveryNoteLoader(BaseLoader):
    """Loader to load in EveryNote files."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the EveryNote file to load.
        """
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load document from EveryNote file.

        Returns:
            A single-element list containing one ``Document`` whose
            ``page_content`` is the concatenated text of all notes and whose
            metadata records the file path under ``source``.
        """
        text = _parse_note_xml(self.file_path)
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]