Harrison/everynote (#974)

Co-authored-by: Harrison Chase <harrisonchase@Harrisons-MBP.attlocal.net>
This commit is contained in:
Harrison Chase 2023-02-10 08:02:35 -08:00 committed by GitHub
parent 3d639d1539
commit 5469d898a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 180 additions and 0 deletions

View File

@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "56ac1584",
"metadata": {},
"source": [
"# EveryNote\n",
"\n",
"How to load EveryNote file from disk."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1a53ece0",
"metadata": {},
"outputs": [],
"source": [
"# !pip install pypandoc\n",
"# import pypandoc\n",
"\n",
"# pypandoc.download_pandoc()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "88df766f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='testing this\\n\\nwhat happens?\\n\\nto the world?\\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.document_loaders import EveryNoteLoader\n",
"\n",
"loader = EveryNoteLoader(\"example_data/testing.enex\")\n",
"loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1329905",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20230309T035336Z" application="Evernote" version="10.53.2">
<note>
<title>testing</title>
<created>20230209T034746Z</created>
<updated>20230209T035328Z</updated>
<note-attributes>
<author>Harrison Chase</author>
</note-attributes>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>testing this</div><div>what happens?</div><div>to the world?</div></en-note> ]]>
</content>
</note>
</en-export>

View File

@ -5,6 +5,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.everynote import EveryNoteLoader
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.googledrive import GoogleDriveLoader
@ -46,4 +47,5 @@ __all__ = [
"AZLyricsLoader", "AZLyricsLoader",
"CollegeConfidentialLoader", "CollegeConfidentialLoader",
"GutenbergLoader", "GutenbergLoader",
"EveryNoteLoader",
] ]

View File

@ -0,0 +1,82 @@
"""Load documents from Everynote.
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
from base64 import b64decode
from time import strptime
from typing import Any, Dict, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def _parse_content(content: str) -> str:
    """Convert a note body from HTML to pandoc's ``org`` output format.

    ``pypandoc`` is imported inside the function, so the dependency is only
    required when a note body is actually converted.
    """
    import pypandoc

    return pypandoc.convert_text(content, "org", format="html")
def _parse_resource(resource: list) -> dict:
    """Flatten the children of a ``<resource>`` element into a dict.

    Every child contributes one entry keyed by its tag. The ``data`` child
    is base64-decoded to bytes (missing or empty text becomes ``b""``) and an
    additional ``hash`` entry stores the MD5 hex digest of those bytes. All
    other children keep their raw text.
    """
    parsed: Dict[str, Any] = {}
    for child in resource:
        tag = child.tag
        if tag != "data":
            parsed[tag] = child.text
            continue
        # The payload text is sometimes None; fall back to empty bytes.
        payload = b64decode(child.text) if child.text else b""
        parsed[tag] = payload
        parsed["hash"] = hashlib.md5(payload).hexdigest()
    return parsed
def _parse_note(note: List) -> dict:
    """Convert the children of a ``<note>`` element into a dict.

    Args:
        note: Iterable of child elements, each exposing ``tag`` and ``text``.

    Returns:
        A dict keyed by child tag. ``content`` holds the converted body and
        ``content-raw`` the untouched original text; ``created`` and
        ``updated`` are ``time.struct_time`` values, or ``None`` when the
        element is empty; ``resource`` is always present and lists every
        parsed ``<resource>`` child; any other tag maps to its raw text.

    Raises:
        ValueError: If a non-empty timestamp does not match
            ``%Y%m%dT%H%M%SZ``.
    """
    note_dict: Dict[str, Any] = {}
    resources = []
    for elem in note:
        tag, text = elem.tag, elem.text
        if tag == "content":
            # An empty <content/> has text None; skip the converter for it.
            note_dict[tag] = _parse_content(text) if text else ""
            # A copy of original content
            note_dict["content-raw"] = text
        elif tag == "resource":
            resources.append(_parse_resource(elem))
        elif tag in ("created", "updated"):
            # Empty elements have text None, and pretty-printed exports may
            # pad the value with whitespace; strptime accepts neither.
            stamp = (text or "").strip()
            note_dict[tag] = (
                strptime(stamp, "%Y%m%dT%H%M%SZ") if stamp else None
            )
        else:
            note_dict[tag] = text
    note_dict["resource"] = resources
    return note_dict
def _parse_note_xml(xml_file: str) -> str:
    """Parse an EveryNote export and return the text of all its notes.

    Args:
        xml_file: Path of the export file, handed to ``lxml.etree.iterparse``.

    Returns:
        The converted ``content`` of every ``<note>`` element, concatenated
        in document order with no separator. Notes without a ``<content>``
        child contribute nothing.
    """
    # Without huge_tree set to True, parser may complain about huge text node
    # Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined"
    from lxml import etree

    context = etree.iterparse(
        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
    )
    contents = []
    for _action, elem in context:
        if elem.tag != "note":
            continue
        # Use .get() rather than ["content"]: a note without a <content>
        # child has no such key and must not abort the whole load.
        contents.append(_parse_note(elem).get("content") or "")
        # The note has been fully converted to plain Python values, so its
        # subtree can be released before the next note is parsed.
        elem.clear()
    return "".join(contents)
class EveryNoteLoader(BaseLoader):
    """Loader that reads an EveryNote file into a single document."""

    def __init__(self, file_path: str):
        """Store the path of the file to load."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Parse the file and return its text as a one-element list.

        The document's metadata records the file path under ``source``.
        """
        source = self.file_path
        document = Document(
            page_content=_parse_note_xml(source),
            metadata={"source": source},
        )
        return [document]