mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/everynote (#974)
Co-authored-by: Harrison Chase <harrisonchase@Harrisons-MBP.attlocal.net>
This commit is contained in:
parent
3d639d1539
commit
5469d898a9
80
docs/modules/document_loaders/examples/everynote.ipynb
Normal file
80
docs/modules/document_loaders/examples/everynote.ipynb
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "56ac1584",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# EveryNote\n",
|
||||||
|
"\n",
|
||||||
|
"How to load an EveryNote file from disk."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "1a53ece0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# !pip install pypandoc\n",
|
||||||
|
"# import pypandoc\n",
|
||||||
|
"\n",
|
||||||
|
"# pypandoc.download_pandoc()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "88df766f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='testing this\\n\\nwhat happens?\\n\\nto the world?\\n', lookup_str='', metadata={'source': 'example_data/testing.enex'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import EveryNoteLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = EveryNoteLoader(\"example_data/testing.enex\")\n",
|
||||||
|
"loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c1329905",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,16 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
|
||||||
|
<en-export export-date="20230309T035336Z" application="Evernote" version="10.53.2">
|
||||||
|
<note>
|
||||||
|
<title>testing</title>
|
||||||
|
<created>20230209T034746Z</created>
|
||||||
|
<updated>20230209T035328Z</updated>
|
||||||
|
<note-attributes>
|
||||||
|
<author>Harrison Chase</author>
|
||||||
|
</note-attributes>
|
||||||
|
<content>
|
||||||
|
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>testing this</div><div>what happens?</div><div>to the world?</div></en-note> ]]>
|
||||||
|
</content>
|
||||||
|
</note>
|
||||||
|
</en-export>
|
@ -5,6 +5,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL
|
|||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||||
|
from langchain.document_loaders.everynote import EveryNoteLoader
|
||||||
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
||||||
from langchain.document_loaders.gcs_file import GCSFileLoader
|
from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||||
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
from langchain.document_loaders.googledrive import GoogleDriveLoader
|
||||||
@ -46,4 +47,5 @@ __all__ = [
|
|||||||
"AZLyricsLoader",
|
"AZLyricsLoader",
|
||||||
"CollegeConfidentialLoader",
|
"CollegeConfidentialLoader",
|
||||||
"GutenbergLoader",
|
"GutenbergLoader",
|
||||||
|
"EveryNoteLoader",
|
||||||
]
|
]
|
||||||
|
82
langchain/document_loaders/everynote.py
Normal file
82
langchain/document_loaders/everynote.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
"""Load documents from Everynote (Evernote ``.enex`` export files).
|
||||||
|
|
||||||
|
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
|
||||||
|
"""
|
||||||
|
import hashlib
|
||||||
|
from base64 import b64decode
|
||||||
|
from time import strptime
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_content(content: str) -> str:
    """Convert a note body from HTML to Org-mode text with pandoc.

    ``pypandoc`` is imported inside the function, so it is only needed when
    this function actually runs.
    """
    import pypandoc

    return pypandoc.convert_text(content, "org", format="html")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_resource(resource: list) -> dict:
    """Collect the children of a ``<resource>`` node into a dict keyed by tag.

    The ``data`` child is base64-decoded to bytes (empty bytes when it has no
    text) and the MD5 hex digest of those bytes is stored under ``"hash"``.
    Every other child contributes its text unchanged.
    """
    parsed: Dict[str, Any] = {}
    for child in resource:
        tag = child.tag
        if tag != "data":
            parsed[tag] = child.text
            continue
        # The element text is sometimes None; fall back to empty bytes.
        payload = b64decode(child.text) if child.text else b""
        parsed[tag] = payload
        parsed["hash"] = hashlib.md5(payload).hexdigest()

    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_note(note: List) -> dict:
    """Flatten the children of a ``<note>`` element into a dict keyed by tag.

    * ``content`` is converted with ``_parse_content``; the unconverted text
      is kept under ``"content-raw"``.
    * ``resource`` children are parsed with ``_parse_resource`` and gathered
      in a list stored under ``"resource"`` (always present, possibly empty).
    * ``created`` / ``updated`` are parsed with ``time.strptime`` using the
      ``%Y%m%dT%H%M%SZ`` format.
    * Any other child contributes its text unchanged.
    """
    parsed: Dict[str, Any] = {}
    attachments = []
    for child in note:
        tag = child.tag
        if tag == "content":
            parsed[tag] = _parse_content(child.text)
            # Keep the original markup next to the converted text.
            parsed["content-raw"] = child.text
        elif tag == "resource":
            attachments.append(_parse_resource(child))
        elif tag in ("created", "updated"):
            parsed[tag] = strptime(child.text, "%Y%m%dT%H%M%SZ")
        else:
            parsed[tag] = child.text

    parsed["resource"] = attachments
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_note_xml(xml_file: str) -> str:
    """Parse everynote xml and return the converted text of all its notes.

    Args:
        xml_file: Source handed to ``lxml.etree.iterparse``.

    Returns:
        The ``content`` of every ``<note>`` element as produced by
        ``_parse_note``, concatenated in document order with no separator.
        A note without a ``content`` element contributes nothing instead of
        raising ``KeyError``.
    """
    from lxml import etree

    # Without huge_tree set to True, parser may complain about huge text node.
    # Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined".
    context = etree.iterparse(
        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
    )
    # Gather the pieces and join once rather than growing a string with +=.
    contents = [
        _parse_note(elem).get("content", "")
        for _, elem in context
        if elem.tag == "note"
    ]
    return "".join(contents)
|
||||||
|
|
||||||
|
|
||||||
|
class EveryNoteLoader(BaseLoader):
    """Loader that reads an EveryNote export file into a single document."""

    def __init__(self, file_path: str):
        """Store the path of the file to load."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Parse the file and return its text as a one-element document list.

        The document's metadata records the file path under ``"source"``.
        """
        return [
            Document(
                page_content=_parse_note_xml(self.file_path),
                metadata={"source": self.file_path},
            )
        ]
|
Loading…
Reference in New Issue
Block a user