forked from Archives/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
"""Load documents from Evernote.

https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
|
|
import hashlib
|
|
from base64 import b64decode
|
|
from time import strptime
|
|
from typing import Any, Dict, List
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.document_loaders.base import BaseLoader
|
|
|
|
|
|
def _parse_content(content: str) -> str:
|
|
from pypandoc import convert_text
|
|
|
|
text = convert_text(content, "org", format="html")
|
|
return text
|
|
|
|
|
|
def _parse_resource(resource: list) -> dict:
|
|
rsc_dict: Dict[str, Any] = {}
|
|
for elem in resource:
|
|
if elem.tag == "data":
|
|
# Some times elem.text is None
|
|
rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
|
|
rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
|
|
else:
|
|
rsc_dict[elem.tag] = elem.text
|
|
|
|
return rsc_dict
|
|
|
|
|
|
def _parse_note(note: List) -> dict:
|
|
note_dict: Dict[str, Any] = {}
|
|
resources = []
|
|
for elem in note:
|
|
if elem.tag == "content":
|
|
note_dict[elem.tag] = _parse_content(elem.text)
|
|
# A copy of original content
|
|
note_dict["content-raw"] = elem.text
|
|
elif elem.tag == "resource":
|
|
resources.append(_parse_resource(elem))
|
|
elif elem.tag == "created" or elem.tag == "updated":
|
|
note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
|
|
else:
|
|
note_dict[elem.tag] = elem.text
|
|
|
|
note_dict["resource"] = resources
|
|
|
|
return note_dict
|
|
|
|
|
|
def _parse_note_xml(xml_file: str) -> str:
    """Parse Evernote xml and return the text of all notes it contains.

    Args:
        xml_file: The export to read; handed unchanged to
            ``lxml.etree.iterparse``.

    Returns:
        The converted ``content`` of every ``note`` element, concatenated in
        the order the parser yields them, with no separator. Notes that have
        no content contribute nothing.
    """
    from lxml import etree

    # Without huge_tree set to True, parser may complain about huge text node.
    # Try to recover, because there may be "&nbsp;", which will cause
    # "XMLSyntaxError: Entity 'nbsp' not defined".
    context = etree.iterparse(
        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
    )

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = []
    for _, elem in context:
        if elem.tag != "note":
            continue
        # A note without a content child has no "content" key; skip it rather
        # than fail the whole file with a KeyError.
        content = _parse_note(elem).get("content")
        if content:
            pieces.append(content)
    return "".join(pieces)
|
|
|
|
|
|
class EverNoteLoader(BaseLoader):
    """Loader that reads an EverNote export file into a single document."""

    def __init__(self, file_path: str):
        """Remember the location of the EverNote file to load.

        Args:
            file_path: Path of the EverNote file; also reported as the
                ``source`` in the metadata of the loaded document.
        """
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Parse the EverNote file and wrap its text in one ``Document``.

        Returns:
            A one-element list whose document carries the parsed text as
            ``page_content`` and ``{"source": file_path}`` as metadata.
        """
        source = self.file_path
        document = Document(
            page_content=_parse_note_xml(source),
            metadata={"source": source},
        )
        return [document]
|