You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/evernote.py

83 lines
2.5 KiB
Python

"""Load documents from Evernote.
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
from base64 import b64decode
from time import strptime
from typing import Any, Dict, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def _parse_content(content: str) -> str:
from pypandoc import convert_text
text = convert_text(content, "org", format="html")
return text
def _parse_resource(resource: list) -> dict:
rsc_dict: Dict[str, Any] = {}
for elem in resource:
if elem.tag == "data":
# Some times elem.text is None
rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
else:
rsc_dict[elem.tag] = elem.text
return rsc_dict
def _parse_note(note: List) -> dict:
note_dict: Dict[str, Any] = {}
resources = []
for elem in note:
if elem.tag == "content":
note_dict[elem.tag] = _parse_content(elem.text)
# A copy of original content
note_dict["content-raw"] = elem.text
elif elem.tag == "resource":
resources.append(_parse_resource(elem))
elif elem.tag == "created" or elem.tag == "updated":
note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
else:
note_dict[elem.tag] = elem.text
note_dict["resource"] = resources
return note_dict
def _parse_note_xml(xml_file: str) -> str:
"""Parse Evernote xml."""
# Without huge_tree set to True, parser may complain about huge text node
# Try to recover, because there may be " ", which will cause
# "XMLSyntaxError: Entity 'nbsp' not defined"
from lxml import etree
context = etree.iterparse(
xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
)
result_string = ""
for action, elem in context:
if elem.tag == "note":
result_string += _parse_note(elem)["content"]
return result_string
class EverNoteLoader(BaseLoader):
"""Loader to load in EverNote files.."""
def __init__(self, file_path: str):
"""Initialize with file path."""
self.file_path = file_path
def load(self) -> List[Document]:
"""Load document from EverNote file."""
text = _parse_note_xml(self.file_path)
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]