langchain/libs/community/langchain_community/document_loaders/evernote.py
mwmajewsk f7a1fd91b8
community: better support of pathlib paths in document loaders (#18396)
So this arose from the
https://github.com/langchain-ai/langchain/pull/18397 problem of document
loaders not supporting `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support Pathlib,
the argument is immidiately converted to `str`
- if there `self.file_path` is used in a way that it allows for it to
stay pathlib without conversion, is is only converted for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
2024-03-26 11:51:52 -04:00

152 lines
5.8 KiB
Python

"""Load documents from Evernote.
https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class EverNoteLoader(BaseLoader):
"""Load from `EverNote`.
Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
Instructions on producing this file can be found at
https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
Currently only the plain text in the note is extracted and stored as the contents
of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
but not 'content-raw' or 'resource') tags on the note will be extracted and stored
as metadata on the Document.
Args:
file_path (str): The path to the notebook export with a .enex extension
load_single_document (bool): Whether or not to concatenate the content of all
notes into a single long Document.
If this is set to True (default) then the only metadata on the document will be
the 'source' which contains the file name of the export.
""" # noqa: E501
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
"""Initialize with file path."""
self.file_path = str(file_path)
self.load_single_document = load_single_document
def _lazy_load(self) -> Iterator[Document]:
for note in self._parse_note_xml(self.file_path):
if note.get("content") is not None:
yield Document(
page_content=note["content"],
metadata={
**{
key: value
for key, value in note.items()
if key not in ["content", "content-raw", "resource"]
},
**{"source": self.file_path},
},
)
def lazy_load(self) -> Iterator[Document]:
"""Load documents from EverNote export file."""
if not self.load_single_document:
yield from self._lazy_load()
else:
yield Document(
page_content="".join(
[document.page_content for document in self._lazy_load()]
),
metadata={"source": self.file_path},
)
@staticmethod
def _parse_content(content: str) -> str:
try:
import html2text
return html2text.html2text(content).strip()
except ImportError as e:
raise ImportError(
"Could not import `html2text`. Although it is not a required package "
"to use Langchain, using the EverNote loader requires `html2text`. "
"Please install `html2text` via `pip install html2text` and try again."
) from e
@staticmethod
def _parse_resource(resource: list) -> dict:
rsc_dict: Dict[str, Any] = {}
for elem in resource:
if elem.tag == "data":
# Sometimes elem.text is None
rsc_dict[elem.tag] = b64decode(elem.text) if elem.text else b""
rsc_dict["hash"] = hashlib.md5(rsc_dict[elem.tag]).hexdigest()
else:
rsc_dict[elem.tag] = elem.text
return rsc_dict
@staticmethod
def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
note_dict: Dict[str, Any] = {}
resources = []
def add_prefix(element_tag: str) -> str:
if prefix is None:
return element_tag
return f"{prefix}.{element_tag}"
for elem in note:
if elem.tag == "content":
note_dict[elem.tag] = EverNoteLoader._parse_content(elem.text)
# A copy of original content
note_dict["content-raw"] = elem.text
elif elem.tag == "resource":
resources.append(EverNoteLoader._parse_resource(elem))
elif elem.tag == "created" or elem.tag == "updated":
note_dict[elem.tag] = strptime(elem.text, "%Y%m%dT%H%M%SZ")
elif elem.tag == "note-attributes":
additional_attributes = EverNoteLoader._parse_note(
elem, elem.tag
) # Recursively enter the note-attributes tag
note_dict.update(additional_attributes)
else:
note_dict[elem.tag] = elem.text
if len(resources) > 0:
note_dict["resource"] = resources
return {add_prefix(key): value for key, value in note_dict.items()}
@staticmethod
def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
"""Parse Evernote xml."""
# Without huge_tree set to True, parser may complain about huge text node
# Try to recover, because there may be " ", which will cause
# "XMLSyntaxError: Entity 'nbsp' not defined"
try:
from lxml import etree
except ImportError as e:
logger.error(
"Could not import `lxml`. Although it is not a required package to use "
"Langchain, using the EverNote loader requires `lxml`. Please install "
"`lxml` via `pip install lxml` and try again."
)
raise e
context = etree.iterparse(
xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
)
for action, elem in context:
if elem.tag == "note":
yield EverNoteLoader._parse_note(elem)