mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
ddd595fe81
# Improve Evernote Document Loader

When exporting from Evernote you may export more than one note. Currently, the Evernote loader concatenates the content of all notes in the export into a single document and attaches only the name of the export file as metadata on the document.

This change ensures that each note is loaded as an independent document and that all available metadata on the note (e.g. author, title, created, updated) is added as metadata on each document.

It also uses the existing optional dependency `html2text` instead of `pypandoc`, which removes the need to download the pandoc application via `download_pandoc()` in order to use the `pypandoc` Python bindings.

Fixes #4493

Co-authored-by: Mike McGarry <mike.mcgarry@finbourne.com>
Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
177 lines
6.3 KiB
Python
177 lines
6.3 KiB
Python
import os
|
||
import pathlib
|
||
import time
|
||
|
||
import pytest
|
||
|
||
from langchain.document_loaders import EverNoteLoader
|
||
|
||
|
||
@pytest.mark.requires("lxml", "html2text")
class TestEverNoteLoader:
    """Tests for ``EverNoteLoader`` driven by ``.enex`` fixture exports.

    Fixtures are read from the ``sample_documents`` directory that sits next
    to this module. The ``requires`` marker names the optional packages
    (``lxml``, ``html2text``) these tests are tagged with.
    """

    @staticmethod
    def example_notebook_path(notebook_name: str) -> str:
        """Return the path of ``notebook_name`` inside ``sample_documents``.

        The directory is resolved relative to this test module rather than the
        current working directory.
        """
        return os.path.join(
            pathlib.Path(__file__).parent, "sample_documents", notebook_name
        )

    def _load(self, notebook_name: str, single_document: bool) -> list:
        """Load a fixture export and return whatever the loader produces.

        ``single_document`` is forwarded as the loader's second positional
        argument; the tests below pass ``True`` when they expect one combined
        document and ``False`` when they expect one document per note.
        """
        notebook = self.example_notebook_path(notebook_name)
        return EverNoteLoader(notebook, single_document).load()

    def test_loadnotebook_eachnoteisindividualdocument(self) -> None:
        """The sample export yields two separate documents."""
        loaded = self._load("sample_notebook.enex", False)
        assert len(loaded) == 2

    def test_loadnotebook_eachnotehasexpectedcontentwithleadingandtrailingremoved(
        self,
    ) -> None:
        """Each note's text comes back without surrounding whitespace."""
        notes = self._load("sample_notebook.enex", False)

        assert notes[0].page_content == "abc"
        assert notes[1].page_content == "**Jan - March 2022**"

    def test_loademptynotebook_emptylistreturned(self) -> None:
        """An export without notes produces no documents."""
        loaded = self._load("empty_export.enex", False)
        assert len(loaded) == 0

    def test_loadnotewithemptycontent_emptydocumentcontent(self) -> None:
        """A note with empty content still yields a document with empty text."""
        first_note = self._load("sample_notebook_emptynote.enex", False)[0]
        assert first_note.page_content == ""

    def test_loadnotewithmissingcontenttag_emptylistreturned(
        self,
    ) -> None:
        """A note lacking the content tag is not turned into a document."""
        loaded = self._load("sample_notebook_missingcontenttag.enex", False)
        assert len(loaded) == 0

    def test_loadnotewithnometadata_documentreturnedwithsourceonly(
        self,
    ) -> None:
        """Without note metadata, only the ``source`` entry is attached."""
        first_note = self._load("sample_notebook_missingmetadata.enex", False)[0]

        assert first_note.page_content == "I only have content, no metadata"

        metadata = first_note.metadata
        assert len(metadata) == 1
        assert "source" in metadata
        assert "sample_notebook_missingmetadata.enex" in metadata["source"]

    def test_loadnotebookwithimage_notehasplaintextonlywithresourcesremoved(
        self,
    ) -> None:
        """A note with embedded media is reduced to its plain text."""
        first_note = self._load("sample_notebook_with_media.enex", False)[0]

        # NOTE(review): the blank lines in the middle of this literal may carry
        # whitespace that could not be recovered from the mangled view this was
        # rebuilt from - confirm against the original file before merging.
        expected_text = """\
When you pick this mug up with your thumb on top and middle finger through the
loop, your ring finger slides into the mug under the loop where it is too hot
to touch and burns you.



If you try and pick it up with your thumb and index finger you can’t hold the
mug."""
        assert first_note.page_content == expected_text

    def test_loadnotebook_eachnotehasexpectedmetadata(self) -> None:
        """Both sample notes expose the expected metadata keys and values."""
        notes = self._load("sample_notebook.enex", False)

        # --- first note ---
        first_meta = notes[0].metadata

        for key in ("title", "created", "updated", "note-attributes.author"):
            assert key in first_meta

        # "content" should be in the content of the document instead, while
        # "content-raw" and "resource" are too large to be stored as metadata.
        for key in ("content", "content-raw", "resource"):
            assert key not in first_meta

        assert first_meta["title"] == "Test"
        assert first_meta["note-attributes.author"] == "Michael McGarry"

        created, updated = first_meta["created"], first_meta["updated"]
        assert isinstance(created, time.struct_time)
        assert isinstance(updated, time.struct_time)

        assert (created.tm_year, created.tm_mon, created.tm_mday) == (2023, 5, 11)
        assert (updated.tm_year, updated.tm_mon, updated.tm_mday) == (2024, 7, 14)

        # --- second note ---
        second_meta = notes[1].metadata

        for key in (
            "title",
            "created",
            "note-attributes.author",
            "note-attributes.source",
        ):
            assert key in second_meta

        # This note has no "updated" entry; "resource" is too large to be
        # stored as metadata.
        for key in ("updated", "content", "content-raw", "resource"):
            assert key not in second_meta

        assert second_meta["title"] == "Summer Training Program"
        assert second_meta["note-attributes.author"] == "Mike McGarry"
        assert second_meta["note-attributes.source"] == "mobile.iphone"

        created = second_meta["created"]
        assert isinstance(created, time.struct_time)
        assert (created.tm_year, created.tm_mon, created.tm_mday) == (2022, 12, 27)

    def test_loadnotebookwithconflictingsourcemetadatatag_sourceoffilepreferred(
        self,
    ) -> None:
        """The file path wins over a note-level ``source`` attribute."""
        source = self._load("sample_notebook_2.enex", False)[0].metadata["source"]

        assert "sample_notebook_2.enex" in source
        assert "mobile.iphone" not in source

    def test_returnsingledocument_loadnotebook_eachnoteiscombinedinto1document(
        self,
    ) -> None:
        """With the second argument set to True, all notes become one document."""
        loaded = self._load("sample_notebook.enex", True)
        assert len(loaded) == 1

    def test_returnsingledocument_loadnotebook_notecontentiscombinedinto1document(
        self,
    ) -> None:
        """The combined document's text is the notes' text joined in order."""
        combined = self._load("sample_notebook.enex", True)[0]
        assert combined.page_content == "abc**Jan - March 2022**"
|