mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
177 lines
6.3 KiB
Python
177 lines
6.3 KiB
Python
|
import os
|
|||
|
import pathlib
|
|||
|
import time
|
|||
|
|
|||
|
import pytest
|
|||
|
|
|||
|
from langchain.document_loaders import EverNoteLoader
|
|||
|
|
|||
|
|
|||
|
@pytest.mark.requires("lxml", "html2text")
class TestEverNoteLoader:
    """Exercise ``EverNoteLoader`` against the ``.enex`` files in ``sample_documents``.

    Every test hands the loader a notebook path plus a boolean second
    positional argument. The assertions in this class show its effect: with
    ``False`` the two-note sample notebook yields two documents, with ``True``
    it yields a single document whose content is both notes concatenated.
    """

    @staticmethod
    def example_notebook_path(notebook_name: str) -> str:
        """Return the path of ``notebook_name`` inside ``sample_documents``.

        The directory is located relative to this test module rather than the
        current working directory.

        Args:
            notebook_name: File name of the export, e.g. ``"sample_notebook.enex"``.

        Returns:
            The joined path as a string.
        """
        return os.path.join(
            pathlib.Path(__file__).parent, "sample_documents", notebook_name
        )

    def test_loadnotebook_eachnoteisindividualdocument(self) -> None:
        """The two-note notebook produces two documents."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook.enex"), False
        ).load()

        assert len(documents) == 2

    def test_loadnotebook_eachnotehasexpectedcontentwithleadingandtrailingremoved(
        self,
    ) -> None:
        """Each note's page content matches the expected text exactly."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook.enex"), False
        ).load()

        first_content = documents[0].page_content
        second_content = documents[1].page_content

        assert first_content == "abc"
        assert second_content == "**Jan - March 2022**"

    def test_loademptynotebook_emptylistreturned(self) -> None:
        """An export without notes produces no documents."""
        documents = EverNoteLoader(
            self.example_notebook_path("empty_export.enex"), False
        ).load()

        assert len(documents) == 0

    def test_loadnotewithemptycontent_emptydocumentcontent(self) -> None:
        """A note with empty content becomes a document with empty page content."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook_emptynote.enex"), False
        ).load()

        assert documents[0].page_content == ""

    def test_loadnotewithmissingcontenttag_emptylistreturned(
        self,
    ) -> None:
        """A note lacking its content tag produces no documents."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook_missingcontenttag.enex"), False
        ).load()

        assert len(documents) == 0

    def test_loadnotewithnometadata_documentreturnedwithsourceonly(
        self,
    ) -> None:
        """A note without metadata still gets exactly one key: ``source``."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook_missingmetadata.enex"), False
        ).load()
        note = documents[0]

        assert note.page_content == "I only have content, no metadata"

        # "source" is the only metadata entry and it names the loaded file.
        assert len(note.metadata) == 1
        assert "source" in note.metadata
        assert "sample_notebook_missingmetadata.enex" in note.metadata["source"]

    def test_loadnotebookwithimage_notehasplaintextonlywithresourcesremoved(
        self,
    ) -> None:
        """A note with embedded media is reduced to its plain text."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook_with_media.enex"), False
        ).load()
        note = documents[0]

        # NOTE(review): the three lines separating the two paragraphs are
        # written as empty lines; confirm against the fixture that they do not
        # carry whitespace, since the literal is compared exactly.
        assert (
            note.page_content
            == """\
When you pick this mug up with your thumb on top and middle finger through the
loop, your ring finger slides into the mug under the loop where it is too hot
to touch and burns you.



If you try and pick it up with your thumb and index finger you can’t hold the
mug."""
        )

    def test_loadnotebook_eachnotehasexpectedmetadata(self) -> None:
        """Both sample notes expose the expected metadata keys and values."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook.enex"), False
        ).load()

        # --- first note ---------------------------------------------------
        metadata_note1 = documents[0].metadata

        # Membership is tested on the mapping itself, as the "source" checks
        # elsewhere in this class already do.
        assert "title" in metadata_note1
        assert "created" in metadata_note1
        assert "updated" in metadata_note1
        assert "note-attributes.author" in metadata_note1
        # This should be in the content of the document instead
        assert "content" not in metadata_note1
        # This is too large to be stored as metadata
        assert "content-raw" not in metadata_note1
        # This is too large to be stored as metadata
        assert "resource" not in metadata_note1

        assert metadata_note1["title"] == "Test"
        assert metadata_note1["note-attributes.author"] == "Michael McGarry"

        created_note1 = metadata_note1["created"]
        updated_note1 = metadata_note1["updated"]
        assert isinstance(created_note1, time.struct_time)
        assert isinstance(updated_note1, time.struct_time)

        assert created_note1.tm_year == 2023
        assert created_note1.tm_mon == 5
        assert created_note1.tm_mday == 11

        assert updated_note1.tm_year == 2024
        assert updated_note1.tm_mon == 7
        assert updated_note1.tm_mday == 14

        # --- second note --------------------------------------------------
        metadata_note2 = documents[1].metadata

        assert "title" in metadata_note2
        assert "created" in metadata_note2
        assert "updated" not in metadata_note2
        assert "note-attributes.author" in metadata_note2
        assert "note-attributes.source" in metadata_note2
        assert "content" not in metadata_note2
        assert "content-raw" not in metadata_note2
        # This is too large to be stored as metadata
        assert "resource" not in metadata_note2

        assert metadata_note2["title"] == "Summer Training Program"
        assert metadata_note2["note-attributes.author"] == "Mike McGarry"
        assert metadata_note2["note-attributes.source"] == "mobile.iphone"

        created_note2 = metadata_note2["created"]
        assert isinstance(created_note2, time.struct_time)

        assert created_note2.tm_year == 2022
        assert created_note2.tm_mon == 12
        assert created_note2.tm_mday == 27

    def test_loadnotebookwithconflictingsourcemetadatatag_sourceoffilepreferred(
        self,
    ) -> None:
        """The ``source`` metadata names the file, not the note's own source tag."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook_2.enex"), False
        ).load()
        source = documents[0].metadata["source"]

        assert "sample_notebook_2.enex" in source
        assert "mobile.iphone" not in source

    def test_returnsingledocument_loadnotebook_eachnoteiscombinedinto1document(
        self,
    ) -> None:
        """With the flag set to ``True`` the whole notebook is one document."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook.enex"), True
        ).load()

        assert len(documents) == 1

    def test_returnsingledocument_loadnotebook_notecontentiscombinedinto1document(
        self,
    ) -> None:
        """With the flag set to ``True`` the note contents are concatenated."""
        documents = EverNoteLoader(
            self.example_notebook_path("sample_notebook.enex"), True
        ).load()

        assert documents[0].page_content == "abc**Jan - March 2022**"
|