langchain/tests/unit_tests/document_loaders/test_bibtex.py
Eugene Yurtsev 5cfa72a130
Bibtex integration for document loader and retriever (#5137)
# Bibtex integration

Wrap bibtexparser to retrieve a list of docs from a bibtex file.
* Get the metadata from the bibtex entries
* `page_content` is read, using `pymupdf`, from the local PDF referenced in
the `file` field of the bibtex entry
* If there is no valid PDF file, `page_content` is set to the `abstract`
field of the bibtex entry
* Support the Zotero flavour by using a regex to extract the file path
* Added usage example in
`docs/modules/indexes/document_loaders/examples/bibtex.ipynb`
---------

Co-authored-by: Sébastien M. Popoff <sebastien.popoff@espci.fr>
Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
2023-05-25 00:21:31 -07:00

62 lines
1.7 KiB
Python

from pathlib import Path
import pytest
from langchain.document_loaders.bibtex import BibtexLoader
# Sample BibTeX file, located in the `sample_documents` directory that sits
# next to this test module.
BIBTEX_EXAMPLE_FILE = Path(__file__).parent.joinpath("sample_documents", "bibtex.bib")
@pytest.mark.requires("fitz", "bibtexparser")
def test_load_success() -> None:
    """Check that the sample file loads as exactly one populated document.

    The single document must have non-empty page content and exactly the
    six metadata keys listed below.
    """
    expected_keys = {
        "id",
        "published_year",
        "title",
        "publication",
        "authors",
        "abstract",
    }

    documents = BibtexLoader(file_path=str(BIBTEX_EXAMPLE_FILE)).load()

    assert len(documents) == 1
    only_doc = documents[0]
    assert only_doc.page_content
    assert set(only_doc.metadata) == expected_keys
@pytest.mark.requires("fitz", "bibtexparser")
def test_load_max_content_chars() -> None:
    """Check that page content is cut to exactly `max_content_chars` characters."""
    char_limit = 10
    bib_loader = BibtexLoader(
        file_path=str(BIBTEX_EXAMPLE_FILE), max_content_chars=char_limit
    )

    first_doc = bib_loader.load()[0]

    assert len(first_doc.page_content) == char_limit
@pytest.mark.requires("fitz", "bibtexparser")
def test_load_load_extra_metadata() -> None:
    """Check that `load_extra_metadata=True` adds the extra metadata keys."""
    # Keys also asserted by the plain-load test in this module.
    base_keys = {
        "id",
        "published_year",
        "title",
        "publication",
        "authors",
        "abstract",
    }
    # Keys expected only when extra metadata loading is switched on.
    extra_keys = {
        "booktitle",
        "editor",
        "organization",
    }

    bib_loader = BibtexLoader(
        file_path=str(BIBTEX_EXAMPLE_FILE), load_extra_metadata=True
    )
    first_doc = bib_loader.load()[0]

    assert set(first_doc.metadata) == base_keys | extra_keys
@pytest.mark.requires("fitz", "bibtexparser")
def test_load_file_pattern() -> None:
    """Check that a JSON-only `file_pattern` yields no documents."""
    json_only_pattern = r"[^:]+\.json"
    bib_loader = BibtexLoader(
        file_path=str(BIBTEX_EXAMPLE_FILE), file_pattern=json_only_pattern
    )

    loaded = bib_loader.load()

    assert len(loaded) == 0