forked from Archives/langchain
b3234bf3b0
`OnlinePDFLoader` and `PagedPDFSplitter` lived separate from the rest of the pdf loaders. Because they're all similar, I propose moving all to `pdy.py` and the same docs/examples page. Additionally, `PagedPDFSplitter` naming doesn't match the pattern the rest of the loaders follow, so I renamed to `PyPDFLoader` and had it inherit from `BasePDFLoader` so it can now load from remote file sources.
20 lines
733 B
Python
20 lines
733 B
Python
"""Test splitting with page numbers included."""
|
|
import os
|
|
|
|
from langchain.document_loaders import PyPDFLoader
|
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
from langchain.vectorstores import FAISS
|
|
|
|
|
|
def test_pdf_pagesplitter() -> None:
|
|
"""Test splitting with page numbers included."""
|
|
script_dir = os.path.dirname(__file__)
|
|
loader = PyPDFLoader(os.path.join(script_dir, "examples/hello.pdf"))
|
|
docs = loader.load()
|
|
assert "page" in docs[0].metadata
|
|
assert "source" in docs[0].metadata
|
|
|
|
faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
|
|
docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
|
|
assert "Hello world" in docs[0].page_content
|