You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/document_loaders/llmsherpa.py

143 lines
4.8 KiB
Python

from pathlib import Path
from typing import Iterator, Union
from urllib.parse import urlparse
from langchain_core.documents import Document
from langchain_community.document_loaders.pdf import BaseLoader
# Default LLMSherpa parsing endpoint, used as the default value of the
# loader's ``llmsherpa_api_url`` argument.
DEFAULT_API = (
    "https://readers.llmsherpa.com"
    "/api/document/developer/parseDocument"
    "?renderFormat=all"
)
class LLMSherpaFileLoader(BaseLoader):
    """Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader uses LayoutPDFReader, which is part of the LLMSherpa
    library. This tool is designed to parse PDFs while preserving their layout
    information, which is often lost when using most PDF to text parsers.

    Examples
    --------
    .. code-block:: python

        from langchain_community.document_loaders.llmsherpa import (
            LLMSherpaFileLoader,
        )

        loader = LLMSherpaFileLoader(
            "example.pdf",
            strategy="chunks",
            llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
        )
        docs = loader.load()
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        new_indent_parser: bool = True,
        apply_ocr: bool = True,
        strategy: str = "chunks",
        llmsherpa_api_url: str = DEFAULT_API,
    ):
        """Initialize with a file path.

        Args:
            file_path: Location of the document; stored as a string and handed
                to ``LayoutPDFReader.read_pdf`` when loading.
            new_indent_parser: If True, ensure ``useNewIndentParser=true`` is
                present in the API URL query.
            apply_ocr: If True, ensure ``applyOcr=yes`` is present in the API
                URL query.
            strategy: How the parsed document is turned into Documents. One of
                ``"sections"``, ``"chunks"``, ``"html"`` or ``"text"``.
            llmsherpa_api_url: URL of the LLMSherpa ``parseDocument`` endpoint.

        Raises:
            ImportError: If the ``llmsherpa`` package is not installed.
            ValueError: If ``strategy`` is not supported, or if
                ``llmsherpa_api_url`` lacks a scheme/host or does not point at
                a ``parseDocument`` endpoint.
        """
        try:
            import llmsherpa  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "llmsherpa package not found, please install it with "
                "`pip install llmsherpa`"
            ) from err

        _valid_strategies = ["sections", "chunks", "html", "text"]
        if strategy not in _valid_strategies:
            raise ValueError(
                f"Got {strategy} for `strategy`, "
                f"but should be one of `{_valid_strategies}`"
            )

        # validate llmsherpa url
        if not self._is_valid_url(llmsherpa_api_url):
            raise ValueError(f"Invalid URL: {llmsherpa_api_url}")
        self.url = self._validate_llmsherpa_url(
            url=llmsherpa_api_url,
            new_indent_parser=new_indent_parser,
            apply_ocr=apply_ocr,
        )

        self.strategy = strategy
        self.file_path = str(file_path)

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid, i.e. has both a scheme and a host."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _validate_llmsherpa_url(
        url: str, new_indent_parser: bool = True, apply_ocr: bool = True
    ) -> str:
        """Check the llmsherpa url and append any missing query parameters.

        Args:
            url: Candidate LLMSherpa API URL.
            new_indent_parser: Whether ``useNewIndentParser=true`` is required.
            apply_ocr: Whether ``applyOcr=yes`` is required.

        Returns:
            ``url`` with ``renderFormat=all`` and the requested optional
            parameters appended when they are not already in the query.

        Raises:
            ValueError: If the URL path is not a ``parseDocument`` endpoint.
        """
        parsed = urlparse(url)
        if ("/api/parseDocument" not in parsed.path) and (
            "/api/document/developer/parseDocument" not in parsed.path
        ):
            raise ValueError(f"Invalid LLMSherpa URL: {url}")

        missing_params = []
        if "renderFormat=all" not in parsed.query:
            missing_params.append("renderFormat=all")
        if new_indent_parser and "useNewIndentParser=true" not in parsed.query:
            missing_params.append("useNewIndentParser=true")
        if apply_ocr and "applyOcr=yes" not in parsed.query:
            missing_params.append("applyOcr=yes")

        if not missing_params:
            return url

        # Start a query string with "?" only when the URL has none; otherwise
        # extend the existing one with "&". Unconditionally adding "?" would
        # yield a malformed URL such as "...?foo=bar?renderFormat=all".
        if parsed.query:
            separator = "&"
        elif url.endswith("?"):
            # A bare trailing "?" already opens an (empty) query string.
            separator = ""
        else:
            separator = "?"
        return url + separator + "&".join(missing_params)

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Load file.

        Parses ``self.file_path`` through the LLMSherpa API at ``self.url``
        and yields Documents according to ``self.strategy``:

        * ``"sections"``: one Document per entry of ``doc.sections()``.
        * ``"chunks"``: one Document per entry of ``doc.chunks()``.
        * ``"html"``: a single Document holding ``doc.to_html()``.
        * ``"text"``: a single Document holding ``doc.to_text()``.

        Any other strategy value yields nothing.
        """
        from llmsherpa.readers import LayoutPDFReader

        docs_reader = LayoutPDFReader(self.url)
        doc = docs_reader.read_pdf(self.file_path)

        # Documents are yielded one at a time rather than collected into a
        # list first, so consumers can start processing immediately.
        if self.strategy == "sections":
            for section_num, section in enumerate(doc.sections()):
                yield Document(
                    page_content=section.to_text(include_children=True, recurse=True),
                    metadata={
                        "source": self.file_path,
                        "section_number": section_num,
                        "section_title": section.title,
                    },
                )
        elif self.strategy == "chunks":
            for chunk_num, chunk in enumerate(doc.chunks()):
                yield Document(
                    page_content=chunk.to_context_text(),
                    metadata={
                        "source": self.file_path,
                        "chunk_number": chunk_num,
                        "chunk_type": chunk.tag,
                    },
                )
        elif self.strategy == "html":
            yield Document(
                page_content=doc.to_html(),
                metadata={
                    "source": self.file_path,
                },
            )
        elif self.strategy == "text":
            yield Document(
                page_content=doc.to_text(),
                metadata={
                    "source": self.file_path,
                },
            )