langchain/libs/community/langchain_community/document_loaders/llmsherpa.py
M.Abdulrahman Alnaseer ba54f1577f
community[minor]: add support for llmsherpa (#19741)
Thank you for contributing to LangChain!

- [x] **PR title**: "community: added support for llmsherpa library"

- [x] **Add tests and docs**: 
1. Integration test:
`docs/docs/integrations/document_loaders/test_llmsherpa.py`.
2. An example notebook:
`docs/docs/integrations/document_loaders/llmsherpa.ipynb`.


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, hwchase17.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-03-29 16:04:57 -07:00

143 lines
4.8 KiB
Python

from pathlib import Path
from typing import Iterator, Union
from urllib.parse import urlparse
from langchain_core.documents import Document
from langchain_community.document_loaders.pdf import BaseLoader
# Default LLMSherpa parsing endpoint; used as the default value of the
# `llmsherpa_api_url` argument of `LLMSherpaFileLoader`.
DEFAULT_API = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
class LLMSherpaFileLoader(BaseLoader):
    """Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader uses LayoutPDFReader, which is part of the LLMSherpa
    library. This tool is designed to parse PDFs while preserving their layout
    information, which is often lost when using most PDF to text parsers.

    Examples
    --------
    from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

    loader = LLMSherpaFileLoader(
        "example.pdf",
        strategy="chunks",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    docs = loader.load()

    """

    def __init__(
        self,
        file_path: Union[str, Path],
        new_indent_parser: bool = True,
        apply_ocr: bool = True,
        strategy: str = "chunks",
        llmsherpa_api_url: str = DEFAULT_API,
    ):
        """Initialize with a file path.

        Args:
            file_path: Path of the file handed to the reader; stored as a string.
            new_indent_parser: When true, ``useNewIndentParser=true`` is added
                to the API URL if it is not already present.
            apply_ocr: When true, ``applyOcr=yes`` is added to the API URL if
                it is not already present.
            strategy: One of ``"sections"``, ``"chunks"``, ``"html"`` or
                ``"text"``; selects how ``lazy_load`` turns the parsed file
                into Documents.
            llmsherpa_api_url: URL of the LLMSherpa ``parseDocument`` endpoint.

        Raises:
            ImportError: If the ``llmsherpa`` package is not installed.
            ValueError: If ``strategy`` is unknown, or the URL lacks a scheme
                or host, or the URL path is not a ``parseDocument`` endpoint.
        """
        try:
            import llmsherpa  # noqa:F401
        except ImportError as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ImportError(
                "llmsherpa package not found, please install it with "
                "`pip install llmsherpa`"
            ) from e

        _valid_strategies = ["sections", "chunks", "html", "text"]
        if strategy not in _valid_strategies:
            raise ValueError(
                f"Got {strategy} for `strategy`, "
                f"but should be one of `{_valid_strategies}`"
            )

        # Generic URL sanity check first, then the LLMSherpa-specific checks.
        if not self._is_valid_url(llmsherpa_api_url):
            raise ValueError(f"Invalid URL: {llmsherpa_api_url}")
        self.url = self._validate_llmsherpa_url(
            url=llmsherpa_api_url,
            new_indent_parser=new_indent_parser,
            apply_ocr=apply_ocr,
        )

        self.strategy = strategy
        self.file_path = str(file_path)

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _validate_llmsherpa_url(
        url: str, new_indent_parser: bool = True, apply_ocr: bool = True
    ) -> str:
        """Validate an LLMSherpa endpoint URL and add any missing query parameters.

        Args:
            url: Candidate endpoint URL.
            new_indent_parser: Require ``useNewIndentParser=true`` in the query.
            apply_ocr: Require ``applyOcr=yes`` in the query.

        Returns:
            ``url`` unchanged when every required parameter is already present;
            otherwise the URL rebuilt with the missing parameters appended to
            its query string (empty ``&&`` segments of the existing query are
            dropped in that case).

        Raises:
            ValueError: If the URL path contains neither ``/api/parseDocument``
                nor ``/api/document/developer/parseDocument``.
        """
        parsed = urlparse(url)
        if ("/api/parseDocument" not in parsed.path) and (
            "/api/document/developer/parseDocument" not in parsed.path
        ):
            raise ValueError(f"Invalid LLMSherpa URL: {url}")

        # Compare whole `key=value` segments instead of substrings so that, for
        # example, `xapplyOcr=yes` is not mistaken for `applyOcr=yes`.
        existing = [segment for segment in parsed.query.split("&") if segment]

        required = ["renderFormat=all"]
        if new_indent_parser:
            required.append("useNewIndentParser=true")
        if apply_ocr:
            required.append("applyOcr=yes")

        missing = [segment for segment in required if segment not in existing]
        if not missing:
            return url

        # Rebuild through the parsed components so the additions are joined
        # with `&` to an existing query (never a second `?`) and are placed
        # before any `#fragment`.
        return parsed._replace(query="&".join(existing + missing)).geturl()

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Parse the file with ``LayoutPDFReader`` and yield Documents lazily.

        The shape of the output depends on ``self.strategy``:

        - ``"sections"``: one Document per item of ``doc.sections()``, with
          ``section_number`` and ``section_title`` metadata.
        - ``"chunks"``: one Document per item of ``doc.chunks()``, with
          ``chunk_number`` and ``chunk_type`` metadata.
        - ``"html"``: a single Document holding ``doc.to_html()``.
        - ``"text"``: a single Document holding ``doc.to_text()``.

        Every Document carries ``source`` metadata set to the file path.
        """
        from llmsherpa.readers import LayoutPDFReader

        docs_reader = LayoutPDFReader(self.url)
        doc = docs_reader.read_pdf(self.file_path)

        # Strategies are mutually exclusive, and Documents are yielded one at a
        # time rather than being collected into a list first.
        if self.strategy == "sections":
            for section_num, section in enumerate(doc.sections()):
                yield Document(
                    page_content=section.to_text(include_children=True, recurse=True),
                    metadata={
                        "source": self.file_path,
                        "section_number": section_num,
                        "section_title": section.title,
                    },
                )
        elif self.strategy == "chunks":
            for chunk_num, chunk in enumerate(doc.chunks()):
                yield Document(
                    page_content=chunk.to_context_text(),
                    metadata={
                        "source": self.file_path,
                        "chunk_number": chunk_num,
                        "chunk_type": chunk.tag,
                    },
                )
        elif self.strategy == "html":
            yield Document(
                page_content=doc.to_html(),
                metadata={
                    "source": self.file_path,
                },
            )
        elif self.strategy == "text":
            yield Document(
                page_content=doc.to_text(),
                metadata={
                    "source": self.file_path,
                },
            )