from pathlib import Path
from typing import Iterator, Union
from urllib.parse import urlparse

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

DEFAULT_API = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"


class LLMSherpaFileLoader(BaseLoader):
    """Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader uses LayoutPDFReader, which is part of the LLMSherpa library.
    This tool is designed to parse PDFs while preserving their layout information,
    which is often lost when using most PDF-to-text parsers.

    Examples
    --------
    from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

    loader = LLMSherpaFileLoader(
        "example.pdf",
        strategy="chunks",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    docs = loader.load()
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        new_indent_parser: bool = True,
        apply_ocr: bool = True,
        strategy: str = "chunks",
        llmsherpa_api_url: str = DEFAULT_API,
    ):
        """Initialize with a file path."""
        try:
            import llmsherpa  # noqa:F401
        except ImportError:
            raise ImportError(
                "llmsherpa package not found, please install it with "
                "`pip install llmsherpa`"
            )
        _valid_strategies = ["sections", "chunks", "html", "text"]
        if strategy not in _valid_strategies:
            raise ValueError(
                f"Got {strategy} for `strategy`, "
                f"but should be one of `{_valid_strategies}`"
            )
        # validate llmsherpa url
        if not self._is_valid_url(llmsherpa_api_url):
            raise ValueError(f"Invalid URL: {llmsherpa_api_url}")
        self.url = self._validate_llmsherpa_url(
            url=llmsherpa_api_url,
            new_indent_parser=new_indent_parser,
            apply_ocr=apply_ocr,
        )

        self.strategy = strategy
        self.file_path = str(file_path)

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Check if the url is valid."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
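
    # For illustration: urlparse("http://localhost:5010/api/parseDocument")
    # reports scheme="http" and netloc="localhost:5010", so such a URL passes;
    # a bare string like "parseDocument" has neither and is rejected.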

    @staticmethod
    def _validate_llmsherpa_url(
        url: str, new_indent_parser: bool = True, apply_ocr: bool = True
    ) -> str:
        """Validate the LLMSherpa URL and append the required query parameters."""
        parsed = urlparse(url)
        valid_url = url
        if ("/api/parseDocument" not in parsed.path) and (
            "/api/document/developer/parseDocument" not in parsed.path
        ):
            raise ValueError(f"Invalid LLMSherpa URL: {url}")

        if "renderFormat=all" not in parsed.query:
            valid_url = valid_url + "?renderFormat=all"
        if new_indent_parser and "useNewIndentParser=true" not in parsed.query:
            valid_url = valid_url + "&useNewIndentParser=true"
        if apply_ocr and "applyOcr=yes" not in parsed.query:
            valid_url = valid_url + "&applyOcr=yes"

        return valid_url
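
    # For illustration: starting from the docstring's example URL
    # "http://localhost:5010/api/parseDocument?renderFormat=all" with the
    # default flags, this method appends "&useNewIndentParser=true&applyOcr=yes".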

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Load file."""
        from llmsherpa.readers import LayoutPDFReader

        docs_reader = LayoutPDFReader(self.url)
        doc = docs_reader.read_pdf(self.file_path)

        # "sections" and "chunks" yield one Document per element; "html" and
        # "text" yield a single Document covering the whole file.
        if self.strategy == "sections":
            yield from [
                Document(
                    page_content=section.to_text(include_children=True, recurse=True),
                    metadata={
                        "source": self.file_path,
                        "section_number": section_num,
                        "section_title": section.title,
                    },
                )
                for section_num, section in enumerate(doc.sections())
            ]
        if self.strategy == "chunks":
            yield from [
                Document(
                    page_content=chunk.to_context_text(),
                    metadata={
                        "source": self.file_path,
                        "chunk_number": chunk_num,
                        "chunk_type": chunk.tag,
                    },
                )
                for chunk_num, chunk in enumerate(doc.chunks())
            ]
        if self.strategy == "html":
            yield from [
                Document(
                    page_content=doc.to_html(),
                    metadata={
                        "source": self.file_path,
                    },
                )
            ]
        if self.strategy == "text":
            yield from [
                Document(
                    page_content=doc.to_text(),
                    metadata={
                        "source": self.file_path,
                    },
                )
            ]
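

# ---------------------------------------------------------------------------
# Usage sketch showing lazy_load() with the "sections" strategy. The PDF path
# and the local LLMSherpa API URL are illustrative assumptions; running this
# requires the `llmsherpa` package and a reachable parsing server.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    loader = LLMSherpaFileLoader(
        "example.pdf",  # assumed sample file
        strategy="sections",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    for document in loader.lazy_load():
        # Each section-strategy Document carries the section title and number
        # in its metadata.
        print(document.metadata.get("section_title"), len(document.page_content))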