Fix: duplicated characters in the results when using the `pdfplumber` loader (#10165)

(Reopens PR #7706, in the hope that this finally fixes the problem.)

When using `pdfplumber`, some documents may be parsed incorrectly,
resulting in **duplicated characters**.

Taking the
[linked](https://bruusgaard.no/wp-content/uploads/2021/05/Datasheet1000-series.pdf)
document as an example:

## Before
```python
from langchain.document_loaders import PDFPlumberLoader

pdf_file = 'file.pdf'
loader = PDFPlumberLoader(pdf_file)
docs = loader.load()
print(docs[0].page_content)
```

Results:
```
11000000 SSeerriieess
PPoorrttaabbllee ssiinnggllee ggaass ddeetteeccttoorrss ffoorr HHyyddrrooggeenn aanndd CCoommbbuussttiibbllee ggaasseess
TThhee RRiikkeenn KKeeiikkii GGPP--11000000 iiss aa ccoommppaacctt aanndd
lliigghhttwweeiigghhtt ggaass ddeetteeccttoorr wwiitthh hhiigghh sseennssiittiivviittyy ffoorr
tthhee ddeetteeccttiioonn ooff hhyyddrrooccaarrbboonnss.. TThhee mmeeaassuurreemmeenntt
iiss ppeerrffoorrmmeedd ffoorr tthhiiss ppuurrppoossee bbyy mmeeaannss ooff ccaattaallyyttiicc
sseennssoorr.. TThhee GGPP--11000000 hhaass aa bbuuiilltt--iinn ppuummpp wwiitthh
ppuummpp bboooosstteerr ffuunnccttiioonn aanndd aa ddiirreecctt sseelleeccttiioonn ffrroomm
aa lliisstt ooff 2255 hhyyddrrooccaarrbboonnss ffoorr eexxaacctt aalliiggnnmmeenntt ooff tthhee
ttaarrggeett ggaass -- OOnnllyy ccaalliibbrraattiioonn oonn CCHH iiss nneecceessssaarryy..
44
FFeeaattuurreess
TThhee RRiikkeenn KKeeiikkii 110000vvvvttaabbllee ssiinnggllee HHyyddrrooggeenn aanndd
CCoommbbuussttiibbllee ggaass ddeetteeccttoorrss..
TThheerree aarree 33 ssttaannddaarrdd mmooddeellss::
GGPP--11000000:: 00--1100%%LLEELL // 00--110000%%LLEELL ›› LLEELL ddeetteeccttoorr
NNCC--11000000:: 00--11000000ppppmm // 00--1100000000ppppmm ›› PPPPMM
ddeetteeccttoorr
DDiirreecctt rreeaaddiinngg ooff tthhee ccoonncceennttrraattiioonn vvaalluueess ooff
ccoommbbuussttiibbllee ggaasseess ooff 2255 ggaasseess ((55 NNPP--11000000))..
EEaassyy ooppeerraattiioonn ffeeaattuurree ooff cchhaannggiinngg tthhee ggaass nnaammee
ddiissppllaayy wwiitthh 11 sswwiittcchh bbuuttttoonn..
LLoonngg ddiissttaannccee ddrraawwiinngg ppoossssiibbllee wwiitthh tthhee ppuummpp
bboooosstteerr ffuunnccttiioonn..
VVaarriioouuss ccoommbbuussttiibbllee ggaasseess ccaann bbee mmeeaassuurreedd bbyy tthhee
ppppmm oorrddeerr wwiitthh NNCC--11000000..
www.bruusgaard.no postmaster@bruusgaard.no +47 67 54 93 30 Rev: 446-2
```

We can see that there are a large number of duplicated characters in the
text, which can cause issues in subsequent applications.

## After

Therefore, based on the
[solution](https://github.com/jsvine/pdfplumber/issues/71) provided by
the upstream `pdfplumber` project, I use its `dedupe_chars()` method to
address this problem. (Just set the `dedupe` parameter to `True`.)

```python
from langchain.document_loaders import PDFPlumberLoader

pdf_file = 'file.pdf'
loader = PDFPlumberLoader(pdf_file, dedupe=True)
docs = loader.load()
print(docs[0].page_content)
```

Results:

```
1000 Series
Portable single gas detectors for Hydrogen and Combustible gases
The Riken Keiki GP-1000 is a compact and
lightweight gas detector with high sensitivity for
the detection of hydrocarbons. The measurement
is performed for this purpose by means of catalytic
sensor. The GP-1000 has a built-in pump with
pump booster function and a direct selection from
a list of 25 hydrocarbons for exact alignment of the
target gas - Only calibration on CH is necessary.
4
Features
The Riken Keiki 100vvtable single Hydrogen and
Combustible gas detectors.
There are 3 standard models:
GP-1000: 0-10%LEL / 0-100%LEL › LEL detector
NC-1000: 0-1000ppm / 0-10000ppm › PPM
detector
Direct reading of the concentration values of
combustible gases of 25 gases (5 NP-1000).
Easy operation feature of changing the gas name
display with 1 switch button.
Long distance drawing possible with the pump
booster function.
Various combustible gases can be measured by the
ppm order with NC-1000.
www.bruusgaard.no postmaster@bruusgaard.no +47 67 54 93 30 Rev: 446-2
```
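
Under the hood, the new code path simply calls pdfplumber's `Page.dedupe_chars()` before `extract_text()`, as suggested in the upstream issue. A minimal sketch of the equivalent raw `pdfplumber` call (using the same placeholder `file.pdf` as above):

```python
import pdfplumber

pdf_file = "file.pdf"  # placeholder: the affected document

with pdfplumber.open(pdf_file) as pdf:
    page = pdf.pages[0]
    # dedupe_chars() drops char objects that duplicate an existing char
    # (same text, font and near-identical position); extract_text() then
    # returns the de-duplicated page text.
    print(page.dedupe_chars().extract_text())
```

The loader only adds the `dedupe` flag on top of this, so any existing `text_kwargs` are still passed through to `extract_text()` unchanged.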

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>

```diff
@@ -1,11 +1,16 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional, Sequence, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Iterator, Mapping, Optional, Sequence, Union
 from urllib.parse import urlparse

 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.schema import Document

+if TYPE_CHECKING:
+    import pdfplumber.page
+

 class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf` and chunk at character level."""
@@ -116,13 +121,17 @@ class PyPDFium2Parser(BaseBlobParser):
 class PDFPlumberParser(BaseBlobParser):
     """Parse `PDF` with `PDFPlumber`."""

-    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
+    def __init__(
+        self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False
+    ) -> None:
         """Initialize the parser.

         Args:
             text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
         """
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -133,7 +142,7 @@ class PDFPlumberParser(BaseBlobParser):
             yield from [
                 Document(
-                    page_content=page.extract_text(**self.text_kwargs),
+                    page_content=self._process_page_content(page),
                     metadata=dict(
                         {
                             "source": blob.source,
@@ -151,6 +160,12 @@ class PDFPlumberParser(BaseBlobParser):
                 for page in doc.pages
             ]

+    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
+        """Process the page content based on dedupe."""
+        if self.dedupe:
+            return page.dedupe_chars().extract_text(**self.text_kwargs)
+        return page.extract_text(**self.text_kwargs)
+

 class AmazonTextractPDFParser(BaseBlobParser):
     """Send `PDF` files to `Amazon Textract` and parse them.
```

```diff
@@ -437,7 +437,10 @@ class PDFPlumberLoader(BasePDFLoader):
     """Load `PDF` files using `pdfplumber`."""

     def __init__(
-        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
+        self,
+        file_path: str,
+        text_kwargs: Optional[Mapping[str, Any]] = None,
+        dedupe: bool = False,
     ) -> None:
         """Initialize with a file path."""
         try:
@@ -450,11 +453,12 @@ class PDFPlumberLoader(BasePDFLoader):
         super().__init__(file_path)
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe

     def load(self) -> List[Document]:
         """Load file."""
-        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
+        parser = PDFPlumberParser(text_kwargs=self.text_kwargs, dedupe=self.dedupe)
         blob = Blob.from_path(self.file_path)
         return parser.parse(blob)
```

```diff
@@ -8,3 +8,4 @@ _EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples"
 # Paths to test PDF files
 HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
 LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
+DUPLICATE_CHARS = _EXAMPLES_DIR / "duplicate-chars.pdf"
```

```diff
@@ -19,6 +19,10 @@ LAYOUT_PARSER_PAPER_PDF = (
     Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
 )
+DUPLICATE_CHARS = (
+    Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
+)


 def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
     """Standard tests to verify that the given parser works.
@@ -59,6 +63,26 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
         assert metadata["page"] == 0


+def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
+    """PDFPlumber tests to verify that duplicate characters appear or not
+
+    Args:
+        parser (BaseBlobParser): The parser to test.
+        splits_by_page (bool): Whether the parser splits by page or not by default.
+        dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
+    """
+    blob = Blob.from_path(DUPLICATE_CHARS)
+    doc_generator = parser.lazy_parse(blob)
+    assert isinstance(doc_generator, Iterator)
+    docs = list(doc_generator)
+
+    if dedupe:
+        # use dedupe avoid duplicate characters.
+        assert "1000 Series" == docs[0].page_content.split("\n")[0]
+    else:
+        # duplicate characters will appear in doc if not dedupe
+        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
+
+
 def test_pymupdf_loader() -> None:
     """Test PyMuPDF loader."""
     _assert_with_parser(PyMuPDFParser())
@@ -84,3 +108,5 @@ def test_pypdfium2_parser() -> None:
 def test_pdfplumber_parser() -> None:
     """Test PDFPlumber parser."""
     _assert_with_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
```
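
For reference, the `dedupe` option can also be used at the parser level, which is what the new test exercises. A minimal sketch (again with a placeholder `file.pdf` path):

```python
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import PDFPlumberParser

# Parse a blob directly, with character de-duplication enabled.
parser = PDFPlumberParser(dedupe=True)
docs = parser.parse(Blob.from_path("file.pdf"))  # placeholder path

# First line of the first page, now without doubled characters.
print(docs[0].page_content.split("\n")[0])
```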
