Fix: duplicated characters in the results when using the `pdfplumber` loader (#10165)

(Reopens PR #7706, in the hope that this finally fixes the problem.)

When using `pdfplumber`, some documents may be parsed incorrectly,
resulting in **duplicated characters**.

Taking the
[linked](https://bruusgaard.no/wp-content/uploads/2021/05/Datasheet1000-series.pdf)
document as an example:

## Before
```python
from langchain.document_loaders import PDFPlumberLoader

pdf_file = 'file.pdf'
loader = PDFPlumberLoader(pdf_file)
docs = loader.load()
print(docs[0].page_content)
```

Results:
```
11000000 SSeerriieess
PPoorrttaabbllee ssiinnggllee ggaass ddeetteeccttoorrss ffoorr HHyyddrrooggeenn aanndd CCoommbbuussttiibbllee ggaasseess
TThhee RRiikkeenn KKeeiikkii GGPP--11000000 iiss aa ccoommppaacctt aanndd
lliigghhttwweeiigghhtt ggaass ddeetteeccttoorr wwiitthh hhiigghh sseennssiittiivviittyy ffoorr
tthhee ddeetteeccttiioonn ooff hhyyddrrooccaarrbboonnss.. TThhee mmeeaassuurreemmeenntt
iiss ppeerrffoorrmmeedd ffoorr tthhiiss ppuurrppoossee bbyy mmeeaannss ooff ccaattaallyyttiicc
sseennssoorr.. TThhee GGPP--11000000 hhaass aa bbuuiilltt--iinn ppuummpp wwiitthh
ppuummpp bboooosstteerr ffuunnccttiioonn aanndd aa ddiirreecctt sseelleeccttiioonn ffrroomm
aa lliisstt ooff 2255 hhyyddrrooccaarrbboonnss ffoorr eexxaacctt aalliiggnnmmeenntt ooff tthhee
ttaarrggeett ggaass -- OOnnllyy ccaalliibbrraattiioonn oonn CCHH iiss nneecceessssaarryy..
44
FFeeaattuurreess
TThhee RRiikkeenn KKeeiikkii 110000vvvvttaabbllee ssiinnggllee HHyyddrrooggeenn aanndd
CCoommbbuussttiibbllee ggaass ddeetteeccttoorrss..
TThheerree aarree 33 ssttaannddaarrdd mmooddeellss::
GGPP--11000000:: 00--1100%%LLEELL // 00--110000%%LLEELL ›› LLEELL ddeetteeccttoorr
NNCC--11000000:: 00--11000000ppppmm // 00--1100000000ppppmm ›› PPPPMM
ddeetteeccttoorr
DDiirreecctt rreeaaddiinngg ooff tthhee ccoonncceennttrraattiioonn vvaalluueess ooff
ccoommbbuussttiibbllee ggaasseess ooff 2255 ggaasseess ((55 NNPP--11000000))..
EEaassyy ooppeerraattiioonn ffeeaattuurree ooff cchhaannggiinngg tthhee ggaass nnaammee
ddiissppllaayy wwiitthh 11 sswwiittcchh bbuuttttoonn..
LLoonngg ddiissttaannccee ddrraawwiinngg ppoossssiibbllee wwiitthh tthhee ppuummpp
bboooosstteerr ffuunnccttiioonn..
VVaarriioouuss ccoommbbuussttiibbllee ggaasseess ccaann bbee mmeeaassuurreedd bbyy tthhee
ppppmm oorrddeerr wwiitthh NNCC--11000000..
www.bruusgaard.no postmaster@bruusgaard.no +47 67 54 93 30 Rev: 446-2
```

We can see that there are a large number of duplicated characters in the
text, which can cause issues in subsequent applications.

## After

Therefore, based on the
[solution](https://github.com/jsvine/pdfplumber/issues/71) provided by
the upstream `pdfplumber` project, I use its `dedupe_chars()` method to
address this problem. (Just set the `dedupe` parameter to `True`.)

```python
from langchain.document_loaders import PDFPlumberLoader

pdf_file = 'file.pdf'
loader = PDFPlumberLoader(pdf_file, dedupe=True)
docs = loader.load()
print(docs[0].page_content)
```

Results:

```
1000 Series
Portable single gas detectors for Hydrogen and Combustible gases
The Riken Keiki GP-1000 is a compact and
lightweight gas detector with high sensitivity for
the detection of hydrocarbons. The measurement
is performed for this purpose by means of catalytic
sensor. The GP-1000 has a built-in pump with
pump booster function and a direct selection from
a list of 25 hydrocarbons for exact alignment of the
target gas - Only calibration on CH is necessary.
4
Features
The Riken Keiki 100vvtable single Hydrogen and
Combustible gas detectors.
There are 3 standard models:
GP-1000: 0-10%LEL / 0-100%LEL › LEL detector
NC-1000: 0-1000ppm / 0-10000ppm › PPM
detector
Direct reading of the concentration values of
combustible gases of 25 gases (5 NP-1000).
Easy operation feature of changing the gas name
display with 1 switch button.
Long distance drawing possible with the pump
booster function.
Various combustible gases can be measured by the
ppm order with NC-1000.
www.bruusgaard.no postmaster@bruusgaard.no +47 67 54 93 30 Rev: 446-2
```
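
Under the hood, the new code path simply calls pdfplumber's `Page.dedupe_chars()` before `extract_text()`, as suggested in the upstream issue. A minimal sketch of the equivalent raw `pdfplumber` call (using the same placeholder `file.pdf` as above):

```python
import pdfplumber

pdf_file = "file.pdf"  # placeholder: the affected document

with pdfplumber.open(pdf_file) as pdf:
    page = pdf.pages[0]
    # dedupe_chars() drops char objects that duplicate an existing char
    # (same text, font and near-identical position); extract_text() then
    # returns the de-duplicated page text.
    print(page.dedupe_chars().extract_text())
```

The loader only adds the `dedupe` flag on top of this, so any existing `text_kwargs` are still passed through to `extract_text()` unchanged.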

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>

```diff
@@ -1,11 +1,16 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional, Sequence, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Iterator, Mapping, Optional, Sequence, Union
 from urllib.parse import urlparse

 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.schema import Document

+if TYPE_CHECKING:
+    import pdfplumber.page
+

 class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf` and chunk at character level."""
@@ -116,13 +121,17 @@ class PyPDFium2Parser(BaseBlobParser):
 class PDFPlumberParser(BaseBlobParser):
     """Parse `PDF` with `PDFPlumber`."""

-    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
+    def __init__(
+        self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False
+    ) -> None:
         """Initialize the parser.

         Args:
             text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
         """
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -133,7 +142,7 @@ class PDFPlumberParser(BaseBlobParser):
             yield from [
                 Document(
-                    page_content=page.extract_text(**self.text_kwargs),
+                    page_content=self._process_page_content(page),
                     metadata=dict(
                         {
                             "source": blob.source,
@@ -151,6 +160,12 @@ class PDFPlumberParser(BaseBlobParser):
                 for page in doc.pages
             ]

+    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
+        """Process the page content based on dedupe."""
+        if self.dedupe:
+            return page.dedupe_chars().extract_text(**self.text_kwargs)
+        return page.extract_text(**self.text_kwargs)
+

 class AmazonTextractPDFParser(BaseBlobParser):
     """Send `PDF` files to `Amazon Textract` and parse them.
```

```diff
@@ -437,7 +437,10 @@ class PDFPlumberLoader(BasePDFLoader):
     """Load `PDF` files using `pdfplumber`."""

     def __init__(
-        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
+        self,
+        file_path: str,
+        text_kwargs: Optional[Mapping[str, Any]] = None,
+        dedupe: bool = False,
     ) -> None:
         """Initialize with a file path."""
         try:
@@ -450,11 +453,12 @@ class PDFPlumberLoader(BasePDFLoader):
         super().__init__(file_path)
         self.text_kwargs = text_kwargs or {}
+        self.dedupe = dedupe

     def load(self) -> List[Document]:
         """Load file."""
-        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
+        parser = PDFPlumberParser(text_kwargs=self.text_kwargs, dedupe=self.dedupe)
         blob = Blob.from_path(self.file_path)
         return parser.parse(blob)
```

```diff
@@ -8,3 +8,4 @@ _EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples"
 # Paths to test PDF files
 HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
 LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
+DUPLICATE_CHARS = _EXAMPLES_DIR / "duplicate-chars.pdf"
```

```diff
@@ -19,6 +19,10 @@ LAYOUT_PARSER_PAPER_PDF = (
     Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
 )
+DUPLICATE_CHARS = (
+    Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
+)


 def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
     """Standard tests to verify that the given parser works.
@@ -59,6 +63,26 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
         assert metadata["page"] == 0


+def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
+    """PDFPlumber tests to verify that duplicate characters appear or not
+
+    Args:
+        parser (BaseBlobParser): The parser to test.
+        splits_by_page (bool): Whether the parser splits by page or not by default.
+        dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
+    """
+    blob = Blob.from_path(DUPLICATE_CHARS)
+    doc_generator = parser.lazy_parse(blob)
+    assert isinstance(doc_generator, Iterator)
+    docs = list(doc_generator)
+
+    if dedupe:
+        # use dedupe avoid duplicate characters.
+        assert "1000 Series" == docs[0].page_content.split("\n")[0]
+    else:
+        # duplicate characters will appear in doc if not dedupe
+        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
+
+
 def test_pymupdf_loader() -> None:
     """Test PyMuPDF loader."""
     _assert_with_parser(PyMuPDFParser())
@@ -84,3 +108,5 @@ def test_pypdfium2_parser() -> None:
 def test_pdfplumber_parser() -> None:
     """Test PDFPlumber parser."""
     _assert_with_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser())
+    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
```
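
For reference, the `dedupe` option can also be used at the parser level, which is what the new test exercises. A minimal sketch (again with a placeholder `file.pdf` path):

```python
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import PDFPlumberParser

# Parse a blob directly, with character de-duplication enabled.
parser = PDFPlumberParser(dedupe=True)
docs = parser.parse(Blob.from_path("file.pdf"))  # placeholder path

# First line of the first page, now without doubled characters.
print(docs[0].page_content.split("\n")[0])
```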
