From 455ab7d714a0bf971fbd1e3d3666e7bf8f81bcce Mon Sep 17 00:00:00 2001
From: ZhangShenao <15201440436@163.com>
Date: Fri, 25 Oct 2024 05:42:16 +0800
Subject: [PATCH] Improvement[Community] Improve Document Loaders and Splitters (#27568)

- Fix word spelling error
- Add static method decorator
- Fix language splitter

Co-authored-by: Erick Friis
---
 libs/community/langchain_community/document_loaders/pdf.py | 5 +++--
 libs/text-splitters/langchain_text_splitters/character.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index b01ffea8ef..528d34c43b 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -178,7 +178,7 @@ class PyPDFLoader(BasePDFLoader):
 
             loader = PyPDFLoader(
                 file_path = "./example_data/layout-parser-paper.pdf",
-                password = "my-pasword",
+                password = "my-password",
                 extract_images = True,
                 # headers = None
                 # extraction_mode = "plain",
@@ -572,7 +572,8 @@ class MathpixPDFLoader(BasePDFLoader):
         response = requests.get(url, headers=self._mathpix_headers)
         return response.content.decode("utf-8")
 
-    def clean_pdf(self, contents: str) -> str:
+    @staticmethod
+    def clean_pdf(contents: str) -> str:
         """Clean the PDF file.
 
         Args:
diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py
index 85124b39de..f65c38869d 100644
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -465,7 +465,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 "\n\\\\begin{verse}",
                 "\n\\\\begin{verbatim}",
                 # Now split by math environments
-                "\n\\\begin{align}",
+                "\n\\\\begin{align}",
                 "$$",
                 "$",
                 # Now split by the normal type of lines