Improvement[Community] Improve Document Loaders and Splitters (#27568)

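- Fix a spelling error in the `PyPDFLoader` usage example (`my-pasword` → `my-password`)
- Add the `@staticmethod` decorator to `MathpixPDFLoader.clean_pdf` (it no longer takes `self`)
- Fix the backslash escaping of the `\begin{align}` separator in `RecursiveCharacterTextSplitter` so it matches the neighbouring `\begin{...}` separators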

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
ZhangShenao 2024-10-25 05:42:16 +08:00 committed by GitHub
parent 7345470669
commit 455ab7d714
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 3 deletions

View File

@ -178,7 +178,7 @@ class PyPDFLoader(BasePDFLoader):
loader = PyPDFLoader( loader = PyPDFLoader(
file_path = "./example_data/layout-parser-paper.pdf", file_path = "./example_data/layout-parser-paper.pdf",
password = "my-pasword", password = "my-password",
extract_images = True, extract_images = True,
# headers = None # headers = None
# extraction_mode = "plain", # extraction_mode = "plain",
@ -572,7 +572,8 @@ class MathpixPDFLoader(BasePDFLoader):
response = requests.get(url, headers=self._mathpix_headers) response = requests.get(url, headers=self._mathpix_headers)
return response.content.decode("utf-8") return response.content.decode("utf-8")
def clean_pdf(self, contents: str) -> str: @staticmethod
def clean_pdf(contents: str) -> str:
"""Clean the PDF file. """Clean the PDF file.
Args: Args:

View File

@ -465,7 +465,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
"\n\\\\begin{verse}", "\n\\\\begin{verse}",
"\n\\\\begin{verbatim}", "\n\\\\begin{verbatim}",
# Now split by math environments # Now split by math environments
"\n\\\begin{align}", "\n\\\\begin{align}",
"$$", "$$",
"$", "$",
# Now split by the normal type of lines # Now split by the normal type of lines