mirror of
https://github.com/hwchase17/langchain
synced 2024-11-13 19:10:52 +00:00
Improvement[Community] Improve Document Loaders and Splitters (#27568)
- Fix word spelling error
- Add static method decorator
- Fix language splitter

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
7345470669
commit
455ab7d714
@ -178,7 +178,7 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
loader = PyPDFLoader(
|
loader = PyPDFLoader(
|
||||||
file_path = "./example_data/layout-parser-paper.pdf",
|
file_path = "./example_data/layout-parser-paper.pdf",
|
||||||
password = "my-pasword",
|
password = "my-password",
|
||||||
extract_images = True,
|
extract_images = True,
|
||||||
# headers = None
|
# headers = None
|
||||||
# extraction_mode = "plain",
|
# extraction_mode = "plain",
|
||||||
@ -572,7 +572,8 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
response = requests.get(url, headers=self._mathpix_headers)
|
response = requests.get(url, headers=self._mathpix_headers)
|
||||||
return response.content.decode("utf-8")
|
return response.content.decode("utf-8")
|
||||||
|
|
||||||
def clean_pdf(self, contents: str) -> str:
|
@staticmethod
|
||||||
|
def clean_pdf(contents: str) -> str:
|
||||||
"""Clean the PDF file.
|
"""Clean the PDF file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -465,7 +465,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
"\n\\\\begin{verse}",
|
"\n\\\\begin{verse}",
|
||||||
"\n\\\\begin{verbatim}",
|
"\n\\\\begin{verbatim}",
|
||||||
# Now split by math environments
|
# Now split by math environments
|
||||||
"\n\\\begin{align}",
|
"\n\\\\begin{align}",
|
||||||
"$$",
|
"$$",
|
||||||
"$",
|
"$",
|
||||||
# Now split by the normal type of lines
|
# Now split by the normal type of lines
|
||||||
|
Loading…
Reference in New Issue
Block a user