Improvement[Community] Improve Document Loaders and Splitters (#27568)

- Fix word spelling error
- Add static method decorator
- Fix language splitter

Co-authored-by: Erick Friis <erick@langchain.dev>
2024-11-13 19:10:52 +00:00 · 2024-10-25 05:42:16 +08:00 · 2024-10-25 05:42:16 +08:00 · 455ab7d714
commit 455ab7d714
parent 7345470669
2 changed files with 4 additions and 3 deletions
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -178,7 +178,7 @@ class PyPDFLoader(BasePDFLoader):

            loader = PyPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
-                password = "my-pasword",
+                password = "my-password",
                extract_images = True,
                # headers = None
                # extraction_mode = "plain",
@@ -572,7 +572,8 @@ class MathpixPDFLoader(BasePDFLoader):
        response = requests.get(url, headers=self._mathpix_headers)
        return response.content.decode("utf-8")

-    def clean_pdf(self, contents: str) -> str:
+    @staticmethod
+    def clean_pdf(contents: str) -> str:
        """Clean the PDF file.

        Args:
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -465,7 +465,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments
-                "\n\\\begin{align}",
+                "\n\\\\begin{align}",
                "$$",
                "$",
                # Now split by the normal type of lines