community: better support of pathlib paths in document loaders (#18396)

So this arose from the https://github.com/langchain-ai/langchain/pull/18397 problem of document loaders not supporting `pathlib.Path`. This pull request provides more uniform support for `Path` as an argument. The core ideas for this upgrade:
- if a local file path is used as an argument, it should be supported as `pathlib.Path`;
- if there are external calls that may or may not support `pathlib`, the argument is immediately converted to `str`;
- if `self.file_path` is used in a way that allows it to stay a `pathlib.Path` without conversion, it is only converted to `str` for the metadata.
Twitter handle: https://twitter.com/mwmajewsk
3 months ago · f7a1fd91b8
parent 94b869a974
commit f7a1fd91b8
32 changed files with 147 additions and 80 deletions
--- a/libs/community/langchain_community/document_loaders/acreom.py
+++ b/libs/community/langchain_community/document_loaders/acreom.py
@ -1,6 +1,6 @@
 import re
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, Union

 from langchain_core.documents import Document

@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
    """Regex to match front matter metadata in markdown files."""

    def __init__(
-        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
+        self,
+        path: Union[str, Path],
+        encoding: str = "UTF-8",
+        collect_metadata: bool = True,
    ):
        """Initialize the loader."""
        self.file_path = path
--- a/libs/community/langchain_community/document_loaders/airbyte_json.py
+++ b/libs/community/langchain_community/document_loaders/airbyte_json.py
@ -1,5 +1,6 @@
 import json
-from typing import List
+from pathlib import Path
+from typing import List, Union

 from langchain_core.documents import Document
 from langchain_core.utils import stringify_dict
@ -10,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class AirbyteJSONLoader(BaseLoader):
    """Load local `Airbyte` json files."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
        self.file_path = file_path
        """Path to the directory containing the json files."""
@ -20,5 +21,5 @@ class AirbyteJSONLoader(BaseLoader):
        for line in open(self.file_path, "r"):
            data = json.loads(line)["_airbyte_data"]
            text += stringify_dict(data)
-        metadata = {"source": self.file_path}
+        metadata = {"source": str(self.file_path)}
        return [Document(page_content=text, metadata=metadata)]
--- a/libs/community/langchain_community/document_loaders/assemblyai.py
+++ b/libs/community/langchain_community/document_loaders/assemblyai.py
@ -1,7 +1,8 @@
 from __future__ import annotations

 from enum import Enum
-from typing import TYPE_CHECKING, Iterator, Optional
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterator, Optional, Union

 import requests
 from langchain_core.documents import Document
@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, Path],
        *,
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        config: Optional[assemblyai.TranscriptionConfig] = None,
@ -71,7 +72,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
        if api_key is not None:
            assemblyai.settings.api_key = api_key

-        self.file_path = file_path
+        self.file_path = str(file_path)
        self.transcript_format = transcript_format
        self.transcriber = assemblyai.Transcriber(config=config)

--- a/libs/community/langchain_community/document_loaders/conllu.py
+++ b/libs/community/langchain_community/document_loaders/conllu.py
@ -1,5 +1,6 @@
 import csv
-from typing import List
+from pathlib import Path
+from typing import List, Union

 from langchain_core.documents import Document

@ -9,7 +10,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class CoNLLULoader(BaseLoader):
    """Load `CoNLL-U` files."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path."""
        self.file_path = file_path

@ -29,5 +30,5 @@ class CoNLLULoader(BaseLoader):
            else:
                text += line[1] + " "

-        metadata = {"source": self.file_path}
+        metadata = {"source": str(self.file_path)}
        return [Document(page_content=text, metadata=metadata)]
--- a/libs/community/langchain_community/document_loaders/csv_loader.py
+++ b/libs/community/langchain_community/document_loaders/csv_loader.py
@ -1,6 +1,7 @@
 import csv
 from io import TextIOWrapper
-from typing import Any, Dict, Iterator, List, Optional, Sequence
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union

 from langchain_core.documents import Document

@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, Path],
        source_column: Optional[str] = None,
        metadata_columns: Sequence[str] = (),
        csv_args: Optional[Dict] = None,
@ -89,7 +90,7 @@ class CSVLoader(BaseLoader):
                source = (
                    row[self.source_column]
                    if self.source_column is not None
-                    else self.file_path
+                    else str(self.file_path)
                )
            except KeyError:
                raise ValueError(
--- a/libs/community/langchain_community/document_loaders/email.py
+++ b/libs/community/langchain_community/document_loaders/email.py
@ -1,5 +1,6 @@
 import os
-from typing import Any, Iterator, List
+from pathlib import Path
+from typing import Any, Iterator, List, Union

 from langchain_core.documents import Document

@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        process_attachments = unstructured_kwargs.get("process_attachments")
        attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
    https://github.com/TeamMsgExtractor/msg-extractor
    """

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path.

        Args:
            file_path: The path to the Outlook Message file.
        """

-        self.file_path = file_path
+        self.file_path = str(file_path)

        if not os.path.isfile(self.file_path):
-            raise ValueError("File path %s is not a valid file" % self.file_path)
+            raise ValueError(f"File path {self.file_path} is not a valid file")

        try:
            import extract_msg  # noqa:F401
--- a/libs/community/langchain_community/document_loaders/evernote.py
+++ b/libs/community/langchain_community/document_loaders/evernote.py
@ -5,8 +5,9 @@ https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
 import hashlib
 import logging
 from base64 import b64decode
+from pathlib import Path
 from time import strptime
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Union

 from langchain_core.documents import Document

@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
            the 'source' which contains the file name of the export.
    """  # noqa: E501

-    def __init__(self, file_path: str, load_single_document: bool = True):
+    def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
        """Initialize with file path."""
-        self.file_path = file_path
+        self.file_path = str(file_path)
        self.load_single_document = load_single_document

    def _lazy_load(self) -> Iterator[Document]:
--- a/libs/community/langchain_community/document_loaders/excel.py
+++ b/libs/community/langchain_community/document_loaders/excel.py
@ -1,5 +1,6 @@
 """Loads Microsoft Excel files."""
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """

--- a/libs/community/langchain_community/document_loaders/facebook_chat.py
+++ b/libs/community/langchain_community/document_loaders/facebook_chat.py
@ -1,7 +1,7 @@
 import datetime
 import json
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, Union

 from langchain_core.documents import Document

@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
 class FacebookChatLoader(BaseLoader):
    """Load `Facebook Chat` messages directory dump."""

-    def __init__(self, path: str):
+    def __init__(self, path: Union[str, Path]):
        """Initialize with a path."""
        self.file_path = path

--- a/libs/community/langchain_community/document_loaders/helpers.py
+++ b/libs/community/langchain_community/document_loaders/helpers.py
@ -1,7 +1,8 @@
 """Document loader helpers."""

 import concurrent.futures
-from typing import List, NamedTuple, Optional, cast
+from pathlib import Path
+from typing import List, NamedTuple, Optional, Union, cast


 class FileEncoding(NamedTuple):
@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
    """The language of the file."""


-def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
+def detect_file_encodings(
+    file_path: Union[str, Path], timeout: int = 5
+) -> List[FileEncoding]:
    """Try to detect the file encoding.

    Returns a list of `FileEncoding` tuples with the detected encodings ordered
@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
    """
    import chardet

+    file_path = str(file_path)
+
    def read_and_detect(file_path: str) -> List[dict]:
        with open(file_path, "rb") as f:
            rawdata = f.read()
--- a/libs/community/langchain_community/document_loaders/html_bs.py
+++ b/libs/community/langchain_community/document_loaders/html_bs.py
@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from typing import Dict, Iterator, Union

 from langchain_core.documents import Document
@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
@ -57,7 +58,7 @@ class BSHTMLLoader(BaseLoader):
            title = ""

        metadata: Dict[str, Union[str, None]] = {
-            "source": self.file_path,
+            "source": str(self.file_path),
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
--- a/libs/community/langchain_community/document_loaders/image_captions.py
+++ b/libs/community/langchain_community/document_loaders/image_captions.py
@ -1,4 +1,5 @@
 from io import BytesIO
+from pathlib import Path
 from typing import Any, List, Tuple, Union

 import requests
@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):

    def __init__(
        self,
-        images: Union[str, bytes, List[Union[str, bytes]]],
+        images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
        blip_processor: str = "Salesforce/blip-image-captioning-base",
        blip_model: str = "Salesforce/blip-image-captioning-base",
    ):
@ -29,7 +30,7 @@ class ImageCaptionLoader(BaseLoader):
            blip_processor: The name of the pre-trained BLIP processor.
            blip_model: The name of the pre-trained BLIP model.
        """
-        if isinstance(images, (str, bytes)):
+        if isinstance(images, (str, Path, bytes)):
            self.images = [images]
        else:
            self.images = images
@ -61,7 +62,7 @@ class ImageCaptionLoader(BaseLoader):
        return results

    def _get_captions_and_metadata(
-        self, model: Any, processor: Any, image: Union[str, bytes]
+        self, model: Any, processor: Any, image: Union[str, Path, bytes]
    ) -> Tuple[str, dict]:
        """Helper function for getting the captions and metadata of an image."""
        try:
@ -76,7 +77,9 @@ class ImageCaptionLoader(BaseLoader):
        try:
            if isinstance(image, bytes):
                image = Image.open(BytesIO(image)).convert("RGB")
-            elif image.startswith("http://") or image.startswith("https://"):
+            elif isinstance(image, str) and (
+                image.startswith("http://") or image.startswith("https://")
+            ):
                image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
            else:
                image = Image.open(image).convert("RGB")
@ -94,6 +97,6 @@ class ImageCaptionLoader(BaseLoader):
        if isinstance(image_source, bytes):
            metadata: dict = {"image_source": "Image bytes provided"}
        else:
-            metadata = {"image_path": image_source}
+            metadata = {"image_path": str(image_source)}

        return caption, metadata
--- a/libs/community/langchain_community/document_loaders/mhtml.py
+++ b/libs/community/langchain_community/document_loaders/mhtml.py
@ -1,5 +1,6 @@
 import email
 import logging
+from pathlib import Path
 from typing import Dict, Iterator, Union

 from langchain_core.documents import Document
@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
@ -69,7 +70,7 @@ class MHTMLLoader(BaseLoader):
                        title = ""

                    metadata: Dict[str, Union[str, None]] = {
-                        "source": self.file_path,
+                        "source": str(self.file_path),
                        "title": title,
                    }
                    yield Document(page_content=text, metadata=metadata)
--- a/libs/community/langchain_community/document_loaders/notebook.py
+++ b/libs/community/langchain_community/document_loaders/notebook.py
@ -1,7 +1,7 @@
 """Loads .ipynb notebook files."""
 import json
 from pathlib import Path
-from typing import Any, List
+from typing import Any, List, Union

 from langchain_core.documents import Document

@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):

    def __init__(
        self,
-        path: str,
+        path: Union[str, Path],
        include_outputs: bool = False,
        max_output_length: int = 10,
        remove_newline: bool = False,
--- a/libs/community/langchain_community/document_loaders/notion.py
+++ b/libs/community/langchain_community/document_loaders/notion.py
@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List
+from typing import List, Union

 from langchain_core.documents import Document

@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class NotionDirectoryLoader(BaseLoader):
    """Load `Notion directory` dump."""

-    def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
+    def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
        """Initialize with a file path."""
        self.file_path = path
        self.encoding = encoding
--- a/libs/community/langchain_community/document_loaders/obsidian.py
+++ b/libs/community/langchain_community/document_loaders/obsidian.py
@ -2,7 +2,7 @@ import functools
 import logging
 import re
 from pathlib import Path
-from typing import Any, Dict, Iterator
+from typing import Any, Dict, Iterator, Union

 import yaml
 from langchain_core.documents import Document
@ -23,7 +23,10 @@ class ObsidianLoader(BaseLoader):
    DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE)

    def __init__(
-        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
+        self,
+        path: Union[str, Path],
+        encoding: str = "UTF-8",
+        collect_metadata: bool = True,
    ):
        """Initialize with a path.

--- a/libs/community/langchain_community/document_loaders/odt.py
+++ b/libs/community/langchain_community/document_loaders/odt.py
@ -1,4 +1,5 @@
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """

--- a/libs/community/langchain_community/document_loaders/org_mode.py
+++ b/libs/community/langchain_community/document_loaders/org_mode.py
@ -1,4 +1,5 @@
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """

--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -80,14 +80,14 @@ class BasePDFLoader(BaseLoader, ABC):
        clean up the temporary file after completion.
    """

-    def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
+    def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
        """Initialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a web path.
        """
-        self.file_path = file_path
+        self.file_path = str(file_path)
        self.web_path = None
        self.headers = headers
        if "~" in self.file_path:
@ -226,7 +226,7 @@ class PyPDFDirectoryLoader(BaseLoader):

    def __init__(
        self,
-        path: str,
+        path: Union[str, Path],
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
--- a/libs/community/langchain_community/document_loaders/python.py
+++ b/libs/community/langchain_community/document_loaders/python.py
@ -1,4 +1,6 @@
 import tokenize
+from pathlib import Path
+from typing import Union

 from langchain_community.document_loaders.text import TextLoader

@ -6,7 +8,7 @@ from langchain_community.document_loaders.text import TextLoader
 class PythonLoader(TextLoader):
    """Load `Python` files, respecting any non-default encoding if specified."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path.

        Args:
--- a/libs/community/langchain_community/document_loaders/roam.py
+++ b/libs/community/langchain_community/document_loaders/roam.py
@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List
+from typing import List, Union

 from langchain_core.documents import Document

@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class RoamLoader(BaseLoader):
    """Load `Roam` files from a directory."""

-    def __init__(self, path: str):
+    def __init__(self, path: Union[str, Path]):
        """Initialize with a path."""
        self.file_path = path

--- a/libs/community/langchain_community/document_loaders/rst.py
+++ b/libs/community/langchain_community/document_loaders/rst.py
@ -1,5 +1,6 @@
 """Loads RST files."""
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """
        Initialize with a file path.
--- a/libs/community/langchain_community/document_loaders/rtf.py
+++ b/libs/community/langchain_community/document_loaders/rtf.py
@ -1,5 +1,6 @@
 """Loads rich text files."""
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """
        Initialize with a file path.
--- a/libs/community/langchain_community/document_loaders/slack_directory.py
+++ b/libs/community/langchain_community/document_loaders/slack_directory.py
@ -1,7 +1,7 @@
 import json
 import zipfile
 from pathlib import Path
-from typing import Dict, Iterator, List, Optional
+from typing import Dict, Iterator, List, Optional, Union

 from langchain_core.documents import Document

@ -11,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class SlackDirectoryLoader(BaseLoader):
    """Load from a `Slack` directory dump."""

-    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
+    def __init__(self, zip_path: Union[str, Path], workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.

        Args:
--- a/libs/community/langchain_community/document_loaders/srt.py
+++ b/libs/community/langchain_community/document_loaders/srt.py
@ -1,4 +1,5 @@
-from typing import List
+from pathlib import Path
+from typing import List, Union

 from langchain_core.documents import Document

@ -8,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
 class SRTLoader(BaseLoader):
    """Load `.srt` (subtitle) files."""

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with a file path."""
        try:
            import pysrt  # noqa:F401
@ -16,7 +17,7 @@ class SRTLoader(BaseLoader):
            raise ImportError(
                "package `pysrt` not found, please install it with `pip install pysrt`"
            )
-        self.file_path = file_path
+        self.file_path = str(file_path)

    def load(self) -> List[Document]:
        """Load using pysrt file."""
--- a/libs/community/langchain_community/document_loaders/telegram.py
+++ b/libs/community/langchain_community/document_loaders/telegram.py
@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
 class TelegramChatFileLoader(BaseLoader):
    """Load from `Telegram chat` dump."""

-    def __init__(self, path: str):
+    def __init__(self, path: Union[str, Path]):
        """Initialize with a path."""
        self.file_path = path

--- a/libs/community/langchain_community/document_loaders/text.py
+++ b/libs/community/langchain_community/document_loaders/text.py
@ -1,5 +1,6 @@
 import logging
-from typing import Iterator, Optional
+from pathlib import Path
+from typing import Iterator, Optional, Union

 from langchain_core.documents import Document

@ -25,7 +26,7 @@ class TextLoader(BaseLoader):

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, Path],
        encoding: Optional[str] = None,
        autodetect_encoding: bool = False,
    ):
@ -56,5 +57,5 @@ class TextLoader(BaseLoader):
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

-        metadata = {"source": self.file_path}
+        metadata = {"source": str(self.file_path)}
        yield Document(page_content=text, metadata=metadata)
--- a/libs/community/langchain_community/document_loaders/tsv.py
+++ b/libs/community/langchain_community/document_loaders/tsv.py
@ -1,4 +1,5 @@
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -26,7 +27,10 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        validate_unstructured_version(min_unstructured_version="0.7.6")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
--- a/libs/community/langchain_community/document_loaders/unstructured.py
+++ b/libs/community/langchain_community/document_loaders/unstructured.py
@ -1,6 +1,7 @@
 """Loader that uses unstructured to load files."""
 import collections
 from abc import ABC, abstractmethod
+from pathlib import Path
 from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union

 from langchain_core.documents import Document
@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):

    def __init__(
        self,
-        file_path: Union[str, List[str]],
+        file_path: Union[str, List[str], Path, List[Path]],
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
        if isinstance(self.file_path, list):
            elements = []
            for file in self.file_path:
+                if isinstance(file, Path):
+                    file = str(file)
                elements.extend(partition(filename=file, **self.unstructured_kwargs))
            return elements
        else:
+            if isinstance(self.file_path, Path):
+                self.file_path = str(self.file_path)
            return partition(filename=self.file_path, **self.unstructured_kwargs)

    def _get_metadata(self) -> dict:
@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):


 def get_elements_from_api(
-    file_path: Union[str, List[str], None] = None,
+    file_path: Union[str, List[str], Path, List[Path], None] = None,
    file: Union[IO, Sequence[IO], None] = None,
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
    **unstructured_kwargs: Any,
 ) -> List:
    """Retrieve a list of elements from the `Unstructured API`."""
-    if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
+    if is_list := isinstance(file_path, list):
+        file_path = [str(path) for path in file_path]
+    if isinstance(file, collections.abc.Sequence) or is_list:
        from unstructured.partition.api import partition_multiple_via_api

        _doc_elements = partition_multiple_via_api(
@ -206,7 +213,7 @@ def get_elements_from_api(
        from unstructured.partition.api import partition_via_api

        return partition_via_api(
-            filename=file_path,
+            filename=str(file_path),
            file=file,
            api_key=api_key,
            api_url=api_url,
--- a/libs/community/langchain_community/document_loaders/vsdx.py
+++ b/libs/community/langchain_community/document_loaders/vsdx.py
@ -1,7 +1,8 @@
 import os
 import tempfile
 from abc import ABC
-from typing import List
+from pathlib import Path
+from typing import List, Union
 from urllib.parse import urlparse

 import requests
@ -13,9 +14,9 @@ from langchain_community.document_loaders.parsers import VsdxParser


 class VsdxLoader(BaseLoader, ABC):
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with file path."""
-        self.file_path = file_path
+        self.file_path = str(file_path)
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

--- a/libs/community/langchain_community/document_loaders/word_document.py
+++ b/libs/community/langchain_community/document_loaders/word_document.py
@ -2,7 +2,8 @@
 import os
 import tempfile
 from abc import ABC
-from typing import List
+from pathlib import Path
+from typing import List, Union
 from urllib.parse import urlparse

 import requests
@ -19,9 +20,9 @@ class Docx2txtLoader(BaseLoader, ABC):
    to a temporary file, and use that, then clean up the temporary file after completion
    """

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
        """Initialize with file path."""
-        self.file_path = file_path
+        self.file_path = str(file_path)
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

--- a/libs/community/langchain_community/document_loaders/xml.py
+++ b/libs/community/langchain_community/document_loaders/xml.py
@ -1,5 +1,6 @@
 """Loads Microsoft Excel files."""
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union

 from langchain_community.document_loaders.unstructured import (
    UnstructuredFileLoader,
@ -32,8 +33,12 @@ class UnstructuredXMLLoader(UnstructuredFileLoader):
    """

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
+        file_path = str(file_path)
        validate_unstructured_version(min_unstructured_version="0.6.7")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)