diff --git a/libs/community/langchain_community/document_loaders/acreom.py b/libs/community/langchain_community/document_loaders/acreom.py index b6eccfeac0..89aff5d173 100644 --- a/libs/community/langchain_community/document_loaders/acreom.py +++ b/libs/community/langchain_community/document_loaders/acreom.py @@ -1,6 +1,6 @@ import re from pathlib import Path -from typing import Iterator +from typing import Iterator, Union from langchain_core.documents import Document @@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader): """Regex to match front matter metadata in markdown files.""" def __init__( - self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True + self, + path: Union[str, Path], + encoding: str = "UTF-8", + collect_metadata: bool = True, ): """Initialize the loader.""" self.file_path = path diff --git a/libs/community/langchain_community/document_loaders/airbyte_json.py b/libs/community/langchain_community/document_loaders/airbyte_json.py index b3a0e2fc0c..aeb4b43cab 100644 --- a/libs/community/langchain_community/document_loaders/airbyte_json.py +++ b/libs/community/langchain_community/document_loaders/airbyte_json.py @@ -1,5 +1,6 @@ import json -from typing import List +from pathlib import Path +from typing import List, Union from langchain_core.documents import Document from langchain_core.utils import stringify_dict @@ -10,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader class AirbyteJSONLoader(BaseLoader): """Load local `Airbyte` json files.""" - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with a file path. This should start with '/tmp/airbyte_local/'.""" self.file_path = file_path """Path to the directory containing the json files.""" @@ -20,5 +21,5 @@ class AirbyteJSONLoader(BaseLoader): for line in open(self.file_path, "r"): data = json.loads(line)["_airbyte_data"] text += stringify_dict(data) - metadata = {"source": self.file_path} + metadata = {"source": str(self.file_path)} return [Document(page_content=text, metadata=metadata)] diff --git a/libs/community/langchain_community/document_loaders/assemblyai.py b/libs/community/langchain_community/document_loaders/assemblyai.py index cc74c552e2..32ec943bdf 100644 --- a/libs/community/langchain_community/document_loaders/assemblyai.py +++ b/libs/community/langchain_community/document_loaders/assemblyai.py @@ -1,7 +1,8 @@ from __future__ import annotations from enum import Enum -from typing import TYPE_CHECKING, Iterator, Optional +from pathlib import Path +from typing import TYPE_CHECKING, Iterator, Optional, Union import requests from langchain_core.documents import Document @@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader): def __init__( self, - file_path: str, + file_path: Union[str, Path], *, transcript_format: TranscriptFormat = TranscriptFormat.TEXT, config: Optional[assemblyai.TranscriptionConfig] = None, @@ -71,7 +72,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader): if api_key is not None: assemblyai.settings.api_key = api_key - self.file_path = file_path + self.file_path = str(file_path) self.transcript_format = transcript_format self.transcriber = assemblyai.Transcriber(config=config) diff --git a/libs/community/langchain_community/document_loaders/conllu.py b/libs/community/langchain_community/document_loaders/conllu.py index 989eec61a5..fa43af653a 100644 --- a/libs/community/langchain_community/document_loaders/conllu.py +++ b/libs/community/langchain_community/document_loaders/conllu.py @@ -1,5 +1,6 @@ import csv -from typing import List +from pathlib import Path +from typing import List, Union from langchain_core.documents import Document @@ -9,7 +10,7 @@ from langchain_community.document_loaders.base import BaseLoader class CoNLLULoader(BaseLoader): """Load `CoNLL-U` files.""" - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with a file path.""" self.file_path = file_path @@ -29,5 +30,5 @@ class CoNLLULoader(BaseLoader): else: text += line[1] + " " - metadata = {"source": self.file_path} + metadata = {"source": str(self.file_path)} return [Document(page_content=text, metadata=metadata)] diff --git a/libs/community/langchain_community/document_loaders/csv_loader.py b/libs/community/langchain_community/document_loaders/csv_loader.py index df207d9dd2..fca2f1f0f9 100644 --- a/libs/community/langchain_community/document_loaders/csv_loader.py +++ b/libs/community/langchain_community/document_loaders/csv_loader.py @@ -1,6 +1,7 @@ import csv from io import TextIOWrapper -from typing import Any, Dict, Iterator, List, Optional, Sequence +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Sequence, Union from langchain_core.documents import Document @@ -35,7 +36,7 @@ class CSVLoader(BaseLoader): def __init__( self, - file_path: str, + file_path: Union[str, Path], source_column: Optional[str] = None, metadata_columns: Sequence[str] = (), csv_args: Optional[Dict] = None, @@ -89,7 +90,7 @@ class CSVLoader(BaseLoader): source = ( row[self.source_column] if self.source_column is not None - else self.file_path + else str(self.file_path) ) except KeyError: raise ValueError( diff --git a/libs/community/langchain_community/document_loaders/email.py b/libs/community/langchain_community/document_loaders/email.py index 66b460e95a..832593839e 100644 --- a/libs/community/langchain_community/document_loaders/email.py +++ b/libs/community/langchain_community/document_loaders/email.py @@ -1,5 +1,6 @@ import os -from typing import Any, Iterator, List +from pathlib import Path +from typing import Any, Iterator, List, Union from langchain_core.documents import Document @@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): process_attachments = unstructured_kwargs.get("process_attachments") attachment_partitioner = unstructured_kwargs.get("attachment_partitioner") @@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader): https://github.com/TeamMsgExtractor/msg-extractor """ - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with a file path. Args: file_path: The path to the Outlook Message file. """ - self.file_path = file_path + self.file_path = str(file_path) if not os.path.isfile(self.file_path): - raise ValueError("File path %s is not a valid file" % self.file_path) + raise ValueError(f"File path {self.file_path} is not a valid file") try: import extract_msg # noqa:F401 diff --git a/libs/community/langchain_community/document_loaders/evernote.py b/libs/community/langchain_community/document_loaders/evernote.py index 88a3efa8b1..07ade0ce5d 100644 --- a/libs/community/langchain_community/document_loaders/evernote.py +++ b/libs/community/langchain_community/document_loaders/evernote.py @@ -5,8 +5,9 @@ https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c import hashlib import logging from base64 import b64decode +from pathlib import Path from time import strptime -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterator, List, Optional, Union from langchain_core.documents import Document @@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader): the 'source' which contains the file name of the export. """ # noqa: E501 - def __init__(self, file_path: str, load_single_document: bool = True): + def __init__(self, file_path: Union[str, Path], load_single_document: bool = True): """Initialize with file path.""" - self.file_path = file_path + self.file_path = str(file_path) self.load_single_document = load_single_document def _lazy_load(self) -> Iterator[Document]: diff --git a/libs/community/langchain_community/document_loaders/excel.py b/libs/community/langchain_community/document_loaders/excel.py index f1724280d1..3aa31cfe86 100644 --- a/libs/community/langchain_community/document_loaders/excel.py +++ b/libs/community/langchain_community/document_loaders/excel.py @@ -1,5 +1,6 @@ """Loads Microsoft Excel files.""" -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): """ diff --git a/libs/community/langchain_community/document_loaders/facebook_chat.py b/libs/community/langchain_community/document_loaders/facebook_chat.py index 1122b460c9..443c9ce07f 100644 --- a/libs/community/langchain_community/document_loaders/facebook_chat.py +++ b/libs/community/langchain_community/document_loaders/facebook_chat.py @@ -1,7 +1,7 @@ import datetime import json from pathlib import Path -from typing import Iterator +from typing import Iterator, Union from langchain_core.documents import Document @@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str: class FacebookChatLoader(BaseLoader): """Load `Facebook Chat` messages directory dump.""" - def __init__(self, path: str): + def __init__(self, path: Union[str, Path]): """Initialize with a path.""" self.file_path = path diff --git a/libs/community/langchain_community/document_loaders/helpers.py b/libs/community/langchain_community/document_loaders/helpers.py index 6e0f8b9bfb..e094db6872 100644 --- a/libs/community/langchain_community/document_loaders/helpers.py +++ b/libs/community/langchain_community/document_loaders/helpers.py @@ -1,7 +1,8 @@ """Document loader helpers.""" import concurrent.futures -from typing import List, NamedTuple, Optional, cast +from pathlib import Path +from typing import List, NamedTuple, Optional, Union, cast class FileEncoding(NamedTuple): @@ -15,7 +16,9 @@ class FileEncoding(NamedTuple): """The language of the file.""" -def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]: +def detect_file_encodings( + file_path: Union[str, Path], timeout: int = 5 +) -> List[FileEncoding]: """Try to detect the file encoding. Returns a list of `FileEncoding` tuples with the detected encodings ordered @@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding """ import chardet + file_path = str(file_path) + def read_and_detect(file_path: str) -> List[dict]: with open(file_path, "rb") as f: rawdata = f.read() diff --git a/libs/community/langchain_community/document_loaders/html_bs.py b/libs/community/langchain_community/document_loaders/html_bs.py index 09b7489dda..3ab1820cc3 100644 --- a/libs/community/langchain_community/document_loaders/html_bs.py +++ b/libs/community/langchain_community/document_loaders/html_bs.py @@ -1,4 +1,5 @@ import logging +from pathlib import Path from typing import Dict, Iterator, Union from langchain_core.documents import Document @@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader): def __init__( self, - file_path: str, + file_path: Union[str, Path], open_encoding: Union[str, None] = None, bs_kwargs: Union[dict, None] = None, get_text_separator: str = "", @@ -57,7 +58,7 @@ class BSHTMLLoader(BaseLoader): title = "" metadata: Dict[str, Union[str, None]] = { - "source": self.file_path, + "source": str(self.file_path), "title": title, } yield Document(page_content=text, metadata=metadata) diff --git a/libs/community/langchain_community/document_loaders/image_captions.py b/libs/community/langchain_community/document_loaders/image_captions.py index 93dce16432..568fa0a25a 100644 --- a/libs/community/langchain_community/document_loaders/image_captions.py +++ b/libs/community/langchain_community/document_loaders/image_captions.py @@ -1,4 +1,5 @@ from io import BytesIO +from pathlib import Path from typing import Any, List, Tuple, Union import requests @@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader): def __init__( self, - images: Union[str, bytes, List[Union[str, bytes]]], + images: Union[str, Path, bytes, List[Union[str, bytes, Path]]], blip_processor: str = "Salesforce/blip-image-captioning-base", blip_model: str = "Salesforce/blip-image-captioning-base", ): @@ -29,7 +30,7 @@ class ImageCaptionLoader(BaseLoader): blip_processor: The name of the pre-trained BLIP processor. blip_model: The name of the pre-trained BLIP model. """ - if isinstance(images, (str, bytes)): + if isinstance(images, (str, Path, bytes)): self.images = [images] else: self.images = images @@ -61,7 +62,7 @@ class ImageCaptionLoader(BaseLoader): return results def _get_captions_and_metadata( - self, model: Any, processor: Any, image: Union[str, bytes] + self, model: Any, processor: Any, image: Union[str, Path, bytes] ) -> Tuple[str, dict]: """Helper function for getting the captions and metadata of an image.""" try: @@ -76,7 +77,9 @@ class ImageCaptionLoader(BaseLoader): try: if isinstance(image, bytes): image = Image.open(BytesIO(image)).convert("RGB") - elif image.startswith("http://") or image.startswith("https://"): + elif isinstance(image, str) and ( + image.startswith("http://") or image.startswith("https://") + ): image = Image.open(requests.get(image, stream=True).raw).convert("RGB") else: image = Image.open(image).convert("RGB") @@ -94,6 +97,6 @@ class ImageCaptionLoader(BaseLoader): if isinstance(image_source, bytes): metadata: dict = {"image_source": "Image bytes provided"} else: - metadata = {"image_path": image_source} + metadata = {"image_path": str(image_source)} return caption, metadata diff --git a/libs/community/langchain_community/document_loaders/mhtml.py b/libs/community/langchain_community/document_loaders/mhtml.py index 8652ed9e14..95edc76d32 100644 --- a/libs/community/langchain_community/document_loaders/mhtml.py +++ b/libs/community/langchain_community/document_loaders/mhtml.py @@ -1,5 +1,6 @@ import email import logging +from pathlib import Path from typing import Dict, Iterator, Union from langchain_core.documents import Document @@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader): def __init__( self, - file_path: str, + file_path: Union[str, Path], open_encoding: Union[str, None] = None, bs_kwargs: Union[dict, None] = None, get_text_separator: str = "", @@ -69,7 +70,7 @@ class MHTMLLoader(BaseLoader): title = "" metadata: Dict[str, Union[str, None]] = { - "source": self.file_path, + "source": str(self.file_path), "title": title, } yield Document(page_content=text, metadata=metadata) diff --git a/libs/community/langchain_community/document_loaders/notebook.py b/libs/community/langchain_community/document_loaders/notebook.py index 51eec597a5..aa3e1c38a3 100644 --- a/libs/community/langchain_community/document_loaders/notebook.py +++ b/libs/community/langchain_community/document_loaders/notebook.py @@ -1,7 +1,7 @@ """Loads .ipynb notebook files.""" import json from pathlib import Path -from typing import Any, List +from typing import Any, List, Union from langchain_core.documents import Document @@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader): def __init__( self, - path: str, + path: Union[str, Path], include_outputs: bool = False, max_output_length: int = 10, remove_newline: bool = False, diff --git a/libs/community/langchain_community/document_loaders/notion.py b/libs/community/langchain_community/document_loaders/notion.py index c42bf568f3..ed01891c44 100644 --- a/libs/community/langchain_community/document_loaders/notion.py +++ b/libs/community/langchain_community/document_loaders/notion.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List +from typing import List, Union from langchain_core.documents import Document @@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader class NotionDirectoryLoader(BaseLoader): """Load `Notion directory` dump.""" - def __init__(self, path: str, *, encoding: str = "utf-8") -> None: + def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None: """Initialize with a file path.""" self.file_path = path self.encoding = encoding diff --git a/libs/community/langchain_community/document_loaders/obsidian.py b/libs/community/langchain_community/document_loaders/obsidian.py index bcd2659d65..ed5ce974cb 100644 --- a/libs/community/langchain_community/document_loaders/obsidian.py +++ b/libs/community/langchain_community/document_loaders/obsidian.py @@ -2,7 +2,7 @@ import functools import logging import re from pathlib import Path -from typing import Any, Dict, Iterator +from typing import Any, Dict, Iterator, Union import yaml from langchain_core.documents import Document @@ -23,7 +23,10 @@ class ObsidianLoader(BaseLoader): DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE) def __init__( - self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True + self, + path: Union[str, Path], + encoding: str = "UTF-8", + collect_metadata: bool = True, ): """Initialize with a path. diff --git a/libs/community/langchain_community/document_loaders/odt.py b/libs/community/langchain_community/document_loaders/odt.py index 6d2cc3474e..171cede5ca 100644 --- a/libs/community/langchain_community/document_loaders/odt.py +++ b/libs/community/langchain_community/document_loaders/odt.py @@ -1,4 +1,5 @@ -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -31,7 +32,10 @@ class UnstructuredODTLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): """ diff --git a/libs/community/langchain_community/document_loaders/org_mode.py b/libs/community/langchain_community/document_loaders/org_mode.py index e926e6f628..56331e2db0 100644 --- a/libs/community/langchain_community/document_loaders/org_mode.py +++ b/libs/community/langchain_community/document_loaders/org_mode.py @@ -1,4 +1,5 @@ -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -31,7 +32,10 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): """ diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 8522f491a1..d4a636c69b 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -80,14 +80,14 @@ class BasePDFLoader(BaseLoader, ABC): clean up the temporary file after completion. """ - def __init__(self, file_path: str, *, headers: Optional[Dict] = None): + def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None): """Initialize with a file path. Args: file_path: Either a local, S3 or web path to a PDF file. headers: Headers to use for GET request to download a file from a web path. """ - self.file_path = file_path + self.file_path = str(file_path) self.web_path = None self.headers = headers if "~" in self.file_path: @@ -226,7 +226,7 @@ class PyPDFDirectoryLoader(BaseLoader): def __init__( self, - path: str, + path: Union[str, Path], glob: str = "**/[!.]*.pdf", silent_errors: bool = False, load_hidden: bool = False, diff --git a/libs/community/langchain_community/document_loaders/python.py b/libs/community/langchain_community/document_loaders/python.py index 9afbbd30f7..74c6ecbcf4 100644 --- a/libs/community/langchain_community/document_loaders/python.py +++ b/libs/community/langchain_community/document_loaders/python.py @@ -1,4 +1,6 @@ import tokenize +from pathlib import Path +from typing import Union from langchain_community.document_loaders.text import TextLoader @@ -6,7 +8,7 @@ from langchain_community.document_loaders.text import TextLoader class PythonLoader(TextLoader): """Load `Python` files, respecting any non-default encoding if specified.""" - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with a file path. Args: diff --git a/libs/community/langchain_community/document_loaders/roam.py b/libs/community/langchain_community/document_loaders/roam.py index a21b827a1d..cfd431b187 100644 --- a/libs/community/langchain_community/document_loaders/roam.py +++ b/libs/community/langchain_community/document_loaders/roam.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List +from typing import List, Union from langchain_core.documents import Document @@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader class RoamLoader(BaseLoader): """Load `Roam` files from a directory.""" - def __init__(self, path: str): + def __init__(self, path: Union[str, Path]): """Initialize with a path.""" self.file_path = path diff --git a/libs/community/langchain_community/document_loaders/rst.py b/libs/community/langchain_community/document_loaders/rst.py index 103b24414e..ffac8f645e 100644 --- a/libs/community/langchain_community/document_loaders/rst.py +++ b/libs/community/langchain_community/document_loaders/rst.py @@ -1,5 +1,6 @@ """Loads RST files.""" -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -32,7 +33,10 @@ class UnstructuredRSTLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): """ Initialize with a file path. diff --git a/libs/community/langchain_community/document_loaders/rtf.py b/libs/community/langchain_community/document_loaders/rtf.py index 3fe2731684..a018f43a6a 100644 --- a/libs/community/langchain_community/document_loaders/rtf.py +++ b/libs/community/langchain_community/document_loaders/rtf.py @@ -1,5 +1,6 @@ """Loads rich text files.""" -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -32,7 +33,10 @@ class UnstructuredRTFLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): """ Initialize with a file path. diff --git a/libs/community/langchain_community/document_loaders/slack_directory.py b/libs/community/langchain_community/document_loaders/slack_directory.py index cbd0173ea4..1fdce62033 100644 --- a/libs/community/langchain_community/document_loaders/slack_directory.py +++ b/libs/community/langchain_community/document_loaders/slack_directory.py @@ -1,7 +1,7 @@ import json import zipfile from pathlib import Path -from typing import Dict, Iterator, List, Optional +from typing import Dict, Iterator, List, Optional, Union from langchain_core.documents import Document @@ -11,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader class SlackDirectoryLoader(BaseLoader): """Load from a `Slack` directory dump.""" - def __init__(self, zip_path: str, workspace_url: Optional[str] = None): + def __init__(self, zip_path: Union[str, Path], workspace_url: Optional[str] = None): """Initialize the SlackDirectoryLoader. Args: diff --git a/libs/community/langchain_community/document_loaders/srt.py b/libs/community/langchain_community/document_loaders/srt.py index 32acecc41c..4a6f499370 100644 --- a/libs/community/langchain_community/document_loaders/srt.py +++ b/libs/community/langchain_community/document_loaders/srt.py @@ -1,4 +1,5 @@ -from typing import List +from pathlib import Path +from typing import List, Union from langchain_core.documents import Document @@ -8,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader class SRTLoader(BaseLoader): """Load `.srt` (subtitle) files.""" - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with a file path.""" try: import pysrt # noqa:F401 @@ -16,7 +17,7 @@ class SRTLoader(BaseLoader): raise ImportError( "package `pysrt` not found, please install it with `pip install pysrt`" ) - self.file_path = file_path + self.file_path = str(file_path) def load(self) -> List[Document]: """Load using pysrt file.""" diff --git a/libs/community/langchain_community/document_loaders/telegram.py b/libs/community/langchain_community/document_loaders/telegram.py index 50955b73fc..f955b491c2 100644 --- a/libs/community/langchain_community/document_loaders/telegram.py +++ b/libs/community/langchain_community/document_loaders/telegram.py @@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str: class TelegramChatFileLoader(BaseLoader): """Load from `Telegram chat` dump.""" - def __init__(self, path: str): + def __init__(self, path: Union[str, Path]): """Initialize with a path.""" self.file_path = path diff --git a/libs/community/langchain_community/document_loaders/text.py b/libs/community/langchain_community/document_loaders/text.py index 9409e86b44..a17216dfff 100644 --- a/libs/community/langchain_community/document_loaders/text.py +++ b/libs/community/langchain_community/document_loaders/text.py @@ -1,5 +1,6 @@ import logging -from typing import Iterator, Optional +from pathlib import Path +from typing import Iterator, Optional, Union from langchain_core.documents import Document @@ -25,7 +26,7 @@ class TextLoader(BaseLoader): def __init__( self, - file_path: str, + file_path: Union[str, Path], encoding: Optional[str] = None, autodetect_encoding: bool = False, ): @@ -56,5 +57,5 @@ class TextLoader(BaseLoader): except Exception as e: raise RuntimeError(f"Error loading {self.file_path}") from e - metadata = {"source": self.file_path} + metadata = {"source": str(self.file_path)} yield Document(page_content=text, metadata=metadata) diff --git a/libs/community/langchain_community/document_loaders/tsv.py b/libs/community/langchain_community/document_loaders/tsv.py index 9bd4b4c2ed..7a06b67282 100644 --- a/libs/community/langchain_community/document_loaders/tsv.py +++ b/libs/community/langchain_community/document_loaders/tsv.py @@ -1,4 +1,5 @@ -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -26,7 +27,10 @@ class UnstructuredTSVLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): validate_unstructured_version(min_unstructured_version="0.7.6") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/unstructured.py b/libs/community/langchain_community/document_loaders/unstructured.py index 2878be2575..22df465589 100644 --- a/libs/community/langchain_community/document_loaders/unstructured.py +++ b/libs/community/langchain_community/document_loaders/unstructured.py @@ -1,6 +1,7 @@ """Loader that uses unstructured to load files.""" import collections from abc import ABC, abstractmethod +from pathlib import Path from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union from langchain_core.documents import Document @@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): def __init__( self, - file_path: Union[str, List[str]], + file_path: Union[str, List[str], Path, List[Path]], mode: str = "single", **unstructured_kwargs: Any, ): @@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): if isinstance(self.file_path, list): elements = [] for file in self.file_path: + if isinstance(file, Path): + file = str(file) elements.extend(partition(filename=file, **self.unstructured_kwargs)) return elements else: + if isinstance(self.file_path, Path): + self.file_path = str(self.file_path) return partition(filename=self.file_path, **self.unstructured_kwargs) def _get_metadata(self) -> dict: @@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): def get_elements_from_api( - file_path: Union[str, List[str], None] = None, + file_path: Union[str, List[str], Path, List[Path], None] = None, file: Union[IO, Sequence[IO], None] = None, api_url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", **unstructured_kwargs: Any, ) -> List: """Retrieve a list of elements from the `Unstructured API`.""" - if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list): + if is_list := isinstance(file_path, list): + file_path = [str(path) for path in file_path] + if isinstance(file, collections.abc.Sequence) or is_list: from unstructured.partition.api import partition_multiple_via_api _doc_elements = partition_multiple_via_api( @@ -206,7 +213,7 @@ def get_elements_from_api( from unstructured.partition.api import partition_via_api return partition_via_api( - filename=file_path, + filename=str(file_path), file=file, api_key=api_key, api_url=api_url, diff --git a/libs/community/langchain_community/document_loaders/vsdx.py b/libs/community/langchain_community/document_loaders/vsdx.py index e0929e4019..5546d5db4d 100644 --- a/libs/community/langchain_community/document_loaders/vsdx.py +++ b/libs/community/langchain_community/document_loaders/vsdx.py @@ -1,7 +1,8 @@ import os import tempfile from abc import ABC -from typing import List +from pathlib import Path +from typing import List, Union from urllib.parse import urlparse import requests @@ -13,9 +14,9 @@ from langchain_community.document_loaders.parsers import VsdxParser class VsdxLoader(BaseLoader, ABC): - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with file path.""" - self.file_path = file_path + self.file_path = str(file_path) if "~" in self.file_path: self.file_path = os.path.expanduser(self.file_path) diff --git a/libs/community/langchain_community/document_loaders/word_document.py b/libs/community/langchain_community/document_loaders/word_document.py index efbd12559e..c5e0dc1a9c 100644 --- a/libs/community/langchain_community/document_loaders/word_document.py +++ b/libs/community/langchain_community/document_loaders/word_document.py @@ -2,7 +2,8 @@ import os import tempfile from abc import ABC -from typing import List +from pathlib import Path +from typing import List, Union from urllib.parse import urlparse import requests @@ -19,9 +20,9 @@ class Docx2txtLoader(BaseLoader, ABC): to a temporary file, and use that, then clean up the temporary file after completion """ - def __init__(self, file_path: str): + def __init__(self, file_path: Union[str, Path]): """Initialize with file path.""" - self.file_path = file_path + self.file_path = str(file_path) if "~" in self.file_path: self.file_path = os.path.expanduser(self.file_path) diff --git a/libs/community/langchain_community/document_loaders/xml.py b/libs/community/langchain_community/document_loaders/xml.py index 1e1262de03..d57d6582e9 100644 --- a/libs/community/langchain_community/document_loaders/xml.py +++ b/libs/community/langchain_community/document_loaders/xml.py @@ -1,5 +1,6 @@ """Loads Microsoft Excel files.""" -from typing import Any, List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, @@ -32,8 +33,12 @@ class UnstructuredXMLLoader(UnstructuredFileLoader): """ def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, ): + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.6.7") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)