community: better support of pathlib paths in document loaders (#18396)

This arose from the problem raised in
https://github.com/langchain-ai/langchain/pull/18397: document
loaders not supporting `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support Pathlib,
the argument is immediately converted to `str`
- if `self.file_path` is used in a way that allows it to stay a
`pathlib.Path` without conversion, it is only converted for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
pull/19590/head
mwmajewsk 3 months ago committed by GitHub
parent 94b869a974
commit f7a1fd91b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,6 +1,6 @@
import re
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union
from langchain_core.documents import Document
@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
"""Regex to match front matter metadata in markdown files."""
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
self,
path: Union[str, Path],
encoding: str = "UTF-8",
collect_metadata: bool = True,
):
"""Initialize the loader."""
self.file_path = path

@ -1,5 +1,6 @@
import json
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
from langchain_core.utils import stringify_dict
@ -10,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
class AirbyteJSONLoader(BaseLoader):
"""Load local `Airbyte` json files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
self.file_path = file_path
"""Path to the directory containing the json files."""
@ -20,5 +21,5 @@ class AirbyteJSONLoader(BaseLoader):
for line in open(self.file_path, "r"):
data = json.loads(line)["_airbyte_data"]
text += stringify_dict(data)
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]

@ -1,7 +1,8 @@
from __future__ import annotations
from enum import Enum
from typing import TYPE_CHECKING, Iterator, Optional
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union
import requests
from langchain_core.documents import Document
@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
*,
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
config: Optional[assemblyai.TranscriptionConfig] = None,
@ -71,7 +72,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
if api_key is not None:
assemblyai.settings.api_key = api_key
self.file_path = file_path
self.file_path = str(file_path)
self.transcript_format = transcript_format
self.transcriber = assemblyai.Transcriber(config=config)

@ -1,5 +1,6 @@
import csv
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +10,7 @@ from langchain_community.document_loaders.base import BaseLoader
class CoNLLULoader(BaseLoader):
"""Load `CoNLL-U` files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path."""
self.file_path = file_path
@ -29,5 +30,5 @@ class CoNLLULoader(BaseLoader):
else:
text += line[1] + " "
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]

@ -1,6 +1,7 @@
import csv
from io import TextIOWrapper
from typing import Any, Dict, Iterator, List, Optional, Sequence
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
source_column: Optional[str] = None,
metadata_columns: Sequence[str] = (),
csv_args: Optional[Dict] = None,
@ -89,7 +90,7 @@ class CSVLoader(BaseLoader):
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
else str(self.file_path)
)
except KeyError:
raise ValueError(

@ -1,5 +1,6 @@
import os
from typing import Any, Iterator, List
from pathlib import Path
from typing import Any, Iterator, List, Union
from langchain_core.documents import Document
@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
process_attachments = unstructured_kwargs.get("process_attachments")
attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
https://github.com/TeamMsgExtractor/msg-extractor
"""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path.
Args:
file_path: The path to the Outlook Message file.
"""
self.file_path = file_path
self.file_path = str(file_path)
if not os.path.isfile(self.file_path):
raise ValueError("File path %s is not a valid file" % self.file_path)
raise ValueError(f"File path {self.file_path} is not a valid file")
try:
import extract_msg # noqa:F401

@ -5,8 +5,9 @@ https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
the 'source' which contains the file name of the export.
""" # noqa: E501
def __init__(self, file_path: str, load_single_document: bool = True):
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
self.load_single_document = load_single_document
def _lazy_load(self) -> Iterator[Document]:

@ -1,5 +1,6 @@
"""Loads Microsoft Excel files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

@ -1,7 +1,7 @@
import datetime
import json
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union
from langchain_core.documents import Document
@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
class FacebookChatLoader(BaseLoader):
"""Load `Facebook Chat` messages directory dump."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

@ -1,7 +1,8 @@
"""Document loader helpers."""
import concurrent.futures
from typing import List, NamedTuple, Optional, cast
from pathlib import Path
from typing import List, NamedTuple, Optional, Union, cast
class FileEncoding(NamedTuple):
@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
"""The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
def detect_file_encodings(
file_path: Union[str, Path], timeout: int = 5
) -> List[FileEncoding]:
"""Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered
@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
"""
import chardet
file_path = str(file_path)
def read_and_detect(file_path: str) -> List[dict]:
with open(file_path, "rb") as f:
rawdata = f.read()

@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
@ -57,7 +58,7 @@ class BSHTMLLoader(BaseLoader):
title = ""
metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)

@ -1,4 +1,5 @@
from io import BytesIO
from pathlib import Path
from typing import Any, List, Tuple, Union
import requests
@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):
def __init__(
self,
images: Union[str, bytes, List[Union[str, bytes]]],
images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
blip_processor: str = "Salesforce/blip-image-captioning-base",
blip_model: str = "Salesforce/blip-image-captioning-base",
):
@ -29,7 +30,7 @@ class ImageCaptionLoader(BaseLoader):
blip_processor: The name of the pre-trained BLIP processor.
blip_model: The name of the pre-trained BLIP model.
"""
if isinstance(images, (str, bytes)):
if isinstance(images, (str, Path, bytes)):
self.images = [images]
else:
self.images = images
@ -61,7 +62,7 @@ class ImageCaptionLoader(BaseLoader):
return results
def _get_captions_and_metadata(
self, model: Any, processor: Any, image: Union[str, bytes]
self, model: Any, processor: Any, image: Union[str, Path, bytes]
) -> Tuple[str, dict]:
"""Helper function for getting the captions and metadata of an image."""
try:
@ -76,7 +77,9 @@ class ImageCaptionLoader(BaseLoader):
try:
if isinstance(image, bytes):
image = Image.open(BytesIO(image)).convert("RGB")
elif image.startswith("http://") or image.startswith("https://"):
elif isinstance(image, str) and (
image.startswith("http://") or image.startswith("https://")
):
image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
else:
image = Image.open(image).convert("RGB")
@ -94,6 +97,6 @@ class ImageCaptionLoader(BaseLoader):
if isinstance(image_source, bytes):
metadata: dict = {"image_source": "Image bytes provided"}
else:
metadata = {"image_path": image_source}
metadata = {"image_path": str(image_source)}
return caption, metadata

@ -1,5 +1,6 @@
import email
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
@ -69,7 +70,7 @@ class MHTMLLoader(BaseLoader):
title = ""
metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)

@ -1,7 +1,7 @@
"""Loads .ipynb notebook files."""
import json
from pathlib import Path
from typing import Any, List
from typing import Any, List, Union
from langchain_core.documents import Document
@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):
def __init__(
self,
path: str,
path: Union[str, Path],
include_outputs: bool = False,
max_output_length: int = 10,
remove_newline: bool = False,

@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class NotionDirectoryLoader(BaseLoader):
"""Load `Notion directory` dump."""
def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
"""Initialize with a file path."""
self.file_path = path
self.encoding = encoding

@ -2,7 +2,7 @@ import functools
import logging
import re
from pathlib import Path
from typing import Any, Dict, Iterator
from typing import Any, Dict, Iterator, Union
import yaml
from langchain_core.documents import Document
@ -23,7 +23,10 @@ class ObsidianLoader(BaseLoader):
DATAVIEW_INLINE_PAREN_REGEX = re.compile(r"\((\w+)::\s*(.*)\)", re.MULTILINE)
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
self,
path: Union[str, Path],
encoding: str = "UTF-8",
collect_metadata: bool = True,
):
"""Initialize with a path.

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -31,7 +32,10 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""

@ -80,14 +80,14 @@ class BasePDFLoader(BaseLoader, ABC):
clean up the temporary file after completion.
"""
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
"""Initialize with a file path.
Args:
file_path: Either a local, S3 or web path to a PDF file.
headers: Headers to use for GET request to download a file from a web path.
"""
self.file_path = file_path
self.file_path = str(file_path)
self.web_path = None
self.headers = headers
if "~" in self.file_path:
@ -226,7 +226,7 @@ class PyPDFDirectoryLoader(BaseLoader):
def __init__(
self,
path: str,
path: Union[str, Path],
glob: str = "**/[!.]*.pdf",
silent_errors: bool = False,
load_hidden: bool = False,

@ -1,4 +1,6 @@
import tokenize
from pathlib import Path
from typing import Union
from langchain_community.document_loaders.text import TextLoader
@ -6,7 +8,7 @@ from langchain_community.document_loaders.text import TextLoader
class PythonLoader(TextLoader):
"""Load `Python` files, respecting any non-default encoding if specified."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path.
Args:

@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Union
from langchain_core.documents import Document
@ -9,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class RoamLoader(BaseLoader):
"""Load `Roam` files from a directory."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

@ -1,5 +1,6 @@
"""Loads RST files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Initialize with a file path.

@ -1,5 +1,6 @@
"""Loads rich text files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,7 +33,10 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Initialize with a file path.

@ -1,7 +1,7 @@
import json
import zipfile
from pathlib import Path
from typing import Dict, Iterator, List, Optional
from typing import Dict, Iterator, List, Optional, Union
from langchain_core.documents import Document
@ -11,7 +11,7 @@ from langchain_community.document_loaders.base import BaseLoader
class SlackDirectoryLoader(BaseLoader):
"""Load from a `Slack` directory dump."""
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
def __init__(self, zip_path: Union[str, Path], workspace_url: Optional[str] = None):
"""Initialize the SlackDirectoryLoader.
Args:

@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import List, Union
from langchain_core.documents import Document
@ -8,7 +9,7 @@ from langchain_community.document_loaders.base import BaseLoader
class SRTLoader(BaseLoader):
"""Load `.srt` (subtitle) files."""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path."""
try:
import pysrt # noqa:F401
@ -16,7 +17,7 @@ class SRTLoader(BaseLoader):
raise ImportError(
"package `pysrt` not found, please install it with `pip install pysrt`"
)
self.file_path = file_path
self.file_path = str(file_path)
def load(self) -> List[Document]:
"""Load using pysrt file."""

@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
class TelegramChatFileLoader(BaseLoader):
"""Load from `Telegram chat` dump."""
def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

@ -1,5 +1,6 @@
import logging
from typing import Iterator, Optional
from pathlib import Path
from typing import Iterator, Optional, Union
from langchain_core.documents import Document
@ -25,7 +26,7 @@ class TextLoader(BaseLoader):
def __init__(
self,
file_path: str,
file_path: Union[str, Path],
encoding: Optional[str] = None,
autodetect_encoding: bool = False,
):
@ -56,5 +57,5 @@ class TextLoader(BaseLoader):
except Exception as e:
raise RuntimeError(f"Error loading {self.file_path}") from e
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
yield Document(page_content=text, metadata=metadata)

@ -1,4 +1,5 @@
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -26,7 +27,10 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
validate_unstructured_version(min_unstructured_version="0.7.6")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

@ -1,6 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document
@ -155,7 +156,7 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def __init__(
self,
file_path: Union[str, List[str]],
file_path: Union[str, List[str], Path, List[Path]],
mode: str = "single",
**unstructured_kwargs: Any,
):
@ -169,9 +170,13 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
if isinstance(self.file_path, list):
elements = []
for file in self.file_path:
if isinstance(file, Path):
file = str(file)
elements.extend(partition(filename=file, **self.unstructured_kwargs))
return elements
else:
if isinstance(self.file_path, Path):
self.file_path = str(self.file_path)
return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
@ -179,14 +184,16 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
def get_elements_from_api(
file_path: Union[str, List[str], None] = None,
file_path: Union[str, List[str], Path, List[Path], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
"""Retrieve a list of elements from the `Unstructured API`."""
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
if is_list := isinstance(file_path, list):
file_path = [str(path) for path in file_path]
if isinstance(file, collections.abc.Sequence) or is_list:
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
@ -206,7 +213,7 @@ def get_elements_from_api(
from unstructured.partition.api import partition_via_api
return partition_via_api(
filename=file_path,
filename=str(file_path),
file=file,
api_key=api_key,
api_url=api_url,

@ -1,7 +1,8 @@
import os
import tempfile
from abc import ABC
from typing import List
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse
import requests
@ -13,9 +14,9 @@ from langchain_community.document_loaders.parsers import VsdxParser
class VsdxLoader(BaseLoader, ABC):
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)

@ -2,7 +2,8 @@
import os
import tempfile
from abc import ABC
from typing import List
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse
import requests
@ -19,9 +20,9 @@ class Docx2txtLoader(BaseLoader, ABC):
to a temporary file, and use that, then clean up the temporary file after completion
"""
def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)

@ -1,5 +1,6 @@
"""Loads Microsoft Excel files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
@ -32,8 +33,12 @@ class UnstructuredXMLLoader(UnstructuredFileLoader):
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

Loading…
Cancel
Save