docstrings: document_loaders consistency 2 (#9148)

This is Part 2. See #9139 (Part 1).
This commit is contained in:
Leonid Ganeline 2023-08-11 16:25:40 -07:00 committed by GitHub
parent 1b58460fe3
commit 19f504790e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
72 changed files with 114 additions and 144 deletions

View File

@ -1,4 +1,3 @@
"""Loads HuggingFace datasets."""
from typing import Iterator, List, Mapping, Optional, Sequence, Union from typing import Iterator, List, Mapping, Optional, Sequence, Union
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
class HuggingFaceDatasetLoader(BaseLoader): class HuggingFaceDatasetLoader(BaseLoader):
"""Load Documents from the Hugging Face Hub.""" """Load from `Hugging Face Hub` datasets."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Loads iFixit data."""
from typing import List, Optional from typing import List, Optional
import requests import requests
@ -11,7 +10,7 @@ IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
class IFixitLoader(BaseLoader): class IFixitLoader(BaseLoader):
"""Load iFixit repair guides, device wikis and answers. """Load `iFixit` repair guides, device wikis and answers.
iFixit is the largest, open repair community on the web. The site contains nearly iFixit is the largest, open repair community on the web. The site contains nearly
100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is 100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is

View File

@ -1,11 +1,10 @@
"""Loads image files."""
from typing import List from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredImageLoader(UnstructuredFileLoader): class UnstructuredImageLoader(UnstructuredFileLoader):
"""Loader that uses Unstructured to load PNG and JPG files. """Load `PNG` and `JPG` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single

View File

@ -1,9 +1,3 @@
"""Loads image captions.
By default, the loader utilizes the pre-trained BLIP image captioning model.
https://huggingface.co/Salesforce/blip-image-captioning-base
"""
from typing import Any, List, Tuple, Union from typing import Any, List, Tuple, Union
import requests import requests
@ -13,7 +7,12 @@ from langchain.document_loaders.base import BaseLoader
class ImageCaptionLoader(BaseLoader): class ImageCaptionLoader(BaseLoader):
"""Loads the captions of an image""" """Load image captions.
By default, the loader utilizes the pre-trained
Salesforce BLIP image captioning model.
https://huggingface.co/Salesforce/blip-image-captioning-base
"""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Loads IMSDb."""
from typing import List from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class IMSDbLoader(WebBaseLoader): class IMSDbLoader(WebBaseLoader):
"""Loads IMSDb webpages.""" """Load `IMSDb` webpages."""
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load webpage.""" """Load webpage."""

View File

@ -1,4 +1,3 @@
"""Loader that fetches data from IUGU"""
import json import json
import urllib.request import urllib.request
from typing import List, Optional from typing import List, Optional
@ -17,7 +16,7 @@ IUGU_ENDPOINTS = {
class IuguLoader(BaseLoader): class IuguLoader(BaseLoader):
"""Loader that fetches data from IUGU.""" """Load from `IUGU`."""
def __init__(self, resource: str, api_token: Optional[str] = None) -> None: def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
"""Initialize the IUGU resource. """Initialize the IUGU resource.

View File

@ -11,8 +11,7 @@ LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}"
class JoplinLoader(BaseLoader): class JoplinLoader(BaseLoader):
""" """Load notes from `Joplin`.
Loader that fetches notes from Joplin.
In order to use this loader, you need to have Joplin running with the In order to use this loader, you need to have Joplin running with the
Web Clipper enabled (look for "Web Clipper" in the app settings). Web Clipper enabled (look for "Web Clipper" in the app settings).

View File

@ -1,4 +1,3 @@
"""Loads data from JSON."""
import json import json
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union from typing import Any, Callable, Dict, List, Optional, Union
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
class JSONLoader(BaseLoader): class JSONLoader(BaseLoader):
"""Loads a JSON file using a jq schema. """Load a `JSON` file using a `jq` schema.
Example: Example:
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text

View File

@ -1,4 +1,3 @@
"""Loads LarkSuite (FeiShu) document json dump."""
import json import json
import urllib.request import urllib.request
from typing import Any, Iterator, List from typing import Any, Iterator, List
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
class LarkSuiteDocLoader(BaseLoader): class LarkSuiteDocLoader(BaseLoader):
"""Loads LarkSuite (FeiShu) document.""" """Load from `LarkSuite` (`FeiShu`)."""
def __init__(self, domain: str, access_token: str, document_id: str): def __init__(self, domain: str, access_token: str, document_id: str):
"""Initialize with domain, access_token (tenant / user), and document_id. """Initialize with domain, access_token (tenant / user), and document_id.

View File

@ -1,11 +1,10 @@
"""Loads Markdown files."""
from typing import List from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredMarkdownLoader(UnstructuredFileLoader): class UnstructuredMarkdownLoader(UnstructuredFileLoader):
"""Loader that uses Unstructured to load markdown files. """Load `Markdown` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single

View File

@ -1,4 +1,3 @@
"""Mastodon document loader."""
from __future__ import annotations from __future__ import annotations
import os import os
@ -23,7 +22,7 @@ def _dependable_mastodon_import() -> mastodon:
class MastodonTootsLoader(BaseLoader): class MastodonTootsLoader(BaseLoader):
"""Mastodon toots loader.""" """Load the `Mastodon` 'toots'."""
def __init__( def __init__(
self, self,

View File

@ -8,7 +8,7 @@ from langchain.utilities.max_compute import MaxComputeAPIWrapper
class MaxComputeLoader(BaseLoader): class MaxComputeLoader(BaseLoader):
"""Loads a query result from Alibaba Cloud MaxCompute table into documents.""" """Load from `Alibaba Cloud MaxCompute` table."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Load Data from a MediaWiki dump xml."""
import logging import logging
from pathlib import Path from pathlib import Path
from typing import List, Optional, Sequence, Union from typing import List, Optional, Sequence, Union
@ -10,8 +9,8 @@ logger = logging.getLogger(__name__)
class MWDumpLoader(BaseLoader): class MWDumpLoader(BaseLoader):
""" """Load `MediaWiki` dump from an `XML` file.
Load MediaWiki dump from XML file
Example: Example:
.. code-block:: python .. code-block:: python

View File

@ -1,5 +1,3 @@
"""Load MHTML files, enriching metadata with page title."""
import email import email
import logging import logging
from typing import Dict, List, Union from typing import Dict, List, Union
@ -11,7 +9,7 @@ logger = logging.getLogger(__name__)
class MHTMLLoader(BaseLoader): class MHTMLLoader(BaseLoader):
"""Loader that uses beautiful soup to parse HTML files.""" """Parse `MHTML` files with `BeautifulSoup`."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Loader that fetches data from Modern Treasury"""
import json import json
import urllib.request import urllib.request
from base64 import b64encode from base64 import b64encode
@ -27,7 +26,7 @@ incoming_payment_details",
class ModernTreasuryLoader(BaseLoader): class ModernTreasuryLoader(BaseLoader):
"""Loader that fetches data from Modern Treasury.""" """Load from `Modern Treasury`."""
def __init__( def __init__(
self, self,

View File

@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
class NewsURLLoader(BaseLoader): class NewsURLLoader(BaseLoader):
"""Loader that uses newspaper to load news articles from URLs. """Load news articles from URLs using `newspaper`.
Args: Args:
urls: URLs to load. Each is loaded into its own document. urls: URLs to load. Each is loaded into its own document.

View File

@ -70,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
class NotebookLoader(BaseLoader): class NotebookLoader(BaseLoader):
"""Loads .ipynb notebook files.""" """Load `Jupyter notebook` (.ipynb) files."""
def __init__( def __init__(
self, self,
@ -80,7 +80,7 @@ class NotebookLoader(BaseLoader):
remove_newline: bool = False, remove_newline: bool = False,
traceback: bool = False, traceback: bool = False,
): ):
"""Initialize with path. """Initialize with a path.
Args: Args:
path: The path to load the notebook from. path: The path to load the notebook from.

View File

@ -1,4 +1,3 @@
"""Loads Notion directory dump."""
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class NotionDirectoryLoader(BaseLoader): class NotionDirectoryLoader(BaseLoader):
"""Loads Notion directory dump.""" """Load `Notion directory` dump."""
def __init__(self, path: str): def __init__(self, path: str):
"""Initialize with a file path.""" """Initialize with a file path."""

View File

@ -1,5 +1,3 @@
"""Notion DB loader for langchain"""
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import requests import requests
@ -14,7 +12,7 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
class NotionDBLoader(BaseLoader): class NotionDBLoader(BaseLoader):
"""Notion DB Loader. """Load from `Notion DB`.
Reads content from pages within a Notion Database. Reads content from pages within a Notion Database.
Args: Args:

View File

@ -1,4 +1,3 @@
"""Extract text from any file type."""
import json import json
import uuid import uuid
from typing import List from typing import List
@ -9,7 +8,7 @@ from langchain.tools.nuclia.tool import NucliaUnderstandingAPI
class NucliaLoader(BaseLoader): class NucliaLoader(BaseLoader):
"""Extract text from any file type.""" """Load from any file type using `Nuclia Understanding API`."""
def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI): def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI):
self.nua = nuclia_tool self.nua = nuclia_tool

View File

@ -7,7 +7,7 @@ from langchain.document_loaders.obs_file import OBSFileLoader
class OBSDirectoryLoader(BaseLoader): class OBSDirectoryLoader(BaseLoader):
"""Loading logic for loading documents from Huawei OBS.""" """Load from `Huawei OBS directory`."""
def __init__( def __init__(
self, self,

View File

@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class OBSFileLoader(BaseLoader): class OBSFileLoader(BaseLoader):
"""Loader for Huawei OBS file.""" """Load from the `Huawei OBS file`."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Loads Obsidian directory dump."""
import re import re
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
class ObsidianLoader(BaseLoader): class ObsidianLoader(BaseLoader):
"""Loads Obsidian files from disk.""" """Load `Obsidian` files from directory."""
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL) FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)

View File

@ -1,4 +1,3 @@
"""Loads OpenOffice ODT files."""
from typing import Any, List from typing import Any, List
from langchain.document_loaders.unstructured import ( from langchain.document_loaders.unstructured import (
@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
class UnstructuredODTLoader(UnstructuredFileLoader): class UnstructuredODTLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load OpenOffice ODT files. """Load `OpenOffice ODT` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured

View File

@ -60,7 +60,7 @@ class _SupportedFileTypes(BaseModel):
class OneDriveLoader(BaseLoader, BaseModel): class OneDriveLoader(BaseLoader, BaseModel):
"""Loads data from OneDrive.""" """Load from `Microsoft OneDrive`."""
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings) settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
""" The settings for the OneDrive API client.""" """ The settings for the OneDrive API client."""

View File

@ -16,7 +16,7 @@ CHUNK_SIZE = 1024 * 1024 * 5
class OneDriveFileLoader(BaseLoader, BaseModel): class OneDriveFileLoader(BaseLoader, BaseModel):
"""Loads a file from OneDrive.""" """Load a file from `Microsoft OneDrive`."""
file: File = Field(...) file: File = Field(...)
"""The file to load.""" """The file to load."""

View File

@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
class OpenCityDataLoader(BaseLoader): class OpenCityDataLoader(BaseLoader):
"""Loads Open City data.""" """Load from `Open City`."""
def __init__(self, city_id: str, dataset_id: str, limit: int): def __init__(self, city_id: str, dataset_id: str, limit: int):
"""Initialize with dataset_id. """Initialize with dataset_id.

View File

@ -1,4 +1,3 @@
"""Loads Org-Mode files."""
from typing import Any, List from typing import Any, List
from langchain.document_loaders.unstructured import ( from langchain.document_loaders.unstructured import (
@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
class UnstructuredOrgModeLoader(UnstructuredFileLoader): class UnstructuredOrgModeLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load Org-Mode files. """Load `Org-Mode` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured

View File

@ -1,4 +1,3 @@
"""Loads PDF files."""
import json import json
import logging import logging
import os import os
@ -30,7 +29,8 @@ logger = logging.getLogger(__file__)
class UnstructuredPDFLoader(UnstructuredFileLoader): class UnstructuredPDFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PDF files. """Load `PDF` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured
@ -59,7 +59,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
class BasePDFLoader(BaseLoader, ABC): class BasePDFLoader(BaseLoader, ABC):
"""Base loader class for PDF files. """Base Loader class for `PDF` files.
Defaults to check for local file, but if the file is a web path, it will download it Defaults to check for local file, but if the file is a web path, it will download it
to a temporary file, use it, then clean up the temporary file after completion to a temporary file, use it, then clean up the temporary file after completion
@ -122,7 +122,7 @@ class BasePDFLoader(BaseLoader, ABC):
class OnlinePDFLoader(BasePDFLoader): class OnlinePDFLoader(BasePDFLoader):
"""Loads online PDFs.""" """Load online `PDF`."""
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
@ -131,7 +131,7 @@ class OnlinePDFLoader(BasePDFLoader):
class PyPDFLoader(BasePDFLoader): class PyPDFLoader(BasePDFLoader):
"""Loads a PDF with pypdf and chunks at character level. """Load `PDF` using `pypdf` and chunks at character level.
Loader also stores page numbers in metadata. Loader also stores page numbers in metadata.
""" """
@ -162,7 +162,7 @@ class PyPDFLoader(BasePDFLoader):
class PyPDFium2Loader(BasePDFLoader): class PyPDFium2Loader(BasePDFLoader):
"""Loads a PDF with pypdfium2 and chunks at character level.""" """Load `PDF` using `pypdfium2` and chunks at character level."""
def __init__(self, file_path: str): def __init__(self, file_path: str):
"""Initialize with a file path.""" """Initialize with a file path."""
@ -182,7 +182,7 @@ class PyPDFium2Loader(BasePDFLoader):
class PyPDFDirectoryLoader(BaseLoader): class PyPDFDirectoryLoader(BaseLoader):
"""Loads a directory with PDF files with pypdf and chunks at character level. """Load a directory with `PDF` files using `pypdf` and chunks at character level.
Loader also stores page numbers in metadata. Loader also stores page numbers in metadata.
""" """
@ -227,7 +227,7 @@ class PyPDFDirectoryLoader(BaseLoader):
class PDFMinerLoader(BasePDFLoader): class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files.""" """Load `PDF` files using `PDFMiner`."""
def __init__(self, file_path: str) -> None: def __init__(self, file_path: str) -> None:
"""Initialize with file path.""" """Initialize with file path."""
@ -255,7 +255,7 @@ class PDFMinerLoader(BasePDFLoader):
class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files as HTML content.""" """Load `PDF` files as HTML content using `PDFMiner`."""
def __init__(self, file_path: str): def __init__(self, file_path: str):
"""Initialize with a file path.""" """Initialize with a file path."""
@ -289,7 +289,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
class PyMuPDFLoader(BasePDFLoader): class PyMuPDFLoader(BasePDFLoader):
"""Loader that uses PyMuPDF to load PDF files.""" """Load `PDF` files using `PyMuPDF`."""
def __init__(self, file_path: str) -> None: def __init__(self, file_path: str) -> None:
"""Initialize with a file path.""" """Initialize with a file path."""
@ -314,7 +314,7 @@ class PyMuPDFLoader(BasePDFLoader):
# MathpixPDFLoader implementation taken largely from Daniel Gross's: # MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21 # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader): class MathpixPDFLoader(BasePDFLoader):
"""This class uses Mathpix service to load PDF files.""" """Load `PDF` files using `Mathpix` service."""
def __init__( def __init__(
self, self,
@ -433,7 +433,7 @@ class MathpixPDFLoader(BasePDFLoader):
class PDFPlumberLoader(BasePDFLoader): class PDFPlumberLoader(BasePDFLoader):
"""Loader that uses pdfplumber to load PDF files.""" """Load `PDF` files using `pdfplumber`."""
def __init__( def __init__(
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):
class AmazonTextractPDFLoader(BasePDFLoader): class AmazonTextractPDFLoader(BasePDFLoader):
"""Loads a PDF document from local file system, HTTP or S3. """Load `PDF` files from a local file system, HTTP or S3.
To authenticate, the AWS client uses the following methods to To authenticate, the AWS client uses the following methods to
automatically load credentials: automatically load credentials:

View File

@ -1,4 +1,3 @@
"""Loads PowerPoint files."""
import os import os
from typing import List from typing import List
@ -6,7 +5,8 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredPowerPointLoader(UnstructuredFileLoader): class UnstructuredPowerPointLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PowerPoint files. """Load `Microsoft PowerPoint` files using `Unstructured`.
Works with both .ppt and .pptx files. Works with both .ppt and .pptx files.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single

View File

@ -1,4 +1,3 @@
"""Loads documents from Psychic.dev."""
from typing import List, Optional from typing import List, Optional
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
class PsychicLoader(BaseLoader): class PsychicLoader(BaseLoader):
"""Loads documents from Psychic.dev.""" """Load from `Psychic.dev`."""
def __init__( def __init__(
self, api_key: str, account_id: str, connector_id: Optional[str] = None self, api_key: str, account_id: str, connector_id: Optional[str] = None

View File

@ -6,7 +6,7 @@ from langchain.utilities.pubmed import PubMedAPIWrapper
class PubMedLoader(BaseLoader): class PubMedLoader(BaseLoader):
"""Loads a query result from PubMed biomedical library into a list of Documents. """Load from the `PubMed` biomedical library.
Attributes: Attributes:
query: The query to be passed to the PubMed API. query: The query to be passed to the PubMed API.

View File

@ -1,4 +1,3 @@
"""Load from a Spark Dataframe object"""
import itertools import itertools
import logging import logging
import sys import sys
@ -14,7 +13,7 @@ if TYPE_CHECKING:
class PySparkDataFrameLoader(BaseLoader): class PySparkDataFrameLoader(BaseLoader):
"""Load PySpark DataFrames""" """Load `PySpark` DataFrames."""
def __init__( def __init__(
self, self,

View File

@ -4,9 +4,7 @@ from langchain.document_loaders.text import TextLoader
class PythonLoader(TextLoader): class PythonLoader(TextLoader):
""" """Load `Python` files, respecting any non-default encoding if specified."""
Load Python files, respecting any non-default encoding if specified.
"""
def __init__(self, file_path: str): def __init__(self, file_path: str):
"""Initialize with a file path. """Initialize with a file path.

View File

@ -1,4 +1,3 @@
"""Loads ReadTheDocs documentation directory dump."""
from pathlib import Path from pathlib import Path
from typing import Any, List, Optional, Tuple, Union from typing import Any, List, Optional, Tuple, Union
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class ReadTheDocsLoader(BaseLoader): class ReadTheDocsLoader(BaseLoader):
"""Loads ReadTheDocs documentation directory dump.""" """Load `ReadTheDocs` documentation directory."""
def __init__( def __init__(
self, self,

View File

@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
class RecursiveUrlLoader(BaseLoader): class RecursiveUrlLoader(BaseLoader):
"""Loads all child links from a given url.""" """Load all child links from a URL page."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Reddit document loader."""
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
@ -21,7 +20,8 @@ def _dependable_praw_import() -> praw:
class RedditPostsLoader(BaseLoader): class RedditPostsLoader(BaseLoader):
"""Reddit posts loader. """Load `Reddit` posts.
Read posts on a subreddit. Read posts on a subreddit.
First, you need to go to First, you need to go to
https://www.reddit.com/prefs/apps/ https://www.reddit.com/prefs/apps/

View File

@ -1,4 +1,3 @@
"""Loads Roam directory dump."""
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class RoamLoader(BaseLoader): class RoamLoader(BaseLoader):
"""Loads Roam files from disk.""" """Load `Roam` files from a directory."""
def __init__(self, path: str): def __init__(self, path: str):
"""Initialize with a path.""" """Initialize with a path."""

View File

@ -17,7 +17,7 @@ class ColumnNotFoundError(Exception):
class RocksetLoader(BaseLoader): class RocksetLoader(BaseLoader):
"""Wrapper around Rockset db """Load from a `Rockset` database.
To use, you should have the `rockset` python package installed. To use, you should have the `rockset` python package installed.

View File

@ -1,4 +1,3 @@
"""Loader that uses unstructured to load HTML files."""
import logging import logging
from typing import Any, Iterator, List, Optional, Sequence from typing import Any, Iterator, List, Optional, Sequence
@ -10,7 +9,7 @@ logger = logging.getLogger(__name__)
class RSSFeedLoader(BaseLoader): class RSSFeedLoader(BaseLoader):
"""Loader that uses newspaper to load news articles from RSS feeds. """Load news articles from `RSS` feeds using `newspaper`.
Args: Args:
urls: URLs for RSS feeds to load. Each article in the feed is loaded into its own document. urls: URLs for RSS feeds to load. Each article in the feed is loaded into its own document.

View File

@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
class UnstructuredRSTLoader(UnstructuredFileLoader): class UnstructuredRSTLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load RST files. """Load `RST` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured

View File

@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
class UnstructuredRTFLoader(UnstructuredFileLoader): class UnstructuredRTFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load RTF files. """Load `RTF` files using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured

View File

@ -1,4 +1,3 @@
"""Loading logic for loading documents from an AWS S3 directory."""
from typing import List from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -7,7 +6,7 @@ from langchain.document_loaders.s3_file import S3FileLoader
class S3DirectoryLoader(BaseLoader): class S3DirectoryLoader(BaseLoader):
"""Loading logic for loading documents from an AWS S3.""" """Load from `Amazon AWS S3` directory."""
def __init__(self, bucket: str, prefix: str = ""): def __init__(self, bucket: str, prefix: str = ""):
"""Initialize with bucket and key name. """Initialize with bucket and key name.

View File

@ -1,4 +1,3 @@
"""Loading logic for loading documents from an AWS S3 file."""
import os import os
import tempfile import tempfile
from typing import List from typing import List
@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class S3FileLoader(BaseLoader): class S3FileLoader(BaseLoader):
"""Loading logic for loading documents from an AWS S3 file.""" """Load from `Amazon AWS S3` file."""
def __init__(self, bucket: str, key: str): def __init__(self, bucket: str, key: str):
"""Initialize with bucket and key name. """Initialize with bucket and key name.

View File

@ -1,4 +1,3 @@
"""Loader that fetches a sitemap and loads those URLs."""
import itertools import itertools
import re import re
from typing import Any, Callable, Generator, Iterable, List, Optional from typing import Any, Callable, Generator, Iterable, List, Optional
@ -22,7 +21,7 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N
class SitemapLoader(WebBaseLoader): class SitemapLoader(WebBaseLoader):
"""Loader that fetches a sitemap and loads those URLs.""" """Load a sitemap and its URLs."""
def __init__( def __init__(
self, self,

View File

@ -1,4 +1,3 @@
"""Loader for documents from a Slack export."""
import json import json
import zipfile import zipfile
from pathlib import Path from pathlib import Path
@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
class SlackDirectoryLoader(BaseLoader): class SlackDirectoryLoader(BaseLoader):
"""Loads documents from a Slack directory dump.""" """Load from a `Slack` directory dump."""
def __init__(self, zip_path: str, workspace_url: Optional[str] = None): def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
"""Initialize the SlackDirectoryLoader. """Initialize the SlackDirectoryLoader.

View File

@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
class SnowflakeLoader(BaseLoader): class SnowflakeLoader(BaseLoader):
"""Loads a query result from Snowflake into a list of documents. """Load from `Snowflake` API.
Each document represents one row of the result. The `page_content_columns` Each document represents one row of the result. The `page_content_columns`
are written into the `page_content` of the document. The `metadata_columns` are written into the `page_content` of the document. The `metadata_columns`

View File

@ -1,4 +1,3 @@
"""Loader that fetches data from Spreedly API."""
import json import json
import urllib.request import urllib.request
from typing import List from typing import List
@ -20,7 +19,7 @@ SPREEDLY_ENDPOINTS = {
class SpreedlyLoader(BaseLoader): class SpreedlyLoader(BaseLoader):
"""Loader that fetches data from Spreedly API.""" """Load from `Spreedly` API."""
def __init__(self, access_token: str, resource: str) -> None: def __init__(self, access_token: str, resource: str) -> None:
"""Initialize with an access token and a resource. """Initialize with an access token and a resource.

View File

@ -1,4 +1,3 @@
"""Loader for .srt (subtitle) files."""
from typing import List from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
class SRTLoader(BaseLoader): class SRTLoader(BaseLoader):
"""Loader for .srt (subtitle) files.""" """Load `.srt` (subtitle) files."""
def __init__(self, file_path: str): def __init__(self, file_path: str):
"""Initialize with a file path.""" """Initialize with a file path."""

View File

@ -1,4 +1,3 @@
"""Loader that fetches data from Stripe"""
import json import json
import urllib.request import urllib.request
from typing import List, Optional from typing import List, Optional
@ -18,7 +17,7 @@ STRIPE_ENDPOINTS = {
class StripeLoader(BaseLoader): class StripeLoader(BaseLoader):
"""Loader that fetches data from Stripe.""" """Load from `Stripe` API."""
def __init__(self, resource: str, access_token: Optional[str] = None) -> None: def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
"""Initialize with a resource and an access token. """Initialize with a resource and an access token.

View File

@ -1,4 +1,3 @@
"""Loads Telegram chat json dump."""
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
@ -24,7 +23,7 @@ def concatenate_rows(row: dict) -> str:
class TelegramChatFileLoader(BaseLoader): class TelegramChatFileLoader(BaseLoader):
"""Loads Telegram chat json directory dump.""" """Load from `Telegram chat` dump."""
def __init__(self, path: str): def __init__(self, path: str):
"""Initialize with a path.""" """Initialize with a path."""

View File

@ -1,4 +1,3 @@
"""Loading logic for loading documents from Tencent Cloud COS directory."""
from typing import Any, Iterator, List from typing import Any, Iterator, List
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -7,7 +6,7 @@ from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader
class TencentCOSDirectoryLoader(BaseLoader): class TencentCOSDirectoryLoader(BaseLoader):
"""Loader for Tencent Cloud COS directory.""" """Load from `Tencent Cloud COS` directory."""
def __init__(self, conf: Any, bucket: str, prefix: str = ""): def __init__(self, conf: Any, bucket: str, prefix: str = ""):
"""Initialize with COS config, bucket and prefix. """Initialize with COS config, bucket and prefix.

View File

@ -1,4 +1,3 @@
"""Loading logic for loading documents from Tencent Cloud COS file."""
import os import os
import tempfile import tempfile
from typing import Any, Iterator, List from typing import Any, Iterator, List
@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class TencentCOSFileLoader(BaseLoader): class TencentCOSFileLoader(BaseLoader):
"""Loader for Tencent Cloud COS file.""" """Load from `Tencent Cloud COS` file."""
def __init__(self, conf: Any, bucket: str, key: str): def __init__(self, conf: Any, bucket: str, key: str):
"""Initialize with COS config, bucket and key name. """Initialize with COS config, bucket and key name.

View File

@ -6,7 +6,7 @@ from langchain.utilities.tensorflow_datasets import TensorflowDatasets
class TensorflowDatasetLoader(BaseLoader): class TensorflowDatasetLoader(BaseLoader):
"""Loads from TensorFlow Datasets into a list of Documents. """Load from `TensorFlow Dataset`.
Attributes: Attributes:
dataset_name: the name of the dataset to load dataset_name: the name of the dataset to load

View File

@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
class TextLoader(BaseLoader): class TextLoader(BaseLoader):
"""Load text files. """Load text file.
Args: Args:

View File

@ -1,4 +1,3 @@
"""Loads HTML to markdown using 2markdown."""
from __future__ import annotations from __future__ import annotations
from typing import Iterator, List from typing import Iterator, List
@ -10,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
class ToMarkdownLoader(BaseLoader): class ToMarkdownLoader(BaseLoader):
"""Loads HTML to markdown using 2markdown.""" """Load `HTML` using `2markdown API`."""
def __init__(self, url: str, api_key: str): def __init__(self, url: str, api_key: str):
"""Initialize with url and api key.""" """Initialize with url and api key."""

View File

@ -7,11 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class TomlLoader(BaseLoader): class TomlLoader(BaseLoader):
""" """Load `TOML` files.
A TOML document loader that inherits from the BaseLoader class.
This class can be initialized with either a single source file or a source It can load a single source file or several files in a single
directory containing TOML files. directory.
""" """
def __init__(self, source: Union[str, Path]): def __init__(self, source: Union[str, Path]):

View File

@ -1,4 +1,3 @@
"""Loads cards from Trello"""
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
@ -12,7 +11,7 @@ if TYPE_CHECKING:
class TrelloLoader(BaseLoader): class TrelloLoader(BaseLoader):
"""Trello loader. Reads all cards from a Trello board.""" """Load cards from a `Trello` board."""
def __init__( def __init__(
self, self,

View File

@ -7,7 +7,9 @@ from langchain.document_loaders.unstructured import (
class UnstructuredTSVLoader(UnstructuredFileLoader): class UnstructuredTSVLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load TSV files. Like other """Load `TSV` files using `Unstructured`.
Like other
Unstructured loaders, UnstructuredTSVLoader can be used in both Unstructured loaders, UnstructuredTSVLoader can be used in both
"single" and "elements" mode. If you use the loader in "elements" "single" and "elements" mode. If you use the loader in "elements"
mode, the TSV file will be a single Unstructured Table element. mode, the TSV file will be a single Unstructured Table element.

View File

@ -1,4 +1,3 @@
"""Twitter document loader."""
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@ -22,8 +21,9 @@ def _dependable_tweepy_import() -> tweepy:
class TwitterTweetLoader(BaseLoader): class TwitterTweetLoader(BaseLoader):
"""Twitter tweets loader. """Load `Twitter` tweets.
Read tweets of user twitter handle.
Read tweets of the user's Twitter handle.
First you need to go to First you need to go to
`https://developer.twitter.com/en/docs/twitter-api `https://developer.twitter.com/en/docs/twitter-api

View File

@ -130,7 +130,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
class UnstructuredFileLoader(UnstructuredBaseLoader): class UnstructuredFileLoader(UnstructuredBaseLoader):
"""Loader that uses Unstructured to load files. """Load files using `Unstructured`.
The file loader uses the The file loader uses the
unstructured partition function and will automatically detect the file unstructured partition function and will automatically detect the file
@ -211,7 +211,7 @@ def get_elements_from_api(
class UnstructuredAPIFileLoader(UnstructuredFileLoader): class UnstructuredAPIFileLoader(UnstructuredFileLoader):
"""Loader that uses the Unstructured API to load files. """Load files using `Unstructured` API.
By default, the loader makes a call to the hosted Unstructured API. By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the If you are running the unstructured API locally, you can change the
@ -275,7 +275,7 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
class UnstructuredFileIOLoader(UnstructuredBaseLoader): class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Loader that uses Unstructured to load files. """Load files using `Unstructured`.
The file loader The file loader
uses the unstructured partition function and will automatically detect the file uses the unstructured partition function and will automatically detect the file
@ -322,7 +322,7 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
"""Loader that uses the Unstructured API to load files. """Load files using `Unstructured` API.
By default, the loader makes a call to the hosted Unstructured API. By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the If you are running the unstructured API locally, you can change the

View File

@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
class UnstructuredURLLoader(BaseLoader): class UnstructuredURLLoader(BaseLoader):
"""Loader that use Unstructured to load files from remote URLs. """Load files from remote URLs using `Unstructured`.
Use the unstructured partition function to detect the MIME type Use the unstructured partition function to detect the MIME type
and route the file to the appropriate partitioner. and route the file to the appropriate partitioner.

View File

@ -10,7 +10,8 @@ logger = logging.getLogger(__name__)
class PlaywrightURLLoader(BaseLoader): class PlaywrightURLLoader(BaseLoader):
"""Loader that uses Playwright and to load a page and unstructured to load the html. """Load `HTML` pages with `Playwright` and parse with `Unstructured`.
This is useful for loading pages that require javascript to render. This is useful for loading pages that require javascript to render.
Attributes: Attributes:

View File

@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
class SeleniumURLLoader(BaseLoader): class SeleniumURLLoader(BaseLoader):
"""Loader that uses Selenium and to load a page and unstructured to load the html. """Load `HTML` pages with `Selenium` and parse with `Unstructured`.
This is useful for loading pages that require javascript to render. This is useful for loading pages that require javascript to render.
Attributes: Attributes:

View File

@ -10,7 +10,7 @@ from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper
class WeatherDataLoader(BaseLoader): class WeatherDataLoader(BaseLoader):
"""Weather Reader. """Load weather data with `Open Weather Map` API.
Reads the forecast & current weather of any location using OpenWeatherMap's free Reads the forecast & current weather of any location using OpenWeatherMap's free
API. Check out 'https://openweathermap.org/appid' for more on how to generate a free API. Check out 'https://openweathermap.org/appid' for more on how to generate a free

View File

@ -37,7 +37,7 @@ def _build_metadata(soup: Any, url: str) -> dict:
class WebBaseLoader(BaseLoader): class WebBaseLoader(BaseLoader):
"""Loader that uses urllib and beautiful soup to load webpages.""" """Load HTML pages using `urllib` and parse them with `BeautifulSoup`."""
web_paths: List[str] web_paths: List[str]

View File

@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
class WhatsAppChatLoader(BaseLoader): class WhatsAppChatLoader(BaseLoader):
"""Loads WhatsApp messages text file.""" """Load `WhatsApp` messages text file."""
def __init__(self, path: str): def __init__(self, path: str):
"""Initialize with path.""" """Initialize with path."""

View File

@ -6,7 +6,8 @@ from langchain.utilities.wikipedia import WikipediaAPIWrapper
class WikipediaLoader(BaseLoader): class WikipediaLoader(BaseLoader):
"""Loads a query result from www.wikipedia.org into a list of Documents. """Load from `Wikipedia`.
The hard limit on the number of downloaded Documents is 300 for now. The hard limit on the number of downloaded Documents is 300 for now.
Each wiki page represents one Document. Each wiki page represents one Document.

View File

@ -13,7 +13,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class Docx2txtLoader(BaseLoader, ABC): class Docx2txtLoader(BaseLoader, ABC):
"""Loads a DOCX with docx2txt and chunks at character level. """Load `DOCX` file using `docx2txt` and chunk it at character level.
Defaults to check for local file, but if the file is a web path, it will download it Defaults to check for local file, but if the file is a web path, it will download it
to a temporary file, and use that, then clean up the temporary file after completion to a temporary file, and use that, then clean up the temporary file after completion
@ -65,7 +65,8 @@ class Docx2txtLoader(BaseLoader, ABC):
class UnstructuredWordDocumentLoader(UnstructuredFileLoader): class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load word documents. """Load `Microsoft Word` file using `Unstructured`.
Works with both .docx and .doc files. Works with both .docx and .doc files.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single

View File

@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
class UnstructuredXMLLoader(UnstructuredFileLoader): class UnstructuredXMLLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load XML files. """Load `XML` file using `Unstructured`.
You can run the loader in one of two modes: "single" and "elements". You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured langchain Document object. If you use "elements" mode, the unstructured

View File

@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
class XorbitsLoader(BaseLoader): class XorbitsLoader(BaseLoader):
"""Load Xorbits DataFrame.""" """Load `Xorbits` DataFrame."""
def __init__(self, data_frame: Any, page_content_column: str = "text"): def __init__(self, data_frame: Any, page_content_column: str = "text"):
"""Initialize with dataframe object. """Initialize with dataframe object.

View File

@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
class YoutubeLoader(BaseLoader): class YoutubeLoader(BaseLoader):
"""Loads Youtube transcripts.""" """Load `YouTube` transcripts."""
def __init__( def __init__(
self, self,
@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
@dataclass @dataclass
class GoogleApiYoutubeLoader(BaseLoader): class GoogleApiYoutubeLoader(BaseLoader):
"""Loads all Videos from a Channel """Load all Videos from a `YouTube` Channel.
To use, you should have the ``googleapiclient,youtube_transcript_api`` To use, you should have the ``googleapiclient,youtube_transcript_api``
python package installed. python package installed.