Mirror of https://github.com/hwchase17/langchain, synced 2024-11-06 03:20:49 +00:00
docstrings: document_loaders consistency 2 (#9148)
This is Part 2. See #9139 (Part 1).
This commit is contained in: parent 1b58460fe3, commit 19f504790e
@@ -1,4 +1,3 @@
-"""Loads HuggingFace datasets."""
 from typing import Iterator, List, Mapping, Optional, Sequence, Union
 
 from langchain.docstore.document import Document
@@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class HuggingFaceDatasetLoader(BaseLoader):
-    """Load Documents from the Hugging Face Hub."""
+    """Load from `Hugging Face Hub` datasets."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Loads iFixit data."""
 from typing import List, Optional
 
 import requests
@@ -11,7 +10,7 @@ IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
 
 
 class IFixitLoader(BaseLoader):
-    """Load iFixit repair guides, device wikis and answers.
+    """Load `iFixit` repair guides, device wikis and answers.
 
     iFixit is the largest, open repair community on the web. The site contains nearly
     100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is

@@ -1,11 +1,10 @@
-"""Loads image files."""
 from typing import List
 
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class UnstructuredImageLoader(UnstructuredFileLoader):
-    """Loader that uses Unstructured to load PNG and JPG files.
+    """Load `PNG` and `JPG` files using `Unstructured`.
 
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
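For context on the "single" vs. "elements" wording that recurs in the Unstructured-backed loaders throughout this change set, a minimal usage sketch (the file name is a placeholder and the `unstructured` package is assumed to be installed):

    from langchain.document_loaders import UnstructuredImageLoader

    # "single" mode (the default) returns the whole file as one Document
    loader = UnstructuredImageLoader("figure.png")
    docs = loader.load()

    # "elements" mode returns one Document per element detected by unstructured
    element_loader = UnstructuredImageLoader("figure.png", mode="elements")
    element_docs = element_loader.load()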
@@ -1,9 +1,3 @@
-"""Loads image captions.
-
-By default, the loader utilizes the pre-trained BLIP image captioning model.
-https://huggingface.co/Salesforce/blip-image-captioning-base
-
-"""
 from typing import Any, List, Tuple, Union
 
 import requests
@@ -13,7 +7,12 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ImageCaptionLoader(BaseLoader):
-    """Loads the captions of an image"""
+    """Load image captions.
+
+    By default, the loader utilizes the pre-trained
+    Salesforce BLIP image captioning model.
+    https://huggingface.co/Salesforce/blip-image-captioning-base
+    """
 
     def __init__(
         self,
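A minimal sketch of the caption loader above, assuming its constructor accepts a list of local image paths or URLs and that the `transformers` package is installed for the default Salesforce BLIP checkpoint (paths are placeholders):

    from langchain.document_loaders import ImageCaptionLoader

    # the default BLIP model is downloaded on first use
    loader = ImageCaptionLoader(["photo1.jpg", "photo2.jpg"])
    docs = loader.load()  # one Document per image; page_content holds the generated caption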
@@ -1,4 +1,3 @@
-"""Loads IMSDb."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
 
 
 class IMSDbLoader(WebBaseLoader):
-    """Loads IMSDb webpages."""
+    """Load `IMSDb` webpages."""
 
     def load(self) -> List[Document]:
         """Load webpage."""

@@ -1,4 +1,3 @@
-"""Loader that fetches data from IUGU"""
 import json
 import urllib.request
 from typing import List, Optional
@@ -17,7 +16,7 @@ IUGU_ENDPOINTS = {
 
 
 class IuguLoader(BaseLoader):
-    """Loader that fetches data from IUGU."""
+    """Load from `IUGU`."""
 
     def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
         """Initialize the IUGU resource.

@@ -11,8 +11,7 @@ LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}"
 
 
 class JoplinLoader(BaseLoader):
-    """
-    Loader that fetches notes from Joplin.
+    """Load notes from `Joplin`.
 
     In order to use this loader, you need to have Joplin running with the
     Web Clipper enabled (look for "Web Clipper" in the app settings).

@@ -1,4 +1,3 @@
-"""Loads data from JSON."""
 import json
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class JSONLoader(BaseLoader):
-    """Loads a JSON file using a jq schema.
+    """Load a `JSON` file using a `jq` schema.
 
     Example:
         [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
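The jq-schema example in the JSONLoader docstring above maps onto a call like the following sketch (the file name is a placeholder; the `jq` Python package is required):

    from langchain.document_loaders import JSONLoader

    # data.json contains: [{"text": "first"}, {"text": "second"}]
    loader = JSONLoader(file_path="data.json", jq_schema=".[].text")
    docs = loader.load()  # one Document per value matched by the jq schema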
@@ -1,4 +1,3 @@
-"""Loads LarkSuite (FeiShu) document json dump."""
 import json
 import urllib.request
 from typing import Any, Iterator, List
@@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class LarkSuiteDocLoader(BaseLoader):
-    """Loads LarkSuite (FeiShu) document."""
+    """Load from `LarkSuite` (`FeiShu`)."""
 
     def __init__(self, domain: str, access_token: str, document_id: str):
         """Initialize with domain, access_token (tenant / user), and document_id.

@@ -1,11 +1,10 @@
-"""Loads Markdown files."""
 from typing import List
 
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class UnstructuredMarkdownLoader(UnstructuredFileLoader):
-    """Loader that uses Unstructured to load markdown files.
+    """Load `Markdown` files using `Unstructured`.
 
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single

@@ -1,4 +1,3 @@
-"""Mastodon document loader."""
 from __future__ import annotations
 
 import os
@@ -23,7 +22,7 @@ def _dependable_mastodon_import() -> mastodon:
 
 
 class MastodonTootsLoader(BaseLoader):
-    """Mastodon toots loader."""
+    """Load the `Mastodon` 'toots'."""
 
     def __init__(
         self,

@@ -8,7 +8,7 @@ from langchain.utilities.max_compute import MaxComputeAPIWrapper
 
 
 class MaxComputeLoader(BaseLoader):
-    """Loads a query result from Alibaba Cloud MaxCompute table into documents."""
+    """Load from `Alibaba Cloud MaxCompute` table."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Load Data from a MediaWiki dump xml."""
 import logging
 from pathlib import Path
 from typing import List, Optional, Sequence, Union
@@ -10,8 +9,8 @@ logger = logging.getLogger(__name__)
 
 
 class MWDumpLoader(BaseLoader):
-    """
-    Load MediaWiki dump from XML file
+    """Load `MediaWiki` dump from an `XML` file.
+
     Example:
     .. code-block:: python
 

@@ -1,5 +1,3 @@
-"""Load MHTML files, enriching metadata with page title."""
-
 import email
 import logging
 from typing import Dict, List, Union
@@ -11,7 +9,7 @@ logger = logging.getLogger(__name__)
 
 
 class MHTMLLoader(BaseLoader):
-    """Loader that uses beautiful soup to parse HTML files."""
+    """Parse `MHTML` files with `BeautifulSoup`."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Loader that fetches data from Modern Treasury"""
 import json
 import urllib.request
 from base64 import b64encode
@@ -27,7 +26,7 @@ incoming_payment_details",
 
 
 class ModernTreasuryLoader(BaseLoader):
-    """Loader that fetches data from Modern Treasury."""
+    """Load from `Modern Treasury`."""
 
     def __init__(
         self,
@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
 
 
 class NewsURLLoader(BaseLoader):
-    """Loader that uses newspaper to load news articles from URLs.
+    """Load news articles from URLs using `Unstructured`.
 
     Args:
         urls: URLs to load. Each is loaded into its own document.

@@ -70,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
 
 
 class NotebookLoader(BaseLoader):
-    """Loads .ipynb notebook files."""
+    """Load `Jupyter notebook` (.ipynb) files."""
 
     def __init__(
         self,
@@ -80,7 +80,7 @@ class NotebookLoader(BaseLoader):
         remove_newline: bool = False,
         traceback: bool = False,
     ):
-        """Initialize with path.
+        """Initialize with a path.
 
         Args:
             path: The path to load the notebook from.

@@ -1,4 +1,3 @@
-"""Loads Notion directory dump."""
 from pathlib import Path
 from typing import List
 
@@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class NotionDirectoryLoader(BaseLoader):
-    """Loads Notion directory dump."""
+    """Load `Notion directory` dump."""
 
     def __init__(self, path: str):
         """Initialize with a file path."""

@@ -1,5 +1,3 @@
-"""Notion DB loader for langchain"""
-
 from typing import Any, Dict, List, Optional
 
 import requests
@@ -14,7 +12,7 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
 
 
 class NotionDBLoader(BaseLoader):
-    """Notion DB Loader.
+    """Load from `Notion DB`.
 
     Reads content from pages within a Notion Database.
     Args:

@@ -1,4 +1,3 @@
-"""Extract text from any file type."""
 import json
 import uuid
 from typing import List
@@ -9,7 +8,7 @@ from langchain.tools.nuclia.tool import NucliaUnderstandingAPI
 
 
 class NucliaLoader(BaseLoader):
-    """Extract text from any file type."""
+    """Load from any file type using `Nuclia Understanding API`."""
 
     def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI):
         self.nua = nuclia_tool
@@ -7,7 +7,7 @@ from langchain.document_loaders.obs_file import OBSFileLoader
 
 
 class OBSDirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from Huawei OBS."""
+    """Load from `Huawei OBS directory`."""
 
     def __init__(
         self,

@@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class OBSFileLoader(BaseLoader):
-    """Loader for Huawei OBS file."""
+    """Load from the `Huawei OBS file`."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Loads Obsidian directory dump."""
 import re
 from pathlib import Path
 from typing import List
@@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ObsidianLoader(BaseLoader):
-    """Loads Obsidian files from disk."""
+    """Load `Obsidian` files from directory."""
 
     FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
 

@@ -1,4 +1,3 @@
-"""Loads OpenOffice ODT files."""
 from typing import Any, List
 
 from langchain.document_loaders.unstructured import (
@@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredODTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load OpenOffice ODT files.
+    """Load `OpenOffice ODT` files using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured

@@ -60,7 +60,7 @@ class _SupportedFileTypes(BaseModel):
 
 
 class OneDriveLoader(BaseLoader, BaseModel):
-    """Loads data from OneDrive."""
+    """Load from `Microsoft OneDrive`."""
 
     settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
     """ The settings for the OneDrive API client."""

@@ -16,7 +16,7 @@ CHUNK_SIZE = 1024 * 1024 * 5
 
 
 class OneDriveFileLoader(BaseLoader, BaseModel):
-    """Loads a file from OneDrive."""
+    """Load a file from `Microsoft OneDrive`."""
 
     file: File = Field(...)
     """The file to load."""

@@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class OpenCityDataLoader(BaseLoader):
-    """Loads Open City data."""
+    """Load from `Open City`."""
 
     def __init__(self, city_id: str, dataset_id: str, limit: int):
         """Initialize with dataset_id.

@@ -1,4 +1,3 @@
-"""Loads Org-Mode files."""
 from typing import Any, List
 
 from langchain.document_loaders.unstructured import (
@@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredOrgModeLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load Org-Mode files.
+    """Load `Org-Mode` files using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured
@@ -1,4 +1,3 @@
-"""Loads PDF files."""
 import json
 import logging
 import os
@@ -30,7 +29,8 @@ logger = logging.getLogger(__file__)
 
 
 class UnstructuredPDFLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load PDF files.
+    """Load `PDF` files using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured
@@ -59,7 +59,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
 
 
 class BasePDFLoader(BaseLoader, ABC):
-    """Base loader class for PDF files.
+    """Base Loader class for `PDF` files.
 
     Defaults to check for local file, but if the file is a web path, it will download it
     to a temporary file, use it, then clean up the temporary file after completion
@@ -122,7 +122,7 @@ class BasePDFLoader(BaseLoader, ABC):
 
 
 class OnlinePDFLoader(BasePDFLoader):
-    """Loads online PDFs."""
+    """Load online `PDF`."""
 
     def load(self) -> List[Document]:
         """Load documents."""
@@ -131,7 +131,7 @@ class OnlinePDFLoader(BasePDFLoader):
 
 
 class PyPDFLoader(BasePDFLoader):
-    """Loads a PDF with pypdf and chunks at character level.
+    """Load `PDF` using `pypdf` and chunks at character level.
 
     Loader also stores page numbers in metadata.
     """
@@ -162,7 +162,7 @@ class PyPDFLoader(BasePDFLoader):
 
 
 class PyPDFium2Loader(BasePDFLoader):
-    """Loads a PDF with pypdfium2 and chunks at character level."""
+    """Load `PDF` using `pypdfium2` and chunks at character level."""
 
     def __init__(self, file_path: str):
         """Initialize with a file path."""
@@ -182,7 +182,7 @@ class PyPDFium2Loader(BasePDFLoader):
 
 
 class PyPDFDirectoryLoader(BaseLoader):
-    """Loads a directory with PDF files with pypdf and chunks at character level.
+    """Load a directory with `PDF` files using `pypdf` and chunks at character level.
 
     Loader also stores page numbers in metadata.
     """
@@ -227,7 +227,7 @@ class PyPDFDirectoryLoader(BaseLoader):
 
 
 class PDFMinerLoader(BasePDFLoader):
-    """Loader that uses PDFMiner to load PDF files."""
+    """Load `PDF` files using `PDFMiner`."""
 
     def __init__(self, file_path: str) -> None:
         """Initialize with file path."""
@@ -255,7 +255,7 @@ class PDFMinerLoader(BasePDFLoader):
 
 
 class PDFMinerPDFasHTMLLoader(BasePDFLoader):
-    """Loader that uses PDFMiner to load PDF files as HTML content."""
+    """Load `PDF` files as HTML content using `PDFMiner`."""
 
     def __init__(self, file_path: str):
         """Initialize with a file path."""
@@ -289,7 +289,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
 
 
 class PyMuPDFLoader(BasePDFLoader):
-    """Loader that uses PyMuPDF to load PDF files."""
+    """Load `PDF` files using `PyMuPDF`."""
 
     def __init__(self, file_path: str) -> None:
         """Initialize with a file path."""
@@ -314,7 +314,7 @@ class PyMuPDFLoader(BasePDFLoader):
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
 # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
 class MathpixPDFLoader(BasePDFLoader):
-    """This class uses Mathpix service to load PDF files."""
+    """Load `PDF` files using `Mathpix` service."""
 
     def __init__(
         self,
@@ -433,7 +433,7 @@ class MathpixPDFLoader(BasePDFLoader):
 
 
 class PDFPlumberLoader(BasePDFLoader):
-    """Loader that uses pdfplumber to load PDF files."""
+    """Load `PDF` files using `pdfplumber`."""
 
     def __init__(
         self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
@@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):
 
 
 class AmazonTextractPDFLoader(BasePDFLoader):
-    """Loads a PDF document from local file system, HTTP or S3.
+    """Load `PDF` files from a local file system, HTTP or S3.
 
     To authenticate, the AWS client uses the following methods to
     automatically load credentials:
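As a small illustration of the page-level chunking and page-number metadata mentioned in the PyPDFLoader docstring above, a hedged sketch (the path is a placeholder; `pypdf` must be installed; web paths are downloaded to a temporary file as described for BasePDFLoader):

    from langchain.document_loaders import PyPDFLoader

    loader = PyPDFLoader("example.pdf")
    pages = loader.load()                           # one Document per page
    first_page_number = pages[0].metadata["page"]   # page number kept in metadata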
@@ -1,4 +1,3 @@
-"""Loads PowerPoint files."""
 import os
 from typing import List
 
@@ -6,7 +5,8 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class UnstructuredPowerPointLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load PowerPoint files.
+    """Load `Microsoft PowerPoint` files using `Unstructured`.
+
     Works with both .ppt and .pptx files.
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single

@@ -1,4 +1,3 @@
-"""Loads documents from Psychic.dev."""
 from typing import List, Optional
 
 from langchain.docstore.document import Document
@@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class PsychicLoader(BaseLoader):
-    """Loads documents from Psychic.dev."""
+    """Load from `Psychic.dev`."""
 
     def __init__(
         self, api_key: str, account_id: str, connector_id: Optional[str] = None

@@ -6,7 +6,7 @@ from langchain.utilities.pubmed import PubMedAPIWrapper
 
 
 class PubMedLoader(BaseLoader):
-    """Loads a query result from PubMed biomedical library into a list of Documents.
+    """Load from the `PubMed` biomedical library.
 
     Attributes:
         query: The query to be passed to the PubMed API.

@@ -1,4 +1,3 @@
-"""Load from a Spark Dataframe object"""
 import itertools
 import logging
 import sys
@@ -14,7 +13,7 @@ if TYPE_CHECKING:
 
 
 class PySparkDataFrameLoader(BaseLoader):
-    """Load PySpark DataFrames"""
+    """Load `PySpark` DataFrames."""
 
     def __init__(
         self,

@@ -4,9 +4,7 @@ from langchain.document_loaders.text import TextLoader
 
 
 class PythonLoader(TextLoader):
-    """
-    Load Python files, respecting any non-default encoding if specified.
-    """
+    """Load `Python` files, respecting any non-default encoding if specified."""
 
     def __init__(self, file_path: str):
         """Initialize with a file path.

@@ -1,4 +1,3 @@
-"""Loads ReadTheDocs documentation directory dump."""
 from pathlib import Path
 from typing import Any, List, Optional, Tuple, Union
 
@@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ReadTheDocsLoader(BaseLoader):
-    """Loads ReadTheDocs documentation directory dump."""
+    """Load `ReadTheDocs` documentation directory."""
 
     def __init__(
         self,

@@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class RecursiveUrlLoader(BaseLoader):
-    """Loads all child links from a given url."""
+    """Load all child links from a URL page."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Reddit document loader."""
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
@@ -21,7 +20,8 @@ def _dependable_praw_import() -> praw:
 
 
 class RedditPostsLoader(BaseLoader):
-    """Reddit posts loader.
+    """Load `Reddit` posts.
+
     Read posts on a subreddit.
     First, you need to go to
     https://www.reddit.com/prefs/apps/

@@ -1,4 +1,3 @@
-"""Loads Roam directory dump."""
 from pathlib import Path
 from typing import List
 
@@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class RoamLoader(BaseLoader):
-    """Loads Roam files from disk."""
+    """Load `Roam` files from a directory."""
 
     def __init__(self, path: str):
         """Initialize with a path."""

@@ -17,7 +17,7 @@ class ColumnNotFoundError(Exception):
 
 
 class RocksetLoader(BaseLoader):
-    """Wrapper around Rockset db
+    """Load from a `Rockset` database.
 
     To use, you should have the `rockset` python package installed.
 
@@ -1,4 +1,3 @@
-"""Loader that uses unstructured to load HTML files."""
 import logging
 from typing import Any, Iterator, List, Optional, Sequence
 
@@ -10,7 +9,7 @@ logger = logging.getLogger(__name__)
 
 
 class RSSFeedLoader(BaseLoader):
-    """Loader that uses newspaper to load news articles from RSS feeds.
+    """Load news articles from `RSS` feeds using `Unstructured`.
 
     Args:
         urls: URLs for RSS feeds to load. Each articles in the feed is loaded into its own document.

@@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredRSTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load RST files.
+    """Load `RST` files using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured

@@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredRTFLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load RTF files.
+    """Load `RTF` files using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured

@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from an AWS S3 directory."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -7,7 +6,7 @@ from langchain.document_loaders.s3_file import S3FileLoader
 
 
 class S3DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from an AWS S3."""
+    """Load from `Amazon AWS S3` directory."""
 
     def __init__(self, bucket: str, prefix: str = ""):
         """Initialize with bucket and key name.

@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from an AWS S3 file."""
 import os
 import tempfile
 from typing import List
@@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class S3FileLoader(BaseLoader):
-    """Loading logic for loading documents from an AWS S3 file."""
+    """Load from `Amazon AWS S3` file."""
 
     def __init__(self, bucket: str, key: str):
         """Initialize with bucket and key name.

@@ -1,4 +1,3 @@
-"""Loader that fetches a sitemap and loads those URLs."""
 import itertools
 import re
 from typing import Any, Callable, Generator, Iterable, List, Optional
@@ -22,7 +21,7 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N
 
 
 class SitemapLoader(WebBaseLoader):
-    """Loader that fetches a sitemap and loads those URLs."""
+    """Load a sitemap and its URLs."""
 
     def __init__(
         self,

@@ -1,4 +1,3 @@
-"""Loader for documents from a Slack export."""
 import json
 import zipfile
 from pathlib import Path
@@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class SlackDirectoryLoader(BaseLoader):
-    """Loads documents from a Slack directory dump."""
+    """Load from a `Slack` directory dump."""
 
     def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
         """Initialize the SlackDirectoryLoader.

@@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class SnowflakeLoader(BaseLoader):
-    """Loads a query result from Snowflake into a list of documents.
+    """Load from `Snowflake` API.
 
     Each document represents one row of the result. The `page_content_columns`
     are written into the `page_content` of the document. The `metadata_columns`
@@ -1,4 +1,3 @@
-"""Loader that fetches data from Spreedly API."""
 import json
 import urllib.request
 from typing import List
@@ -20,7 +19,7 @@ SPREEDLY_ENDPOINTS = {
 
 
 class SpreedlyLoader(BaseLoader):
-    """Loader that fetches data from Spreedly API."""
+    """Load from `Spreedly` API."""
 
     def __init__(self, access_token: str, resource: str) -> None:
         """Initialize with an access token and a resource.

@@ -1,4 +1,3 @@
-"""Loader for .srt (subtitle) files."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class SRTLoader(BaseLoader):
-    """Loader for .srt (subtitle) files."""
+    """Load `.srt` (subtitle) files."""
 
     def __init__(self, file_path: str):
         """Initialize with a file path."""

@@ -1,4 +1,3 @@
-"""Loader that fetches data from Stripe"""
 import json
 import urllib.request
 from typing import List, Optional
@@ -18,7 +17,7 @@ STRIPE_ENDPOINTS = {
 
 
 class StripeLoader(BaseLoader):
-    """Loader that fetches data from Stripe."""
+    """Load from `Stripe` API."""
 
     def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
         """Initialize with a resource and an access token.

@@ -1,4 +1,3 @@
-"""Loads Telegram chat json dump."""
 from __future__ import annotations
 
 import asyncio
@@ -24,7 +23,7 @@ def concatenate_rows(row: dict) -> str:
 
 
 class TelegramChatFileLoader(BaseLoader):
-    """Loads Telegram chat json directory dump."""
+    """Load from `Telegram chat` dump."""
 
     def __init__(self, path: str):
         """Initialize with a path."""

@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from Tencent Cloud COS directory."""
 from typing import Any, Iterator, List
 
 from langchain.docstore.document import Document
@@ -7,7 +6,7 @@ from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader
 
 
 class TencentCOSDirectoryLoader(BaseLoader):
-    """Loader for Tencent Cloud COS directory."""
+    """Load from `Tencent Cloud COS` directory."""
 
     def __init__(self, conf: Any, bucket: str, prefix: str = ""):
         """Initialize with COS config, bucket and prefix.

@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from Tencent Cloud COS file."""
 import os
 import tempfile
 from typing import Any, Iterator, List
@@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class TencentCOSFileLoader(BaseLoader):
-    """Loader for Tencent Cloud COS file."""
+    """Load from `Tencent Cloud COS` file."""
 
     def __init__(self, conf: Any, bucket: str, key: str):
         """Initialize with COS config, bucket and key name.

@@ -6,7 +6,7 @@ from langchain.utilities.tensorflow_datasets import TensorflowDatasets
 
 
 class TensorflowDatasetLoader(BaseLoader):
-    """Loads from TensorFlow Datasets into a list of Documents.
+    """Load from `TensorFlow Dataset`.
 
     Attributes:
         dataset_name: the name of the dataset to load

@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
 
 
 class TextLoader(BaseLoader):
-    """Load text files.
+    """Load text file.
 
 
     Args:

@@ -1,4 +1,3 @@
-"""Loads HTML to markdown using 2markdown."""
 from __future__ import annotations
 
 from typing import Iterator, List
@@ -10,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ToMarkdownLoader(BaseLoader):
-    """Loads HTML to markdown using 2markdown."""
+    """Load `HTML` using `2markdown API`."""
 
     def __init__(self, url: str, api_key: str):
         """Initialize with url and api key."""

@@ -7,11 +7,10 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class TomlLoader(BaseLoader):
-    """
-    A TOML document loader that inherits from the BaseLoader class.
+    """Load `TOML` files.
 
-    This class can be initialized with either a single source file or a source
-    directory containing TOML files.
+    It can load a single source file or several files in a single
+    directory.
     """
 
     def __init__(self, source: Union[str, Path]):
@@ -1,4 +1,3 @@
-"""Loads cards from Trello"""
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
@@ -12,7 +11,7 @@ if TYPE_CHECKING:
 
 
 class TrelloLoader(BaseLoader):
-    """Trello loader. Reads all cards from a Trello board."""
+    """Load cards from a `Trello` board."""
 
     def __init__(
         self,

@@ -7,7 +7,9 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredTSVLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load TSV files. Like other
+    """Load `TSV` files using `Unstructured`.
+
+    Like other
     Unstructured loaders, UnstructuredTSVLoader can be used in both
     "single" and "elements" mode. If you use the loader in "elements"
     mode, the TSV file will be a single Unstructured Table element.

@@ -1,4 +1,3 @@
-"""Twitter document loader."""
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -22,8 +21,9 @@ def _dependable_tweepy_import() -> tweepy:
 
 
 class TwitterTweetLoader(BaseLoader):
-    """Twitter tweets loader.
-    Read tweets of user twitter handle.
+    """Load `Twitter` tweets.
+
+    Read tweets of the user's Twitter handle.
 
     First you need to go to
     `https://developer.twitter.com/en/docs/twitter-api
@@ -130,7 +130,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
 
 
 class UnstructuredFileLoader(UnstructuredBaseLoader):
-    """Loader that uses Unstructured to load files.
+    """Load files using `Unstructured`.
 
     The file loader uses the
     unstructured partition function and will automatically detect the file
@@ -211,7 +211,7 @@ def get_elements_from_api(
 
 
 class UnstructuredAPIFileLoader(UnstructuredFileLoader):
-    """Loader that uses the Unstructured API to load files.
+    """Load files using `Unstructured` API.
 
     By default, the loader makes a call to the hosted Unstructured API.
     If you are running the unstructured API locally, you can change the
@@ -275,7 +275,7 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
 
 
 class UnstructuredFileIOLoader(UnstructuredBaseLoader):
-    """Loader that uses Unstructured to load files.
+    """Load files using `Unstructured`.
 
     The file loader
     uses the unstructured partition function and will automatically detect the file
@@ -322,7 +322,7 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
 
 
 class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
-    """Loader that uses the Unstructured API to load files.
+    """Load files using `Unstructured` API.
 
     By default, the loader makes a call to the hosted Unstructured API.
     If you are running the unstructured API locally, you can change the
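A sketch of the hosted-vs-local choice mentioned in the UnstructuredAPIFileLoader docstring above (the file name and local endpoint are placeholders; the `url` and `api_key` keyword arguments are assumed from the loader's constructor at this point in the codebase):

    from langchain.document_loaders import UnstructuredAPIFileLoader

    loader = UnstructuredAPIFileLoader(
        "report.pdf",                                     # placeholder file
        url="http://localhost:8000/general/v0/general",   # assumed locally running unstructured API
        api_key="",                                       # only needed for the hosted API
    )
    docs = loader.load()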
@@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
 
 
 class UnstructuredURLLoader(BaseLoader):
-    """Loader that use Unstructured to load files from remote URLs.
+    """Load files from remote URLs using `Unstructured`.
+
     Use the unstructured partition function to detect the MIME type
     and route the file to the appropriate partitioner.
 

@@ -10,7 +10,8 @@ logger = logging.getLogger(__name__)
 
 
 class PlaywrightURLLoader(BaseLoader):
-    """Loader that uses Playwright and to load a page and unstructured to load the html.
+    """Load `HTML` pages with `Playwright` and parse with `Unstructured`.
+
     This is useful for loading pages that require javascript to render.
 
     Attributes:
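A minimal sketch of the Playwright-backed loader above for JavaScript-heavy pages (the URL is a placeholder; `playwright` and its browsers must be installed; `remove_selectors` is assumed from the loader's attributes):

    from langchain.document_loaders import PlaywrightURLLoader

    loader = PlaywrightURLLoader(
        urls=["https://example.com"],
        remove_selectors=["header", "footer"],  # strip navigation chrome before parsing
    )
    docs = loader.load()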
@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
 
 
 class SeleniumURLLoader(BaseLoader):
-    """Loader that uses Selenium and to load a page and unstructured to load the html.
+    """Load `HTML` pages with `Selenium` and parse with `Unstructured`.
+
     This is useful for loading pages that require javascript to render.
 
     Attributes:

@@ -10,7 +10,7 @@ from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper
 
 
 class WeatherDataLoader(BaseLoader):
-    """Weather Reader.
+    """Load weather data with `Open Weather Map` API.
 
     Reads the forecast & current weather of any location using OpenWeatherMap's free
     API. Checkout 'https://openweathermap.org/appid' for more on how to generate a free
@@ -37,7 +37,7 @@ def _build_metadata(soup: Any, url: str) -> dict:
 
 
 class WebBaseLoader(BaseLoader):
-    """Loader that uses urllib and beautiful soup to load webpages."""
+    """Load HTML pages using `urllib` and parse them with `BeautifulSoup`."""
 
     web_paths: List[str]
 
@@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
 
 
 class WhatsAppChatLoader(BaseLoader):
-    """Loads WhatsApp messages text file."""
+    """Load `WhatsApp` messages text file."""
 
     def __init__(self, path: str):
         """Initialize with path."""

@@ -6,7 +6,8 @@ from langchain.utilities.wikipedia import WikipediaAPIWrapper
 
 
 class WikipediaLoader(BaseLoader):
-    """Loads a query result from www.wikipedia.org into a list of Documents.
+    """Load from `Wikipedia`.
+
     The hard limit on the number of downloaded Documents is 300 for now.
 
     Each wiki page represents one Document.
@@ -13,7 +13,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class Docx2txtLoader(BaseLoader, ABC):
-    """Loads a DOCX with docx2txt and chunks at character level.
+    """Load `DOCX` file using `docx2txt` and chunks at character level.
 
     Defaults to check for local file, but if the file is a web path, it will download it
     to a temporary file, and use that, then clean up the temporary file after completion
@@ -65,7 +65,8 @@ class Docx2txtLoader(BaseLoader, ABC):
 
 
 class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load word documents.
+    """Load `Microsoft Word` file using `Unstructured`.
+
     Works with both .docx and .doc files.
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
@@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
 
 
 class UnstructuredXMLLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load XML files.
+    """Load `XML` file using `Unstructured`.
+
     You can run the loader in one of two modes: "single" and "elements".
     If you use "single" mode, the document will be returned as a single
     langchain Document object. If you use "elements" mode, the unstructured

@@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class XorbitsLoader(BaseLoader):
-    """Load Xorbits DataFrame."""
+    """Load `Xorbits` DataFrame."""
 
     def __init__(self, data_frame: Any, page_content_column: str = "text"):
         """Initialize with dataframe object.

@@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
 
 
 class YoutubeLoader(BaseLoader):
-    """Loads Youtube transcripts."""
+    """Load `YouTube` transcripts."""
 
     def __init__(
         self,
@@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
 
 @dataclass
 class GoogleApiYoutubeLoader(BaseLoader):
-    """Loads all Videos from a Channel
+    """Load all Videos from a `YouTube` Channel.
 
     To use, you should have the ``googleapiclient,youtube_transcript_api``
     python package installed.