docstrings document_loaders 3 (#6937)

- Updated docstrings for `document_loaders`
- Mass update `"""Loader that loads` to `"""Loads`

@baskaryan  - please, review
This commit is contained in:
Leonid Ganeline 2023-07-10 08:56:53 -07:00 committed by GitHub
parent 9d13dcd17c
commit 5eec74d9a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
54 changed files with 316 additions and 105 deletions

View File

@ -1,4 +1,4 @@
"""Loader that loads acreom vault from a directory."""
"""Loads acreom vault from a directory."""
import re
from pathlib import Path
from typing import Iterator, List

View File

@ -1,4 +1,4 @@
"""Loader that loads local airbyte json files."""
"""Loads local airbyte json files."""
import json
from typing import List
@ -8,7 +8,7 @@ from langchain.utils import stringify_dict
class AirbyteJSONLoader(BaseLoader):
"""Loader that loads local airbyte json files."""
"""Loads local airbyte json files."""
def __init__(self, file_path: str):
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""

View File

@ -1,4 +1,4 @@
"""Loader that loads AZLyrics."""
"""Loads AZLyrics."""
from typing import List
from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class AZLyricsLoader(WebBaseLoader):
"""Loader that loads AZLyrics webpages."""
"""Loads AZLyrics webpages."""
def load(self) -> List[Document]:
"""Load webpages into Documents."""

View File

@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
class BiliBiliLoader(BaseLoader):
"""Loader that loads bilibili transcripts."""
"""Loads bilibili transcripts."""
def __init__(self, video_urls: List[str]):
"""Initialize with bilibili url.

View File

@ -1,4 +1,4 @@
"""Loader that loads all documents from a blackboard course."""
"""Loads all documents from a blackboard course."""
import contextlib
import re
from pathlib import Path

View File

@ -1,3 +1,4 @@
"""Load conversations from ChatGPT data export"""
import datetime
import json
from typing import List
@ -31,7 +32,7 @@ class ChatGPTLoader(BaseLoader):
"""Load conversations from exported ChatGPT data."""
def __init__(self, log_file: str, num_logs: int = -1):
"""
"""Initialize a class object.
Args:
log_file: Path to the log file

View File

@ -1,4 +1,4 @@
"""Loader that loads College Confidential."""
"""Loads College Confidential."""
from typing import List
from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class CollegeConfidentialLoader(WebBaseLoader):
"""Loader that loads College Confidential webpages."""
"""Loads College Confidential webpages."""
def load(self) -> List[Document]:
"""Load webpages as Documents."""

View File

@ -1,4 +1,4 @@
"""Loader that loads EPub files."""
"""Loads EPub files."""
from typing import List
from langchain.document_loaders.unstructured import (

View File

@ -1,4 +1,4 @@
"""Loader that loads Microsoft Excel files."""
"""Loads Microsoft Excel files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (

View File

@ -1,4 +1,4 @@
"""Loader that loads Facebook chat json dump."""
"""Loads Facebook chat json dump."""
import datetime
import json
from pathlib import Path

View File

@ -1,4 +1,4 @@
"""Loader that loads Figma files json dump."""
"""Loads Figma files json dump."""
import json
import urllib.request
from typing import Any, List

View File

@ -1,4 +1,4 @@
"""Loader that loads GitBook."""
"""Loads GitBook."""
from typing import Any, List, Optional
from urllib.parse import urljoin, urlparse

View File

@ -1,4 +1,4 @@
"""Loader that loads data from Google Drive."""
"""Loads data from Google Drive."""
# Prerequisites:
# 1. Create a Google Cloud project

View File

@ -1,4 +1,4 @@
"""Loader that loads Hacker News."""
"""Loads Hacker News."""
from typing import Any, List
from langchain.docstore.document import Document

View File

@ -1,4 +1,4 @@
"""Loader that loads iFixit data."""
"""Loads iFixit data."""
from typing import List, Optional
import requests

View File

@ -1,4 +1,4 @@
"""Loader that loads image files."""
"""Loads image files."""
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader

View File

@ -37,13 +37,13 @@ class MastodonTootsLoader(BaseLoader):
Args:
mastodon_accounts: The list of Mastodon accounts to query.
number_toots: How many toots to pull for each account. Default is 100.
number_toots: How many toots to pull for each account. Defaults to 100.
exclude_replies: Whether to exclude reply toots from the load.
Default is False.
Defaults to False.
access_token: An access token if toots are loaded as a Mastodon app. Can
also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
api_base_url: A Mastodon API base URL to talk to, if not using the default.
Default is "https://mastodon.social".
Defaults to "https://mastodon.social".
"""
mastodon = _dependable_mastodon_import()
access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")

View File

@ -24,10 +24,11 @@ class MHTMLLoader(BaseLoader):
to pass to the BeautifulSoup object.
Args:
file_path: The path to the file to load.
file_path: Path to file to load.
open_encoding: The encoding to use when opening the file.
bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
get_text_separator: The separator to use when getting text from the soup.
bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
get_text_separator: The separator to use when getting the text
from the soup.
"""
try:
import bs4 # noqa:F401

View File

@ -35,6 +35,16 @@ class ModernTreasuryLoader(BaseLoader):
organization_id: Optional[str] = None,
api_key: Optional[str] = None,
) -> None:
"""
Args:
resource: The Modern Treasury resource to load.
organization_id: The Modern Treasury organization ID. It can also be
specified via the environment variable
"MODERN_TREASURY_ORGANIZATION_ID".
api_key: The Modern Treasury API key. It can also be specified via
the environment variable "MODERN_TREASURY_API_KEY".
"""
self.resource = resource
organization_id = organization_id or get_from_env(
"organization_id", "MODERN_TREASURY_ORGANIZATION_ID"

View File

@ -1,4 +1,4 @@
"""Loader that loads .ipynb notebook files."""
"""Loads .ipynb notebook files."""
import json
from pathlib import Path
from typing import Any, List
@ -10,7 +10,18 @@ from langchain.document_loaders.base import BaseLoader
def concatenate_cells(
cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
) -> str:
"""Combine cells information in a readable format ready to be used."""
"""Combine cells information in a readable format ready to be used.
Args:
cell: A dictionary
include_outputs: Whether to include the outputs of the cell.
max_output_length: Maximum length of the output to be displayed.
traceback: Whether to return a traceback of the error.
Returns:
A string with the cell information.
"""
cell_type = cell["cell_type"]
source = cell["source"]
output = cell["outputs"]
@ -45,7 +56,7 @@ def concatenate_cells(
def remove_newlines(x: Any) -> Any:
"""Remove recursively newlines, no matter the data structure they are stored in."""
"""Recursively removes newlines, no matter the data structure they are stored in."""
import pandas as pd
if isinstance(x, str):
@ -59,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
class NotebookLoader(BaseLoader):
"""Loader that loads .ipynb notebook files."""
"""Loads .ipynb notebook files."""
def __init__(
self,
@ -69,7 +80,19 @@ class NotebookLoader(BaseLoader):
remove_newline: bool = False,
traceback: bool = False,
):
"""Initialize with path."""
"""Initialize with path.
Args:
path: The path to load the notebook from.
include_outputs: Whether to include the outputs of the cell.
Defaults to False.
max_output_length: Maximum length of the output to be displayed.
Defaults to 10.
remove_newline: Whether to remove newlines from the notebook.
Defaults to False.
traceback: Whether to return a traceback of the error.
Defaults to False.
"""
self.file_path = path
self.include_outputs = include_outputs
self.max_output_length = max_output_length

View File

@ -1,4 +1,4 @@
"""Loader that loads Notion directory dump."""
"""Loads Notion directory dump."""
from pathlib import Path
from typing import List
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class NotionDirectoryLoader(BaseLoader):
"""Loader that loads Notion directory dump."""
"""Loads Notion directory dump."""
def __init__(self, path: str):
"""Initialize with path."""
"""Initialize with a file path."""
self.file_path = path
def load(self) -> List[Document]:

View File

@ -15,11 +15,12 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
class NotionDBLoader(BaseLoader):
"""Notion DB Loader.
Reads content from pages within a Noton Database.
Reads content from pages within a Notion Database.
Args:
integration_token (str): Notion integration token.
database_id (str): Notion database id.
request_timeout_sec (int): Timeout for Notion requests in seconds.
Defaults to 10.
"""
def __init__(
@ -75,7 +76,11 @@ class NotionDBLoader(BaseLoader):
return pages
def load_page(self, page_summary: Dict[str, Any]) -> Document:
"""Read a page."""
"""Read a page.
Args:
page_summary: Page summary from Notion API.
"""
page_id = page_summary["id"]
# load properties as metadata

View File

@ -1,4 +1,4 @@
"""Loader that loads Obsidian directory dump."""
"""Loads Obsidian directory dump."""
import re
from pathlib import Path
from typing import List
@ -8,14 +8,21 @@ from langchain.document_loaders.base import BaseLoader
class ObsidianLoader(BaseLoader):
"""Loader that loads Obsidian files from disk."""
"""Loads Obsidian files from disk."""
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
):
"""Initialize with path."""
"""Initialize with a path.
Args:
path: Path to the directory containing the Obsidian files.
encoding: Charset encoding, defaults to "UTF-8"
collect_metadata: Whether to collect metadata from the front matter.
Defaults to True.
"""
self.file_path = path
self.encoding = encoding
self.collect_metadata = collect_metadata

View File

@ -1,4 +1,4 @@
"""Loader that loads Open Office ODT files."""
"""Loads OpenOffice ODT files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
@ -8,11 +8,19 @@ from langchain.document_loaders.unstructured import (
class UnstructuredODTLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load open office ODT files."""
"""Loader that uses unstructured to load OpenOffice ODT files."""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Args:
file_path: The path to the file to load.
mode: The mode to use when loading the file. Can be one of "single",
"multi", or "all". Default is "single".
**unstructured_kwargs: Any kwargs to pass to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.6.3")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -1,4 +1,4 @@
"""Loader that loads data from OneDrive"""
"""Loads data from OneDrive"""
from __future__ import annotations
import logging
@ -60,11 +60,18 @@ class _SupportedFileTypes(BaseModel):
class OneDriveLoader(BaseLoader, BaseModel):
"""Loads data from OneDrive."""
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
""" The settings for the OneDrive API client."""
drive_id: str = Field(...)
""" The ID of the OneDrive drive to load data from."""
folder_path: Optional[str] = None
""" The path to the folder to load data from."""
object_ids: Optional[List[str]] = None
""" The IDs of the objects to load data from."""
auth_with_token: bool = False
""" Whether to authenticate with a token or not. Defaults to False."""
def _auth(self) -> Type[Account]:
"""

View File

@ -16,10 +16,15 @@ CHUNK_SIZE = 1024 * 1024 * 5
class OneDriveFileLoader(BaseLoader, BaseModel):
"""Loads a file from OneDrive."""
file: File = Field(...)
"""The file to load."""
class Config:
arbitrary_types_allowed = True
"""Allow arbitrary types. This is needed for the File type.
Set to True here (pydantic's default is False).
See https://pydantic-docs.helpmanual.io/usage/types/#arbitrary-types-allowed"""
def load(self) -> List[Document]:
"""Load Documents"""

View File

@ -5,13 +5,19 @@ from langchain.document_loaders.base import BaseLoader
class OpenCityDataLoader(BaseLoader):
"""Loader that loads Open city data."""
"""Loads Open City data."""
def __init__(self, city_id: str, dataset_id: str, limit: int):
"""Initialize with dataset_id"""
""" Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6 """
""" e.g., city_id = data.sfgov.org """
""" e.g., dataset_id = vw6y-z8j6 """
"""Initialize with dataset_id.
Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6
e.g., city_id = data.sfgov.org
e.g., dataset_id = vw6y-z8j6
Args:
city_id: The Open City city identifier.
dataset_id: The Open City dataset identifier.
limit: The maximum number of documents to load.
"""
self.city_id = city_id
self.dataset_id = dataset_id
self.limit = limit

View File

@ -1,4 +1,4 @@
"""Loader that loads Org-Mode files."""
"""Loads Org-Mode files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
@ -13,6 +13,14 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Args:
file_path: The path to the file to load.
mode: The mode to use when loading the file. Default is "single".
**unstructured_kwargs: Any additional keyword arguments to pass
to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.7.9")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -1,4 +1,4 @@
"""Loader that loads PDF files."""
"""Loads PDF files."""
import json
import logging
import os
@ -41,11 +41,11 @@ class BasePDFLoader(BaseLoader, ABC):
"""Base loader class for PDF files.
Defaults to check for local file, but if the file is a web path, it will download it
to a temporary file, and use that, then clean up the temporary file after completion
to a temporary file, use it, then clean up the temporary file after completion
"""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
self.file_path = file_path
self.web_path = None
if "~" in self.file_path:
@ -86,7 +86,7 @@ class BasePDFLoader(BaseLoader, ABC):
class OnlinePDFLoader(BasePDFLoader):
"""Loader that loads online PDFs."""
"""Loads online PDFs."""
def load(self) -> List[Document]:
"""Load documents."""
@ -97,13 +97,13 @@ class OnlinePDFLoader(BasePDFLoader):
class PyPDFLoader(BasePDFLoader):
"""Loads a PDF with pypdf and chunks at character level.
Loader also stores page numbers in metadatas.
Loader also stores page numbers in metadata.
"""
def __init__(
self, file_path: str, password: Optional[Union[str, bytes]] = None
) -> None:
"""Initialize with file path."""
"""Initialize with a file path."""
try:
import pypdf # noqa:F401
except ImportError:
@ -129,7 +129,7 @@ class PyPDFium2Loader(BasePDFLoader):
"""Loads a PDF with pypdfium2 and chunks at character level."""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
super().__init__(file_path)
self.parser = PyPDFium2Parser()
@ -148,7 +148,7 @@ class PyPDFium2Loader(BasePDFLoader):
class PyPDFDirectoryLoader(BaseLoader):
"""Loads a directory with PDF files with pypdf and chunks at character level.
Loader also stores page numbers in metadatas.
Loader also stores page numbers in metadata.
"""
def __init__(
@ -222,7 +222,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files as HTML content."""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
try:
from pdfminer.high_level import extract_text_to_fp # noqa:F401
except ImportError:
@ -256,7 +256,7 @@ class PyMuPDFLoader(BasePDFLoader):
"""Loader that uses PyMuPDF to load PDF files."""
def __init__(self, file_path: str) -> None:
"""Initialize with file path."""
"""Initialize with a file path."""
try:
import fitz # noqa:F401
except ImportError:
@ -278,6 +278,8 @@ class PyMuPDFLoader(BasePDFLoader):
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
"""This class uses Mathpix service to load PDF files."""
def __init__(
self,
file_path: str,
@ -286,6 +288,16 @@ class MathpixPDFLoader(BasePDFLoader):
should_clean_pdf: bool = False,
**kwargs: Any,
) -> None:
"""Initialize with a file path.
Args:
file_path: a file for loading.
processed_file_format: a format of the processed file. Default is "mmd".
max_wait_time_seconds: a maximum time to wait for the response from
the server. Default is 500.
should_clean_pdf: a flag to clean the PDF file. Default is False.
**kwargs: additional keyword arguments.
"""
super().__init__(file_path)
self.mathpix_api_key = get_from_dict_or_env(
kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
@ -324,6 +336,13 @@ class MathpixPDFLoader(BasePDFLoader):
raise ValueError("Unable to send PDF to Mathpix.")
def wait_for_processing(self, pdf_id: str) -> None:
"""Wait for processing to complete.
Args:
pdf_id: a PDF id.
Returns: None
"""
url = self.url + "/" + pdf_id
for _ in range(0, self.max_wait_time_seconds, 5):
response = requests.get(url, headers=self.headers)
@ -346,6 +365,14 @@ class MathpixPDFLoader(BasePDFLoader):
return response.content.decode("utf-8")
def clean_pdf(self, contents: str) -> str:
"""Clean the PDF file.
Args:
contents: a PDF file contents.
Returns:
The cleaned contents as a string.
"""
contents = "\n".join(
[line for line in contents.split("\n") if not line.startswith("![]")]
)
@ -375,7 +402,7 @@ class PDFPlumberLoader(BasePDFLoader):
def __init__(
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
) -> None:
"""Initialize with file path."""
"""Initialize with a file path."""
try:
import pdfplumber # noqa:F401
except ImportError:

View File

@ -1,4 +1,4 @@
"""Loader that loads powerpoint files."""
"""Loads PowerPoint files."""
import os
from typing import List
@ -6,7 +6,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load powerpoint files."""
"""Loader that uses unstructured to load PowerPoint files."""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__

View File

@ -1,4 +1,4 @@
"""Loader that loads documents from Psychic.dev."""
"""Loads documents from Psychic.dev."""
from typing import List, Optional
from langchain.docstore.document import Document
@ -6,12 +6,18 @@ from langchain.document_loaders.base import BaseLoader
class PsychicLoader(BaseLoader):
"""Loader that loads documents from Psychic.dev."""
"""Loads documents from Psychic.dev."""
def __init__(
self, api_key: str, account_id: str, connector_id: Optional[str] = None
):
"""Initialize with API key, connector id, and account id."""
"""Initialize with API key, connector id, and account id.
Args:
api_key: The Psychic API key.
account_id: The Psychic account id.
connector_id: The Psychic connector id.
"""
try:
from psychicapi import ConnectorId, Psychic # noqa: F401

View File

@ -23,7 +23,15 @@ class PySparkDataFrameLoader(BaseLoader):
page_content_column: str = "text",
fraction_of_memory: float = 0.1,
):
"""Initialize with a Spark DataFrame object."""
"""Initialize with a Spark DataFrame object.
Args:
spark_session: The SparkSession object.
df: The Spark DataFrame object.
page_content_column: The name of the column containing the page content.
Defaults to "text".
fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
"""
try:
from pyspark.sql import DataFrame, SparkSession
except ImportError:
@ -48,7 +56,7 @@ class PySparkDataFrameLoader(BaseLoader):
self.column_names = self.df.columns
def get_num_rows(self) -> Tuple[int, int]:
"""Gets the amount of "feasible" rows for the DataFrame"""
"""Gets the number of "feasible" rows for the DataFrame"""
try:
import psutil
except ImportError as e:

View File

@ -9,6 +9,11 @@ class PythonLoader(TextLoader):
"""
def __init__(self, file_path: str):
"""Initialize with a file path.
Args:
file_path: The path to the file to load.
"""
with open(file_path, "rb") as f:
encoding, _ = tokenize.detect_encoding(f.readline)
super().__init__(file_path=file_path, encoding=encoding)

View File

@ -1,4 +1,4 @@
"""Loader that loads ReadTheDocs documentation directory dump."""
"""Loads ReadTheDocs documentation directory dump."""
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
class ReadTheDocsLoader(BaseLoader):
"""Loader that loads ReadTheDocs documentation directory dump."""
"""Loads ReadTheDocs documentation directory dump."""
def __init__(
self,
@ -20,7 +20,7 @@ class ReadTheDocsLoader(BaseLoader):
"""
Initialize ReadTheDocsLoader
The loader loops over all files under `path` and extract the actual content of
The loader loops over all files under `path` and extracts the actual content of
the files by retrieving main html tags. Default main html tags include
`<main id="main-content">`, `<div role="main">`, and `<article role="main">`. You
can also define your own html tags by passing custom_html_tag, e.g.
@ -31,7 +31,7 @@ class ReadTheDocsLoader(BaseLoader):
Args:
path: The location of pulled readthedocs folder.
encoding: The encoding with which to open the documents.
errors: Specifies how encoding and decoding errors are to be handled—this
errors: Specify how encoding and decoding errors are to be handled—this
cannot be used in binary mode.
custom_html_tag: Optional custom html tag to retrieve the content from
files.

View File

@ -8,17 +8,27 @@ from langchain.document_loaders.base import BaseLoader
class RecursiveUrlLoader(BaseLoader):
"""Loader that loads all child links from a given url."""
"""Loads all child links from a given url."""
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
"""Initialize with URL to crawl and any sub-directories to exclude."""
"""Initialize with URL to crawl and any subdirectories to exclude.
Args:
url: The URL to crawl.
exclude_dirs: A list of subdirectories to exclude.
"""
self.url = url
self.exclude_dirs = exclude_dirs
def get_child_links_recursive(
self, url: str, visited: Optional[Set[str]] = None
) -> Set[str]:
"""Recursively get all child links starting with the path of the input URL."""
"""Recursively get all child links starting with the path of the input URL.
Args:
url: The URL to crawl.
visited: A set of visited URLs.
"""
try:
from bs4 import BeautifulSoup
@ -39,7 +49,7 @@ class RecursiveUrlLoader(BaseLoader):
if not parent_url.endswith("/"):
parent_url += "/"
# Exclude the root and parent from list
# Exclude the root and parent from the list
visited = set() if visited is None else visited
# Exclude the links that start with any of the excluded directories

View File

@ -23,7 +23,7 @@ def _dependable_praw_import() -> praw:
class RedditPostsLoader(BaseLoader):
"""Reddit posts loader.
Read posts on a subreddit.
First you need to go to
First, you need to go to
https://www.reddit.com/prefs/apps/
and create your application
"""
@ -38,6 +38,20 @@ class RedditPostsLoader(BaseLoader):
categories: Sequence[str] = ["new"],
number_posts: Optional[int] = 10,
):
"""
Initialize with client_id, client_secret, user_agent, search_queries, mode,
categories, number_posts.
Example: https://www.reddit.com/r/learnpython/
Args:
client_id: Reddit client id.
client_secret: Reddit client secret.
user_agent: Reddit user agent.
search_queries: The search queries.
mode: The mode.
categories: The categories. Default: ["new"]
number_posts: The number of posts. Default: 10
"""
self.client_id = client_id
self.client_secret = client_secret
self.user_agent = user_agent

View File

@ -1,4 +1,4 @@
"""Loader that loads Roam directory dump."""
"""Loads Roam directory dump."""
from pathlib import Path
from typing import List
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class RoamLoader(BaseLoader):
"""Loader that loads Roam files from disk."""
"""Loads Roam files from disk."""
def __init__(self, path: str):
"""Initialize with path."""
"""Initialize with a path."""
self.file_path = path
def load(self) -> List[Document]:

View File

@ -1,4 +1,4 @@
"""Loader that loads RST files."""
"""Loads RST files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
@ -13,6 +13,16 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Initialize with a file path.
Args:
file_path: The path to the file to load.
mode: The mode to use for partitioning. See unstructured for details.
Defaults to "single".
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.7.5")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -1,4 +1,4 @@
"""Loader that loads rich text files."""
"""Loads rich text files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
@ -13,6 +13,16 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Initialize with a file path.
Args:
file_path: The path to the file to load.
mode: The mode to use for partitioning. See unstructured for details.
Defaults to "single".
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
min_unstructured_version = "0.5.12"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(

View File

@ -1,4 +1,4 @@
"""Loading logic for loading documents from an s3 directory."""
"""Loading logic for loading documents from an AWS S3 directory."""
from typing import List
from langchain.docstore.document import Document
@ -7,10 +7,15 @@ from langchain.document_loaders.s3_file import S3FileLoader
class S3DirectoryLoader(BaseLoader):
"""Loading logic for loading documents from s3."""
"""Loading logic for loading documents from an AWS S3 directory."""
def __init__(self, bucket: str, prefix: str = ""):
"""Initialize with bucket and key name."""
"""Initialize with bucket and key name.
Args:
bucket: The name of the S3 bucket.
prefix: The prefix of the S3 key. Defaults to "".
"""
self.bucket = bucket
self.prefix = prefix

View File

@ -1,4 +1,4 @@
"""Loading logic for loading documents from an s3 file."""
"""Loading logic for loading documents from an AWS S3 file."""
import os
import tempfile
from typing import List
@ -9,10 +9,15 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class S3FileLoader(BaseLoader):
"""Loading logic for loading documents from s3."""
"""Loading logic for loading documents from an AWS S3 file."""
def __init__(self, bucket: str, key: str):
"""Initialize with bucket and key name."""
"""Initialize with bucket and key name.
Args:
bucket: The name of the S3 bucket.
key: The key of the S3 object.
"""
self.bucket = bucket
self.key = key

View File

@ -42,11 +42,12 @@ class SitemapLoader(WebBaseLoader):
urls that are parsed and loaded
parsing_function: Function to parse bs4.Soup output
blocksize: number of sitemap locations per block
blocknum: the number of the block that should be loaded - zero indexed
blocknum: the number of the block that should be loaded - zero indexed.
Default: 0
meta_function: Function to parse bs4.Soup output for metadata
remember when setting this method to also copy metadata["loc"]
to metadata["source"] if you are using this field
is_local: whether the sitemap is a local file
is_local: whether the sitemap is a local file. Default: False
"""
if blocksize is not None and blocksize < 1:
@ -72,7 +73,14 @@ class SitemapLoader(WebBaseLoader):
self.is_local = is_local
def parse_sitemap(self, soup: Any) -> List[dict]:
"""Parse sitemap xml and load into a list of dicts."""
"""Parse sitemap xml and load into a list of dicts.
Args:
soup: BeautifulSoup object.
Returns:
List of dicts.
"""
els = []
for url in soup.find_all("url"):
loc = url.find("loc")

View File

@ -9,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
class SlackDirectoryLoader(BaseLoader):
"""Loader for loading documents from a Slack directory dump."""
"""Loads documents from a Slack directory dump."""
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
"""Initialize the SlackDirectoryLoader.

View File

@ -41,6 +41,7 @@ class SnowflakeLoader(BaseLoader):
role: Snowflake role.
database: Snowflake database
schema: Snowflake schema
parameters: Optional. Parameters to pass to the query.
page_content_columns: Optional. Columns written to Document `page_content`.
metadata_columns: Optional. Columns written to Document `metadata`.
"""
@ -62,7 +63,7 @@ class SnowflakeLoader(BaseLoader):
try:
import snowflake.connector
except ImportError as ex:
raise ValueError(
raise ImportError(
"Could not import snowflake-connector-python package. "
"Please install it with `pip install snowflake-connector-python`."
) from ex

View File

@ -23,6 +23,12 @@ class SpreedlyLoader(BaseLoader):
"""Loader that fetches data from Spreedly API."""
def __init__(self, access_token: str, resource: str) -> None:
"""Initialize with an access token and a resource.
Args:
access_token: The access token.
resource: The resource.
"""
self.access_token = access_token
self.resource = resource
self.headers = {

View File

@ -9,7 +9,7 @@ class SRTLoader(BaseLoader):
"""Loader for .srt (subtitle) files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
try:
import pysrt # noqa:F401
except ImportError:

View File

@ -21,6 +21,12 @@ class StripeLoader(BaseLoader):
"""Loader that fetches data from Stripe."""
def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
"""Initialize with a resource and an access token.
Args:
resource: The resource.
access_token: The access token.
"""
self.resource = resource
access_token = access_token or get_from_env(
"access_token", "STRIPE_ACCESS_TOKEN"

View File

@ -1,4 +1,4 @@
"""Loader that loads Telegram chat json dump."""
"""Loads Telegram chat json dump."""
from __future__ import annotations
import asyncio
@ -24,10 +24,10 @@ def concatenate_rows(row: dict) -> str:
class TelegramChatFileLoader(BaseLoader):
"""Loader that loads Telegram chat json directory dump."""
"""Loads Telegram chat json directory dump."""
def __init__(self, path: str):
"""Initialize with path."""
"""Initialize with a path."""
self.file_path = path
def load(self) -> List[Document]:
@ -79,7 +79,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
class TelegramChatApiLoader(BaseLoader):
"""Loader that loads Telegram chat json directory dump."""
"""Loads Telegram chat json directory dump."""
def __init__(
self,
@ -89,7 +89,16 @@ class TelegramChatApiLoader(BaseLoader):
username: Optional[str] = None,
file_path: str = "telegram_data.json",
):
"""Initialize with API parameters."""
"""Initialize with API parameters.
Args:
chat_entity: The chat entity to fetch data from.
api_id: The API ID.
api_hash: The API hash.
username: The username.
file_path: The file path to save the data to. Defaults to
"telegram_data.json".
"""
self.chat_entity = chat_entity
self.api_id = api_id
self.api_hash = api_hash

View File

@ -1,4 +1,4 @@
"""Loader that loads HTML to markdown using 2markdown."""
"""Loads HTML to markdown using 2markdown."""
from __future__ import annotations
from typing import Iterator, List
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
class ToMarkdownLoader(BaseLoader):
"""Loader that loads HTML to markdown using 2markdown."""
"""Loads HTML to markdown using 2markdown."""
def __init__(self, url: str, api_key: str):
"""Initialize with url and api key."""

View File

@ -1,4 +1,4 @@
"""Loader that loads cards from Trello"""
"""Loads cards from Trello"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple

View File

@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
class WhatsAppChatLoader(BaseLoader):
"""Loader that loads WhatsApp messages text file."""
"""Loads WhatsApp messages text file."""
def __init__(self, path: str):
"""Initialize with path."""

View File

@ -1,4 +1,4 @@
"""Loader that loads word documents."""
"""Loads word documents."""
import os
import tempfile
from abc import ABC

View File

@ -1,4 +1,4 @@
"""Loader that loads Microsoft Excel files."""
"""Loads Microsoft Excel files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (

View File

@ -1,4 +1,4 @@
"""Loader that loads YouTube transcript."""
"""Loads YouTube transcript."""
from __future__ import annotations
import logging
@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
class YoutubeLoader(BaseLoader):
"""Loader that loads Youtube transcripts."""
"""Loads Youtube transcripts."""
def __init__(
self,
@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
@dataclass
class GoogleApiYoutubeLoader(BaseLoader):
"""Loader that loads all Videos from a Channel
"""Loads all Videos from a Channel
To use, you should have the ``googleapiclient,youtube_transcript_api``
python package installed.