`. You can also define your own html tags by passing custom_html_tag, e.g. @@ -31,7 +31,7 @@ class ReadTheDocsLoader(BaseLoader): Args: path: The location of pulled readthedocs folder. encoding: The encoding with which to open the documents. - errors: Specifies how encoding and decoding errors are to be handled—this + errors: Specify how encoding and decoding errors are to be handled—this cannot be used in binary mode. custom_html_tag: Optional custom html tag to retrieve the content from files. diff --git a/langchain/document_loaders/recursive_url_loader.py b/langchain/document_loaders/recursive_url_loader.py index b1a0250d74..7462d85888 100644 --- a/langchain/document_loaders/recursive_url_loader.py +++ b/langchain/document_loaders/recursive_url_loader.py @@ -8,17 +8,27 @@ from langchain.document_loaders.base import BaseLoader class RecursiveUrlLoader(BaseLoader): - """Loader that loads all child links from a given url.""" + """Loads all child links from a given url.""" def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None: - """Initialize with URL to crawl and any sub-directories to exclude.""" + """Initialize with URL to crawl and any subdirectories to exclude. + + Args: + url: The URL to crawl. + exclude_dirs: A list of subdirectories to exclude. + """ self.url = url self.exclude_dirs = exclude_dirs def get_child_links_recursive( self, url: str, visited: Optional[Set[str]] = None ) -> Set[str]: - """Recursively get all child links starting with the path of the input URL.""" + """Recursively get all child links starting with the path of the input URL. + + Args: + url: The URL to crawl. + visited: A set of visited URLs. 
+ """ try: from bs4 import BeautifulSoup @@ -39,7 +49,7 @@ class RecursiveUrlLoader(BaseLoader): if not parent_url.endswith("/"): parent_url += "/" - # Exclude the root and parent from list + # Exclude the root and parent from a list visited = set() if visited is None else visited # Exclude the links that start with any of the excluded directories diff --git a/langchain/document_loaders/reddit.py b/langchain/document_loaders/reddit.py index 80e9fbb599..22fd112db1 100644 --- a/langchain/document_loaders/reddit.py +++ b/langchain/document_loaders/reddit.py @@ -23,7 +23,7 @@ def _dependable_praw_import() -> praw: class RedditPostsLoader(BaseLoader): """Reddit posts loader. Read posts on a subreddit. - First you need to go to + First, you need to go to https://www.reddit.com/prefs/apps/ and create your application """ @@ -38,6 +38,20 @@ class RedditPostsLoader(BaseLoader): categories: Sequence[str] = ["new"], number_posts: Optional[int] = 10, ): + """ + Initialize with client_id, client_secret, user_agent, search_queries, mode, + categories, number_posts. + Example: https://www.reddit.com/r/learnpython/ + + Args: + client_id: Reddit client id. + client_secret: Reddit client secret. + user_agent: Reddit user agent. + search_queries: The search queries. + mode: The mode. + categories: The categories. Default: ["new"] + number_posts: The number of posts. 
Default: 10 + """ self.client_id = client_id self.client_secret = client_secret self.user_agent = user_agent diff --git a/langchain/document_loaders/roam.py b/langchain/document_loaders/roam.py index ff06885764..136bc116d0 100644 --- a/langchain/document_loaders/roam.py +++ b/langchain/document_loaders/roam.py @@ -1,4 +1,4 @@ -"""Loader that loads Roam directory dump.""" +"""Loads Roam directory dump.""" from pathlib import Path from typing import List @@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader class RoamLoader(BaseLoader): - """Loader that loads Roam files from disk.""" + """Loads Roam files from disk.""" def __init__(self, path: str): - """Initialize with path.""" + """Initialize with a path.""" self.file_path = path def load(self) -> List[Document]: diff --git a/langchain/document_loaders/rst.py b/langchain/document_loaders/rst.py index 9b20e7bab4..abff302227 100644 --- a/langchain/document_loaders/rst.py +++ b/langchain/document_loaders/rst.py @@ -1,4 +1,4 @@ -"""Loader that loads RST files.""" +"""Loads RST files.""" from typing import Any, List from langchain.document_loaders.unstructured import ( @@ -13,6 +13,16 @@ class UnstructuredRSTLoader(UnstructuredFileLoader): def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any ): + """ + Initialize with a file path. + + Args: + file_path: The path to the file to load. + mode: The mode to use for partitioning. See unstructured for details. + Defaults to "single". + **unstructured_kwargs: Additional keyword arguments to pass + to unstructured. 
+ """ validate_unstructured_version(min_unstructured_version="0.7.5") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/langchain/document_loaders/rtf.py b/langchain/document_loaders/rtf.py index c4113be206..3536cd3791 100644 --- a/langchain/document_loaders/rtf.py +++ b/langchain/document_loaders/rtf.py @@ -1,4 +1,4 @@ -"""Loader that loads rich text files.""" +"""Loads rich text files.""" from typing import Any, List from langchain.document_loaders.unstructured import ( @@ -13,6 +13,16 @@ class UnstructuredRTFLoader(UnstructuredFileLoader): def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any ): + """ + Initialize with a file path. + + Args: + file_path: The path to the file to load. + mode: The mode to use for partitioning. See unstructured for details. + Defaults to "single". + **unstructured_kwargs: Additional keyword arguments to pass + to unstructured. + """ min_unstructured_version = "0.5.12" if not satisfies_min_unstructured_version(min_unstructured_version): raise ValueError( diff --git a/langchain/document_loaders/s3_directory.py b/langchain/document_loaders/s3_directory.py index 0c842d0523..60085ee904 100644 --- a/langchain/document_loaders/s3_directory.py +++ b/langchain/document_loaders/s3_directory.py @@ -1,4 +1,4 @@ -"""Loading logic for loading documents from an s3 directory.""" +"""Loading logic for loading documents from an AWS S3 directory.""" from typing import List from langchain.docstore.document import Document @@ -7,10 +7,15 @@ from langchain.document_loaders.s3_file import S3FileLoader class S3DirectoryLoader(BaseLoader): - """Loading logic for loading documents from s3.""" + """Loading logic for loading documents from an AWS S3 directory.""" def __init__(self, bucket: str, prefix: str = ""): - """Initialize with bucket and key name.""" + """Initialize with bucket and key name. + + Args: + bucket: The name of the S3 bucket. + prefix: The prefix of the S3 key. Defaults to "". 
+ """ self.bucket = bucket self.prefix = prefix diff --git a/langchain/document_loaders/s3_file.py b/langchain/document_loaders/s3_file.py index 246b0095b5..28195d8fe6 100644 --- a/langchain/document_loaders/s3_file.py +++ b/langchain/document_loaders/s3_file.py @@ -1,4 +1,4 @@ -"""Loading logic for loading documents from an s3 file.""" +"""Loading logic for loading documents from an AWS S3 file.""" import os import tempfile from typing import List @@ -9,10 +9,15 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class S3FileLoader(BaseLoader): - """Loading logic for loading documents from s3.""" + """Loading logic for loading documents from an AWS S3 file.""" def __init__(self, bucket: str, key: str): - """Initialize with bucket and key name.""" + """Initialize with bucket and key name. + + Args: + bucket: The name of the S3 bucket. + key: The key of the S3 object. + """ self.bucket = bucket self.key = key diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py index 64e3707a3d..68fe88eefb 100644 --- a/langchain/document_loaders/sitemap.py +++ b/langchain/document_loaders/sitemap.py @@ -42,11 +42,12 @@ class SitemapLoader(WebBaseLoader): urls that are parsed and loaded parsing_function: Function to parse bs4.Soup output blocksize: number of sitemap locations per block - blocknum: the number of the block that should be loaded - zero indexed + blocknum: the number of the block that should be loaded - zero indexed. + Default: 0 meta_function: Function to parse bs4.Soup output for metadata remember when setting this method to also copy metadata["loc"] to metadata["source"] if you are using this field - is_local: whether the sitemap is a local file + is_local: whether the sitemap is a local file. 
Default: False """ if blocksize is not None and blocksize < 1: @@ -72,7 +73,14 @@ class SitemapLoader(WebBaseLoader): self.is_local = is_local def parse_sitemap(self, soup: Any) -> List[dict]: - """Parse sitemap xml and load into a list of dicts.""" + """Parse sitemap xml and load into a list of dicts. + + Args: + soup: BeautifulSoup object. + + Returns: + List of dicts. + """ els = [] for url in soup.find_all("url"): loc = url.find("loc") diff --git a/langchain/document_loaders/slack_directory.py b/langchain/document_loaders/slack_directory.py index 718367c4d4..16aa5b4fc5 100644 --- a/langchain/document_loaders/slack_directory.py +++ b/langchain/document_loaders/slack_directory.py @@ -9,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader class SlackDirectoryLoader(BaseLoader): - """Loader for loading documents from a Slack directory dump.""" + """Loads documents from a Slack directory dump.""" def __init__(self, zip_path: str, workspace_url: Optional[str] = None): """Initialize the SlackDirectoryLoader. diff --git a/langchain/document_loaders/snowflake_loader.py b/langchain/document_loaders/snowflake_loader.py index 59164124dc..50219ce35b 100644 --- a/langchain/document_loaders/snowflake_loader.py +++ b/langchain/document_loaders/snowflake_loader.py @@ -41,6 +41,7 @@ class SnowflakeLoader(BaseLoader): role: Snowflake role. database: Snowflake database schema: Snowflake schema + parameters: Optional. Parameters to pass to the query. page_content_columns: Optional. Columns written to Document `page_content`. metadata_columns: Optional. Columns written to Document `metadata`. """ @@ -62,7 +63,7 @@ class SnowflakeLoader(BaseLoader): try: import snowflake.connector except ImportError as ex: - raise ValueError( + raise ImportError( "Could not import snowflake-connector-python package. " "Please install it with `pip install snowflake-connector-python`." 
) from ex diff --git a/langchain/document_loaders/spreedly.py b/langchain/document_loaders/spreedly.py index b471341e79..2ec0cfc4c0 100644 --- a/langchain/document_loaders/spreedly.py +++ b/langchain/document_loaders/spreedly.py @@ -23,6 +23,12 @@ class SpreedlyLoader(BaseLoader): """Loader that fetches data from Spreedly API.""" def __init__(self, access_token: str, resource: str) -> None: + """Initialize with an access token and a resource. + + Args: + access_token: The access token. + resource: The resource. + """ self.access_token = access_token self.resource = resource self.headers = { diff --git a/langchain/document_loaders/srt.py b/langchain/document_loaders/srt.py index ee26d3230a..c6114beba9 100644 --- a/langchain/document_loaders/srt.py +++ b/langchain/document_loaders/srt.py @@ -9,7 +9,7 @@ class SRTLoader(BaseLoader): """Loader for .srt (subtitle) files.""" def __init__(self, file_path: str): - """Initialize with file path.""" + """Initialize with a file path.""" try: import pysrt # noqa:F401 except ImportError: diff --git a/langchain/document_loaders/stripe.py b/langchain/document_loaders/stripe.py index efc55824f6..41f978d194 100644 --- a/langchain/document_loaders/stripe.py +++ b/langchain/document_loaders/stripe.py @@ -21,6 +21,12 @@ class StripeLoader(BaseLoader): """Loader that fetches data from Stripe.""" def __init__(self, resource: str, access_token: Optional[str] = None) -> None: + """Initialize with a resource and an access token. + + Args: + resource: The resource. + access_token: The access token. 
+ """ self.resource = resource access_token = access_token or get_from_env( "access_token", "STRIPE_ACCESS_TOKEN" diff --git a/langchain/document_loaders/telegram.py b/langchain/document_loaders/telegram.py index 3e4bf0e5ea..88225ecc44 100644 --- a/langchain/document_loaders/telegram.py +++ b/langchain/document_loaders/telegram.py @@ -1,4 +1,4 @@ -"""Loader that loads Telegram chat json dump.""" +"""Loads Telegram chat json dump.""" from __future__ import annotations import asyncio @@ -24,10 +24,10 @@ def concatenate_rows(row: dict) -> str: class TelegramChatFileLoader(BaseLoader): - """Loader that loads Telegram chat json directory dump.""" + """Loads Telegram chat json directory dump.""" def __init__(self, path: str): - """Initialize with path.""" + """Initialize with a path.""" self.file_path = path def load(self) -> List[Document]: @@ -79,7 +79,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]: class TelegramChatApiLoader(BaseLoader): - """Loader that loads Telegram chat json directory dump.""" + """Loads Telegram chat json directory dump.""" def __init__( self, @@ -89,7 +89,16 @@ class TelegramChatApiLoader(BaseLoader): username: Optional[str] = None, file_path: str = "telegram_data.json", ): - """Initialize with API parameters.""" + """Initialize with API parameters. + + Args: + chat_entity: The chat entity to fetch data from. + api_id: The API ID. + api_hash: The API hash. + username: The username. + file_path: The file path to save the data to. Defaults to + "telegram_data.json". 
+ """ self.chat_entity = chat_entity self.api_id = api_id self.api_hash = api_hash diff --git a/langchain/document_loaders/tomarkdown.py b/langchain/document_loaders/tomarkdown.py index a3fbf6f792..00ba512dd2 100644 --- a/langchain/document_loaders/tomarkdown.py +++ b/langchain/document_loaders/tomarkdown.py @@ -1,4 +1,4 @@ -"""Loader that loads HTML to markdown using 2markdown.""" +"""Loads HTML to markdown using 2markdown.""" from __future__ import annotations from typing import Iterator, List @@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader class ToMarkdownLoader(BaseLoader): - """Loader that loads HTML to markdown using 2markdown.""" + """Loads HTML to markdown using 2markdown.""" def __init__(self, url: str, api_key: str): """Initialize with url and api key.""" diff --git a/langchain/document_loaders/trello.py b/langchain/document_loaders/trello.py index 5c24358661..11a59a4f5c 100644 --- a/langchain/document_loaders/trello.py +++ b/langchain/document_loaders/trello.py @@ -1,4 +1,4 @@ -"""Loader that loads cards from Trello""" +"""Loads cards from Trello""" from __future__ import annotations from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index b3022a555b..cad93ac8d0 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str: class WhatsAppChatLoader(BaseLoader): - """Loader that loads WhatsApp messages text file.""" + """Loads WhatsApp messages text file.""" def __init__(self, path: str): """Initialize with path.""" diff --git a/langchain/document_loaders/word_document.py b/langchain/document_loaders/word_document.py index f0272b2efe..3ee91ab741 100644 --- a/langchain/document_loaders/word_document.py +++ b/langchain/document_loaders/word_document.py @@ -1,4 +1,4 @@ -"""Loader that loads 
word documents.""" +"""Loads word documents.""" import os import tempfile from abc import ABC diff --git a/langchain/document_loaders/xml.py b/langchain/document_loaders/xml.py index 78156ee205..4239a49396 100644 --- a/langchain/document_loaders/xml.py +++ b/langchain/document_loaders/xml.py @@ -1,4 +1,4 @@ -"""Loader that loads Microsoft Excel files.""" +"""Loads XML files.""" from typing import Any, List from langchain.document_loaders.unstructured import ( diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py index b828c3b0b7..86d7c42a8e 100644 --- a/langchain/document_loaders/youtube.py +++ b/langchain/document_loaders/youtube.py @@ -1,4 +1,4 @@ -"""Loader that loads YouTube transcript.""" +"""Loads YouTube transcript.""" from __future__ import annotations import logging @@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]: class YoutubeLoader(BaseLoader): - """Loader that loads Youtube transcripts.""" + """Loads YouTube transcripts.""" def __init__( self, @@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader): @dataclass class GoogleApiYoutubeLoader(BaseLoader): - """Loader that loads all Videos from a Channel + """Loads all Videos from a Channel To use, you should have the ``googleapiclient,youtube_transcript_api`` python package installed.