From 19f504790ea9e8df4f6613003379d5099c915115 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Fri, 11 Aug 2023 16:25:40 -0700 Subject: [PATCH] docstrings: document_loaders consitency 2 (#9148) This is Part 2. See #9139 (Part 1). --- .../document_loaders/hugging_face_dataset.py | 3 +-- .../langchain/document_loaders/ifixit.py | 3 +-- .../langchain/document_loaders/image.py | 3 +-- .../document_loaders/image_captions.py | 13 +++++----- .../langchain/document_loaders/imsdb.py | 3 +-- .../langchain/document_loaders/iugu.py | 3 +-- .../langchain/document_loaders/joplin.py | 3 +-- .../langchain/document_loaders/json_loader.py | 3 +-- .../langchain/document_loaders/larksuite.py | 3 +-- .../langchain/document_loaders/markdown.py | 3 +-- .../langchain/document_loaders/mastodon.py | 3 +-- .../langchain/document_loaders/max_compute.py | 2 +- .../document_loaders/mediawikidump.py | 5 ++-- .../langchain/document_loaders/mhtml.py | 4 +-- .../document_loaders/modern_treasury.py | 3 +-- .../langchain/document_loaders/news.py | 2 +- .../langchain/document_loaders/notebook.py | 4 +-- .../langchain/document_loaders/notion.py | 3 +-- .../langchain/document_loaders/notiondb.py | 4 +-- .../langchain/document_loaders/nuclia.py | 3 +-- .../document_loaders/obs_directory.py | 2 +- .../langchain/document_loaders/obs_file.py | 2 +- .../langchain/document_loaders/obsidian.py | 3 +-- .../langchain/document_loaders/odt.py | 4 +-- .../langchain/document_loaders/onedrive.py | 2 +- .../document_loaders/onedrive_file.py | 2 +- .../document_loaders/open_city_data.py | 2 +- .../langchain/document_loaders/org_mode.py | 4 +-- .../langchain/document_loaders/pdf.py | 26 +++++++++---------- .../langchain/document_loaders/powerpoint.py | 4 +-- .../langchain/document_loaders/psychic.py | 3 +-- .../langchain/document_loaders/pubmed.py | 2 +- .../document_loaders/pyspark_dataframe.py | 3 +-- .../langchain/document_loaders/python.py | 4 +-- .../langchain/document_loaders/readthedocs.py | 3 +-- 
.../document_loaders/recursive_url_loader.py | 2 +- .../langchain/document_loaders/reddit.py | 4 +-- .../langchain/document_loaders/roam.py | 3 +-- .../langchain/document_loaders/rocksetdb.py | 2 +- .../langchain/document_loaders/rss.py | 3 +-- .../langchain/document_loaders/rst.py | 3 ++- .../langchain/document_loaders/rtf.py | 3 ++- .../document_loaders/s3_directory.py | 3 +-- .../langchain/document_loaders/s3_file.py | 3 +-- .../langchain/document_loaders/sitemap.py | 3 +-- .../document_loaders/slack_directory.py | 3 +-- .../document_loaders/snowflake_loader.py | 2 +- .../langchain/document_loaders/spreedly.py | 3 +-- .../langchain/document_loaders/srt.py | 3 +-- .../langchain/document_loaders/stripe.py | 3 +-- .../langchain/document_loaders/telegram.py | 3 +-- .../document_loaders/tencent_cos_directory.py | 3 +-- .../document_loaders/tencent_cos_file.py | 3 +-- .../document_loaders/tensorflow_datasets.py | 2 +- .../langchain/document_loaders/text.py | 2 +- .../langchain/document_loaders/tomarkdown.py | 3 +-- .../langchain/document_loaders/toml.py | 7 +++-- .../langchain/document_loaders/trello.py | 3 +-- .../langchain/document_loaders/tsv.py | 4 ++- .../langchain/document_loaders/twitter.py | 6 ++--- .../document_loaders/unstructured.py | 8 +++--- .../langchain/document_loaders/url.py | 3 ++- .../document_loaders/url_playwright.py | 3 ++- .../document_loaders/url_selenium.py | 3 ++- .../langchain/document_loaders/weather.py | 2 +- .../langchain/document_loaders/web_base.py | 2 +- .../document_loaders/whatsapp_chat.py | 2 +- .../langchain/document_loaders/wikipedia.py | 3 ++- .../document_loaders/word_document.py | 5 ++-- .../langchain/document_loaders/xml.py | 3 ++- .../langchain/document_loaders/xorbits.py | 2 +- .../langchain/document_loaders/youtube.py | 4 +-- 72 files changed, 114 insertions(+), 144 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/hugging_face_dataset.py b/libs/langchain/langchain/document_loaders/hugging_face_dataset.py 
index 17b823dd21..a253e72024 100644 --- a/libs/langchain/langchain/document_loaders/hugging_face_dataset.py +++ b/libs/langchain/langchain/document_loaders/hugging_face_dataset.py @@ -1,4 +1,3 @@ -"""Loads HuggingFace datasets.""" from typing import Iterator, List, Mapping, Optional, Sequence, Union from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class HuggingFaceDatasetLoader(BaseLoader): - """Load Documents from the Hugging Face Hub.""" + """Load from `Hugging Face Hub` datasets.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/ifixit.py b/libs/langchain/langchain/document_loaders/ifixit.py index 1669dace99..3507334c56 100644 --- a/libs/langchain/langchain/document_loaders/ifixit.py +++ b/libs/langchain/langchain/document_loaders/ifixit.py @@ -1,4 +1,3 @@ -"""Loads iFixit data.""" from typing import List, Optional import requests @@ -11,7 +10,7 @@ IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0" class IFixitLoader(BaseLoader): - """Load iFixit repair guides, device wikis and answers. + """Load `iFixit` repair guides, device wikis and answers. iFixit is the largest, open repair community on the web. The site contains nearly 100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is diff --git a/libs/langchain/langchain/document_loaders/image.py b/libs/langchain/langchain/document_loaders/image.py index 9a31bd00c6..4d854676d5 100644 --- a/libs/langchain/langchain/document_loaders/image.py +++ b/libs/langchain/langchain/document_loaders/image.py @@ -1,11 +1,10 @@ -"""Loads image files.""" from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredImageLoader(UnstructuredFileLoader): - """Loader that uses Unstructured to load PNG and JPG files. + """Load `PNG` and `JPG` files using `Unstructured`. You can run the loader in one of two modes: "single" and "elements". 
If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/image_captions.py b/libs/langchain/langchain/document_loaders/image_captions.py index 53e3c694e2..40d3de9379 100644 --- a/libs/langchain/langchain/document_loaders/image_captions.py +++ b/libs/langchain/langchain/document_loaders/image_captions.py @@ -1,9 +1,3 @@ -"""Loads image captions. - -By default, the loader utilizes the pre-trained BLIP image captioning model. -https://huggingface.co/Salesforce/blip-image-captioning-base - -""" from typing import Any, List, Tuple, Union import requests @@ -13,7 +7,12 @@ from langchain.document_loaders.base import BaseLoader class ImageCaptionLoader(BaseLoader): - """Loads the captions of an image""" + """Load image captions. + + By default, the loader utilizes the pre-trained + Salesforce BLIP image captioning model. + https://huggingface.co/Salesforce/blip-image-captioning-base + """ def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/imsdb.py b/libs/langchain/langchain/document_loaders/imsdb.py index 312e25a850..acf0e1b59b 100644 --- a/libs/langchain/langchain/document_loaders/imsdb.py +++ b/libs/langchain/langchain/document_loaders/imsdb.py @@ -1,4 +1,3 @@ -"""Loads IMSDb.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class IMSDbLoader(WebBaseLoader): - """Loads IMSDb webpages.""" + """Load `IMSDb` webpages.""" def load(self) -> List[Document]: """Load webpage.""" diff --git a/libs/langchain/langchain/document_loaders/iugu.py b/libs/langchain/langchain/document_loaders/iugu.py index 229cf6f629..c8ee3c3511 100644 --- a/libs/langchain/langchain/document_loaders/iugu.py +++ b/libs/langchain/langchain/document_loaders/iugu.py @@ -1,4 +1,3 @@ -"""Loader that fetches data from IUGU""" import json import urllib.request from typing import List, Optional @@ -17,7 +16,7 @@ 
IUGU_ENDPOINTS = { class IuguLoader(BaseLoader): - """Loader that fetches data from IUGU.""" + """Load from `IUGU`.""" def __init__(self, resource: str, api_token: Optional[str] = None) -> None: """Initialize the IUGU resource. diff --git a/libs/langchain/langchain/document_loaders/joplin.py b/libs/langchain/langchain/document_loaders/joplin.py index 0cce974622..62efe62fd9 100644 --- a/libs/langchain/langchain/document_loaders/joplin.py +++ b/libs/langchain/langchain/document_loaders/joplin.py @@ -11,8 +11,7 @@ LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}" class JoplinLoader(BaseLoader): - """ - Loader that fetches notes from Joplin. + """Load notes from `Joplin`. In order to use this loader, you need to have Joplin running with the Web Clipper enabled (look for "Web Clipper" in the app settings). diff --git a/libs/langchain/langchain/document_loaders/json_loader.py b/libs/langchain/langchain/document_loaders/json_loader.py index 9e793798e3..e13ae1bb89 100644 --- a/libs/langchain/langchain/document_loaders/json_loader.py +++ b/libs/langchain/langchain/document_loaders/json_loader.py @@ -1,4 +1,3 @@ -"""Loads data from JSON.""" import json from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union @@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class JSONLoader(BaseLoader): - """Loads a JSON file using a jq schema. + """Load a `JSON` file using a `jq` schema. 
Example: [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text diff --git a/libs/langchain/langchain/document_loaders/larksuite.py b/libs/langchain/langchain/document_loaders/larksuite.py index d57c10b898..1b94ca0175 100644 --- a/libs/langchain/langchain/document_loaders/larksuite.py +++ b/libs/langchain/langchain/document_loaders/larksuite.py @@ -1,4 +1,3 @@ -"""Loads LarkSuite (FeiShu) document json dump.""" import json import urllib.request from typing import Any, Iterator, List @@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class LarkSuiteDocLoader(BaseLoader): - """Loads LarkSuite (FeiShu) document.""" + """Load from `LarkSuite` (`FeiShu`).""" def __init__(self, domain: str, access_token: str, document_id: str): """Initialize with domain, access_token (tenant / user), and document_id. diff --git a/libs/langchain/langchain/document_loaders/markdown.py b/libs/langchain/langchain/document_loaders/markdown.py index 820b5b53de..8b3665a496 100644 --- a/libs/langchain/langchain/document_loaders/markdown.py +++ b/libs/langchain/langchain/document_loaders/markdown.py @@ -1,11 +1,10 @@ -"""Loads Markdown files.""" from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredMarkdownLoader(UnstructuredFileLoader): - """Loader that uses Unstructured to load markdown files. + """Load `Markdown` files using `Unstructured`. You can run the loader in one of two modes: "single" and "elements". 
If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/mastodon.py b/libs/langchain/langchain/document_loaders/mastodon.py index ae353790cd..3ed70e0de1 100644 --- a/libs/langchain/langchain/document_loaders/mastodon.py +++ b/libs/langchain/langchain/document_loaders/mastodon.py @@ -1,4 +1,3 @@ -"""Mastodon document loader.""" from __future__ import annotations import os @@ -23,7 +22,7 @@ def _dependable_mastodon_import() -> mastodon: class MastodonTootsLoader(BaseLoader): - """Mastodon toots loader.""" + """Load the `Mastodon` 'toots'.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/max_compute.py b/libs/langchain/langchain/document_loaders/max_compute.py index ee3c64ae79..d75439634b 100644 --- a/libs/langchain/langchain/document_loaders/max_compute.py +++ b/libs/langchain/langchain/document_loaders/max_compute.py @@ -8,7 +8,7 @@ from langchain.utilities.max_compute import MaxComputeAPIWrapper class MaxComputeLoader(BaseLoader): - """Loads a query result from Alibaba Cloud MaxCompute table into documents.""" + """Load from `Alibaba Cloud MaxCompute` table.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/mediawikidump.py b/libs/langchain/langchain/document_loaders/mediawikidump.py index 370289f128..d4f7815cec 100644 --- a/libs/langchain/langchain/document_loaders/mediawikidump.py +++ b/libs/langchain/langchain/document_loaders/mediawikidump.py @@ -1,4 +1,3 @@ -"""Load Data from a MediaWiki dump xml.""" import logging from pathlib import Path from typing import List, Optional, Sequence, Union @@ -10,8 +9,8 @@ logger = logging.getLogger(__name__) class MWDumpLoader(BaseLoader): - """ - Load MediaWiki dump from XML file + """Load `MediaWiki` dump from an `XML` file. + Example: .. 
code-block:: python diff --git a/libs/langchain/langchain/document_loaders/mhtml.py b/libs/langchain/langchain/document_loaders/mhtml.py index 4def89a2dd..c7b7f55ec9 100644 --- a/libs/langchain/langchain/document_loaders/mhtml.py +++ b/libs/langchain/langchain/document_loaders/mhtml.py @@ -1,5 +1,3 @@ -"""Load MHTML files, enriching metadata with page title.""" - import email import logging from typing import Dict, List, Union @@ -11,7 +9,7 @@ logger = logging.getLogger(__name__) class MHTMLLoader(BaseLoader): - """Loader that uses beautiful soup to parse HTML files.""" + """Parse `MHTML` files with `BeautifulSoup`.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/modern_treasury.py b/libs/langchain/langchain/document_loaders/modern_treasury.py index 2d12d6bd8e..21607ba1c8 100644 --- a/libs/langchain/langchain/document_loaders/modern_treasury.py +++ b/libs/langchain/langchain/document_loaders/modern_treasury.py @@ -1,4 +1,3 @@ -"""Loader that fetches data from Modern Treasury""" import json import urllib.request from base64 import b64encode @@ -27,7 +26,7 @@ incoming_payment_details", class ModernTreasuryLoader(BaseLoader): - """Loader that fetches data from Modern Treasury.""" + """Load from `Modern Treasury`.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/news.py b/libs/langchain/langchain/document_loaders/news.py index 7e4931f94b..2709a8cd10 100644 --- a/libs/langchain/langchain/document_loaders/news.py +++ b/libs/langchain/langchain/document_loaders/news.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) class NewsURLLoader(BaseLoader): - """Loader that uses newspaper to load news articles from URLs. + """Load news articles from URLs using `newspaper`. Args: urls: URLs to load. Each is loaded into its own document. 
diff --git a/libs/langchain/langchain/document_loaders/notebook.py b/libs/langchain/langchain/document_loaders/notebook.py index 19522c2a5c..e9f84666b3 100644 --- a/libs/langchain/langchain/document_loaders/notebook.py +++ b/libs/langchain/langchain/document_loaders/notebook.py @@ -70,7 +70,7 @@ def remove_newlines(x: Any) -> Any: class NotebookLoader(BaseLoader): - """Loads .ipynb notebook files.""" + """Load `Jupyter notebook` (.ipynb) files.""" def __init__( self, @@ -80,7 +80,7 @@ class NotebookLoader(BaseLoader): remove_newline: bool = False, traceback: bool = False, ): - """Initialize with path. + """Initialize with a path. Args: path: The path to load the notebook from. diff --git a/libs/langchain/langchain/document_loaders/notion.py b/libs/langchain/langchain/document_loaders/notion.py index 4801052558..15678ace9e 100644 --- a/libs/langchain/langchain/document_loaders/notion.py +++ b/libs/langchain/langchain/document_loaders/notion.py @@ -1,4 +1,3 @@ -"""Loads Notion directory dump.""" from pathlib import Path from typing import List @@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class NotionDirectoryLoader(BaseLoader): - """Loads Notion directory dump.""" + """Load `Notion directory` dump.""" def __init__(self, path: str): """Initialize with a file path.""" diff --git a/libs/langchain/langchain/document_loaders/notiondb.py b/libs/langchain/langchain/document_loaders/notiondb.py index 16915fba0e..1f7a67a2a5 100644 --- a/libs/langchain/langchain/document_loaders/notiondb.py +++ b/libs/langchain/langchain/document_loaders/notiondb.py @@ -1,5 +1,3 @@ -"""Notion DB loader for langchain""" - from typing import Any, Dict, List, Optional import requests @@ -14,7 +12,7 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children" class NotionDBLoader(BaseLoader): - """Notion DB Loader. + """Load from `Notion DB`. Reads content from pages within a Notion Database. 
Args: diff --git a/libs/langchain/langchain/document_loaders/nuclia.py b/libs/langchain/langchain/document_loaders/nuclia.py index 097564ee55..218c11f633 100644 --- a/libs/langchain/langchain/document_loaders/nuclia.py +++ b/libs/langchain/langchain/document_loaders/nuclia.py @@ -1,4 +1,3 @@ -"""Extract text from any file type.""" import json import uuid from typing import List @@ -9,7 +8,7 @@ from langchain.tools.nuclia.tool import NucliaUnderstandingAPI class NucliaLoader(BaseLoader): - """Extract text from any file type.""" + """Load from any file type using `Nuclia Understanding API`.""" def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI): self.nua = nuclia_tool diff --git a/libs/langchain/langchain/document_loaders/obs_directory.py b/libs/langchain/langchain/document_loaders/obs_directory.py index ba30d6a578..b7af3e330b 100644 --- a/libs/langchain/langchain/document_loaders/obs_directory.py +++ b/libs/langchain/langchain/document_loaders/obs_directory.py @@ -7,7 +7,7 @@ from langchain.document_loaders.obs_file import OBSFileLoader class OBSDirectoryLoader(BaseLoader): - """Loading logic for loading documents from Huawei OBS.""" + """Load from `Huawei OBS directory`.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/obs_file.py b/libs/langchain/langchain/document_loaders/obs_file.py index b366f4e0e4..69a63808e4 100644 --- a/libs/langchain/langchain/document_loaders/obs_file.py +++ b/libs/langchain/langchain/document_loaders/obs_file.py @@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class OBSFileLoader(BaseLoader): - """Loader for Huawei OBS file.""" + """Load from the `Huawei OBS file`.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/obsidian.py b/libs/langchain/langchain/document_loaders/obsidian.py index fd43b07543..ab3a3b188a 100644 --- a/libs/langchain/langchain/document_loaders/obsidian.py +++ 
b/libs/langchain/langchain/document_loaders/obsidian.py @@ -1,4 +1,3 @@ -"""Loads Obsidian directory dump.""" import re from pathlib import Path from typing import List @@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class ObsidianLoader(BaseLoader): - """Loads Obsidian files from disk.""" + """Load `Obsidian` files from directory.""" FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL) diff --git a/libs/langchain/langchain/document_loaders/odt.py b/libs/langchain/langchain/document_loaders/odt.py index a8641b6562..e8e740fc7b 100644 --- a/libs/langchain/langchain/document_loaders/odt.py +++ b/libs/langchain/langchain/document_loaders/odt.py @@ -1,4 +1,3 @@ -"""Loads OpenOffice ODT files.""" from typing import Any, List from langchain.document_loaders.unstructured import ( @@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import ( class UnstructuredODTLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load OpenOffice ODT files. + """Load `OpenOffice ODT` files using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. 
If you use "elements" mode, the unstructured diff --git a/libs/langchain/langchain/document_loaders/onedrive.py b/libs/langchain/langchain/document_loaders/onedrive.py index 77f75e2c07..2da8a1c4da 100644 --- a/libs/langchain/langchain/document_loaders/onedrive.py +++ b/libs/langchain/langchain/document_loaders/onedrive.py @@ -60,7 +60,7 @@ class _SupportedFileTypes(BaseModel): class OneDriveLoader(BaseLoader, BaseModel): - """Loads data from OneDrive.""" + """Load from `Microsoft OneDrive`.""" settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings) """ The settings for the OneDrive API client.""" diff --git a/libs/langchain/langchain/document_loaders/onedrive_file.py b/libs/langchain/langchain/document_loaders/onedrive_file.py index c83a216196..afebe2a4b0 100644 --- a/libs/langchain/langchain/document_loaders/onedrive_file.py +++ b/libs/langchain/langchain/document_loaders/onedrive_file.py @@ -16,7 +16,7 @@ CHUNK_SIZE = 1024 * 1024 * 5 class OneDriveFileLoader(BaseLoader, BaseModel): - """Loads a file from OneDrive.""" + """Load a file from `Microsoft OneDrive`.""" file: File = Field(...) """The file to load.""" diff --git a/libs/langchain/langchain/document_loaders/open_city_data.py b/libs/langchain/langchain/document_loaders/open_city_data.py index 03801d7995..60384cf47c 100644 --- a/libs/langchain/langchain/document_loaders/open_city_data.py +++ b/libs/langchain/langchain/document_loaders/open_city_data.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class OpenCityDataLoader(BaseLoader): - """Loads Open City data.""" + """Load from `Open City`.""" def __init__(self, city_id: str, dataset_id: str, limit: int): """Initialize with dataset_id. 
diff --git a/libs/langchain/langchain/document_loaders/org_mode.py b/libs/langchain/langchain/document_loaders/org_mode.py index dbb38411fd..35617675c7 100644 --- a/libs/langchain/langchain/document_loaders/org_mode.py +++ b/libs/langchain/langchain/document_loaders/org_mode.py @@ -1,4 +1,3 @@ -"""Loads Org-Mode files.""" from typing import Any, List from langchain.document_loaders.unstructured import ( @@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import ( class UnstructuredOrgModeLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load Org-Mode files. + """Load `Org-Mode` files using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. If you use "elements" mode, the unstructured diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index d8eba3d981..b671d90ebc 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -1,4 +1,3 @@ -"""Loads PDF files.""" import json import logging import os @@ -30,7 +29,8 @@ logger = logging.getLogger(__file__) class UnstructuredPDFLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load PDF files. + """Load `PDF` files using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. If you use "elements" mode, the unstructured @@ -59,7 +59,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): class BasePDFLoader(BaseLoader, ABC): - """Base loader class for PDF files. + """Base Loader class for `PDF` files. 
Defaults to check for local file, but if the file is a web path, it will download it to a temporary file, use it, then clean up the temporary file after completion @@ -122,7 +122,7 @@ class BasePDFLoader(BaseLoader, ABC): class OnlinePDFLoader(BasePDFLoader): - """Loads online PDFs.""" + """Load online `PDF`.""" def load(self) -> List[Document]: """Load documents.""" @@ -131,7 +131,7 @@ class OnlinePDFLoader(BasePDFLoader): class PyPDFLoader(BasePDFLoader): - """Loads a PDF with pypdf and chunks at character level. + """Load `PDF` using `pypdf` and chunks at character level. Loader also stores page numbers in metadata. """ @@ -162,7 +162,7 @@ class PyPDFLoader(BasePDFLoader): class PyPDFium2Loader(BasePDFLoader): - """Loads a PDF with pypdfium2 and chunks at character level.""" + """Load `PDF` using `pypdfium2` and chunks at character level.""" def __init__(self, file_path: str): """Initialize with a file path.""" @@ -182,7 +182,7 @@ class PyPDFium2Loader(BasePDFLoader): class PyPDFDirectoryLoader(BaseLoader): - """Loads a directory with PDF files with pypdf and chunks at character level. + """Load a directory with `PDF` files using `pypdf` and chunks at character level. Loader also stores page numbers in metadata. 
""" @@ -227,7 +227,7 @@ class PyPDFDirectoryLoader(BaseLoader): class PDFMinerLoader(BasePDFLoader): - """Loader that uses PDFMiner to load PDF files.""" + """Load `PDF` files using `PDFMiner`.""" def __init__(self, file_path: str) -> None: """Initialize with file path.""" @@ -255,7 +255,7 @@ class PDFMinerLoader(BasePDFLoader): class PDFMinerPDFasHTMLLoader(BasePDFLoader): - """Loader that uses PDFMiner to load PDF files as HTML content.""" + """Load `PDF` files as HTML content using `PDFMiner`.""" def __init__(self, file_path: str): """Initialize with a file path.""" @@ -289,7 +289,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PyMuPDFLoader(BasePDFLoader): - """Loader that uses PyMuPDF to load PDF files.""" + """Load `PDF` files using `PyMuPDF`.""" def __init__(self, file_path: str) -> None: """Initialize with a file path.""" @@ -314,7 +314,7 @@ class PyMuPDFLoader(BasePDFLoader): # MathpixPDFLoader implementation taken largely from Daniel Gross's: # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21 class MathpixPDFLoader(BasePDFLoader): - """This class uses Mathpix service to load PDF files.""" + """Load `PDF` files using `Mathpix` service.""" def __init__( self, @@ -433,7 +433,7 @@ class MathpixPDFLoader(BasePDFLoader): class PDFPlumberLoader(BasePDFLoader): - """Loader that uses pdfplumber to load PDF files.""" + """Load `PDF` files using `pdfplumber`.""" def __init__( self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None @@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader): class AmazonTextractPDFLoader(BasePDFLoader): - """Loads a PDF document from local file system, HTTP or S3. + """Load `PDF` files from a local file system, HTTP or S3. 
To authenticate, the AWS client uses the following methods to automatically load credentials: diff --git a/libs/langchain/langchain/document_loaders/powerpoint.py b/libs/langchain/langchain/document_loaders/powerpoint.py index d9c3e7f84d..f762a56cc0 100644 --- a/libs/langchain/langchain/document_loaders/powerpoint.py +++ b/libs/langchain/langchain/document_loaders/powerpoint.py @@ -1,4 +1,3 @@ -"""Loads PowerPoint files.""" import os from typing import List @@ -6,7 +5,8 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredPowerPointLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load PowerPoint files. + """Load `Microsoft PowerPoint` files using `Unstructured`. + Works with both .ppt and .pptx files. You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/psychic.py b/libs/langchain/langchain/document_loaders/psychic.py index d1adee8e3c..88db3cdd05 100644 --- a/libs/langchain/langchain/document_loaders/psychic.py +++ b/libs/langchain/langchain/document_loaders/psychic.py @@ -1,4 +1,3 @@ -"""Loads documents from Psychic.dev.""" from typing import List, Optional from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class PsychicLoader(BaseLoader): - """Loads documents from Psychic.dev.""" + """Load from `Psychic.dev`.""" def __init__( self, api_key: str, account_id: str, connector_id: Optional[str] = None diff --git a/libs/langchain/langchain/document_loaders/pubmed.py b/libs/langchain/langchain/document_loaders/pubmed.py index 4e354acfaa..71d9890731 100644 --- a/libs/langchain/langchain/document_loaders/pubmed.py +++ b/libs/langchain/langchain/document_loaders/pubmed.py @@ -6,7 +6,7 @@ from langchain.utilities.pubmed import PubMedAPIWrapper class PubMedLoader(BaseLoader): - """Loads a query result from 
PubMed biomedical library into a list of Documents. + """Load from the `PubMed` biomedical library. Attributes: query: The query to be passed to the PubMed API. diff --git a/libs/langchain/langchain/document_loaders/pyspark_dataframe.py b/libs/langchain/langchain/document_loaders/pyspark_dataframe.py index 490926fb10..cee1e65492 100644 --- a/libs/langchain/langchain/document_loaders/pyspark_dataframe.py +++ b/libs/langchain/langchain/document_loaders/pyspark_dataframe.py @@ -1,4 +1,3 @@ -"""Load from a Spark Dataframe object""" import itertools import logging import sys @@ -14,7 +13,7 @@ if TYPE_CHECKING: class PySparkDataFrameLoader(BaseLoader): - """Load PySpark DataFrames""" + """Load `PySpark` DataFrames.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/python.py b/libs/langchain/langchain/document_loaders/python.py index e8e238afc6..af970ee1f2 100644 --- a/libs/langchain/langchain/document_loaders/python.py +++ b/libs/langchain/langchain/document_loaders/python.py @@ -4,9 +4,7 @@ from langchain.document_loaders.text import TextLoader class PythonLoader(TextLoader): - """ - Load Python files, respecting any non-default encoding if specified. - """ + """Load `Python` files, respecting any non-default encoding if specified.""" def __init__(self, file_path: str): """Initialize with a file path. 
diff --git a/libs/langchain/langchain/document_loaders/readthedocs.py b/libs/langchain/langchain/document_loaders/readthedocs.py index 219364bd7d..a123f6a72e 100644 --- a/libs/langchain/langchain/document_loaders/readthedocs.py +++ b/libs/langchain/langchain/document_loaders/readthedocs.py @@ -1,4 +1,3 @@ -"""Loads ReadTheDocs documentation directory dump.""" from pathlib import Path from typing import Any, List, Optional, Tuple, Union @@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class ReadTheDocsLoader(BaseLoader): - """Loads ReadTheDocs documentation directory dump.""" + """Load `ReadTheDocs` documentation directory.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py index 17cbe8ce8f..61b9c7032e 100644 --- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py +++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py @@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader class RecursiveUrlLoader(BaseLoader): - """Loads all child links from a given url.""" + """Load all child links from a URL page.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/reddit.py b/libs/langchain/langchain/document_loaders/reddit.py index 44fa4bb811..544624a84c 100644 --- a/libs/langchain/langchain/document_loaders/reddit.py +++ b/libs/langchain/langchain/document_loaders/reddit.py @@ -1,4 +1,3 @@ -"""Reddit document loader.""" from __future__ import annotations from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence @@ -21,7 +20,8 @@ def _dependable_praw_import() -> praw: class RedditPostsLoader(BaseLoader): - """Reddit posts loader. + """Load `Reddit` posts. + Read posts on a subreddit. 
First, you need to go to https://www.reddit.com/prefs/apps/ diff --git a/libs/langchain/langchain/document_loaders/roam.py b/libs/langchain/langchain/document_loaders/roam.py index 136bc116d0..df2e4882f4 100644 --- a/libs/langchain/langchain/document_loaders/roam.py +++ b/libs/langchain/langchain/document_loaders/roam.py @@ -1,4 +1,3 @@ -"""Loads Roam directory dump.""" from pathlib import Path from typing import List @@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class RoamLoader(BaseLoader): - """Loads Roam files from disk.""" + """Load `Roam` files from a directory.""" def __init__(self, path: str): """Initialize with a path.""" diff --git a/libs/langchain/langchain/document_loaders/rocksetdb.py b/libs/langchain/langchain/document_loaders/rocksetdb.py index 4355c0a74e..b39775af02 100644 --- a/libs/langchain/langchain/document_loaders/rocksetdb.py +++ b/libs/langchain/langchain/document_loaders/rocksetdb.py @@ -17,7 +17,7 @@ class ColumnNotFoundError(Exception): class RocksetLoader(BaseLoader): - """Wrapper around Rockset db + """Load from a `Rockset` database. To use, you should have the `rockset` python package installed. diff --git a/libs/langchain/langchain/document_loaders/rss.py b/libs/langchain/langchain/document_loaders/rss.py index 870849d0ff..750b8bcccb 100644 --- a/libs/langchain/langchain/document_loaders/rss.py +++ b/libs/langchain/langchain/document_loaders/rss.py @@ -1,4 +1,3 @@ -"""Loader that uses unstructured to load HTML files.""" import logging from typing import Any, Iterator, List, Optional, Sequence @@ -10,7 +9,7 @@ logger = logging.getLogger(__name__) class RSSFeedLoader(BaseLoader): - """Loader that uses newspaper to load news articles from RSS feeds. + """Load news articles from `RSS` feeds using `newspaper`. Args: urls: URLs for RSS feeds to load. Each articles in the feed is loaded into its own document. 
diff --git a/libs/langchain/langchain/document_loaders/rst.py b/libs/langchain/langchain/document_loaders/rst.py index a0c0095ec5..3da16849f5 100644 --- a/libs/langchain/langchain/document_loaders/rst.py +++ b/libs/langchain/langchain/document_loaders/rst.py @@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import ( class UnstructuredRSTLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load RST files. + """Load `RST` files using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. If you use "elements" mode, the unstructured diff --git a/libs/langchain/langchain/document_loaders/rtf.py b/libs/langchain/langchain/document_loaders/rtf.py index 1cc7b4674d..47af8009e1 100644 --- a/libs/langchain/langchain/document_loaders/rtf.py +++ b/libs/langchain/langchain/document_loaders/rtf.py @@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import ( class UnstructuredRTFLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load RTF files. + """Load `RTF` files using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. 
If you use "elements" mode, the unstructured diff --git a/libs/langchain/langchain/document_loaders/s3_directory.py b/libs/langchain/langchain/document_loaders/s3_directory.py index 60085ee904..b4d6f2a40a 100644 --- a/libs/langchain/langchain/document_loaders/s3_directory.py +++ b/libs/langchain/langchain/document_loaders/s3_directory.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from an AWS S3 directory.""" from typing import List from langchain.docstore.document import Document @@ -7,7 +6,7 @@ from langchain.document_loaders.s3_file import S3FileLoader class S3DirectoryLoader(BaseLoader): - """Loading logic for loading documents from an AWS S3.""" + """Load from `Amazon AWS S3` directory.""" def __init__(self, bucket: str, prefix: str = ""): """Initialize with bucket and key name. diff --git a/libs/langchain/langchain/document_loaders/s3_file.py b/libs/langchain/langchain/document_loaders/s3_file.py index 28195d8fe6..509b1ea1ee 100644 --- a/libs/langchain/langchain/document_loaders/s3_file.py +++ b/libs/langchain/langchain/document_loaders/s3_file.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from an AWS S3 file.""" import os import tempfile from typing import List @@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class S3FileLoader(BaseLoader): - """Loading logic for loading documents from an AWS S3 file.""" + """Load from `Amazon AWS S3` file.""" def __init__(self, bucket: str, key: str): """Initialize with bucket and key name. 
diff --git a/libs/langchain/langchain/document_loaders/sitemap.py b/libs/langchain/langchain/document_loaders/sitemap.py index 158f19e0d2..a67347a940 100644 --- a/libs/langchain/langchain/document_loaders/sitemap.py +++ b/libs/langchain/langchain/document_loaders/sitemap.py @@ -1,4 +1,3 @@ -"""Loader that fetches a sitemap and loads those URLs.""" import itertools import re from typing import Any, Callable, Generator, Iterable, List, Optional @@ -22,7 +21,7 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N class SitemapLoader(WebBaseLoader): - """Loader that fetches a sitemap and loads those URLs.""" + """Load a sitemap and its URLs.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/slack_directory.py b/libs/langchain/langchain/document_loaders/slack_directory.py index 16aa5b4fc5..899f4eebd9 100644 --- a/libs/langchain/langchain/document_loaders/slack_directory.py +++ b/libs/langchain/langchain/document_loaders/slack_directory.py @@ -1,4 +1,3 @@ -"""Loader for documents from a Slack export.""" import json import zipfile from pathlib import Path @@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader class SlackDirectoryLoader(BaseLoader): - """Loads documents from a Slack directory dump.""" + """Load from a `Slack` directory dump.""" def __init__(self, zip_path: str, workspace_url: Optional[str] = None): """Initialize the SlackDirectoryLoader. diff --git a/libs/langchain/langchain/document_loaders/snowflake_loader.py b/libs/langchain/langchain/document_loaders/snowflake_loader.py index 5ed676a248..51d98180a7 100644 --- a/libs/langchain/langchain/document_loaders/snowflake_loader.py +++ b/libs/langchain/langchain/document_loaders/snowflake_loader.py @@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class SnowflakeLoader(BaseLoader): - """Loads a query result from Snowflake into a list of documents. + """Load from `Snowflake` API. 
Each document represents one row of the result. The `page_content_columns` are written into the `page_content` of the document. The `metadata_columns` diff --git a/libs/langchain/langchain/document_loaders/spreedly.py b/libs/langchain/langchain/document_loaders/spreedly.py index 2ec0cfc4c0..e95518d106 100644 --- a/libs/langchain/langchain/document_loaders/spreedly.py +++ b/libs/langchain/langchain/document_loaders/spreedly.py @@ -1,4 +1,3 @@ -"""Loader that fetches data from Spreedly API.""" import json import urllib.request from typing import List @@ -20,7 +19,7 @@ SPREEDLY_ENDPOINTS = { class SpreedlyLoader(BaseLoader): - """Loader that fetches data from Spreedly API.""" + """Load from `Spreedly` API.""" def __init__(self, access_token: str, resource: str) -> None: """Initialize with an access token and a resource. diff --git a/libs/langchain/langchain/document_loaders/srt.py b/libs/langchain/langchain/document_loaders/srt.py index c6114beba9..d110b628f7 100644 --- a/libs/langchain/langchain/document_loaders/srt.py +++ b/libs/langchain/langchain/document_loaders/srt.py @@ -1,4 +1,3 @@ -"""Loader for .srt (subtitle) files.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class SRTLoader(BaseLoader): - """Loader for .srt (subtitle) files.""" + """Load `.srt` (subtitle) files.""" def __init__(self, file_path: str): """Initialize with a file path.""" diff --git a/libs/langchain/langchain/document_loaders/stripe.py b/libs/langchain/langchain/document_loaders/stripe.py index 41f978d194..95ea8615e9 100644 --- a/libs/langchain/langchain/document_loaders/stripe.py +++ b/libs/langchain/langchain/document_loaders/stripe.py @@ -1,4 +1,3 @@ -"""Loader that fetches data from Stripe""" import json import urllib.request from typing import List, Optional @@ -18,7 +17,7 @@ STRIPE_ENDPOINTS = { class StripeLoader(BaseLoader): - """Loader that fetches data from Stripe.""" + """Load from 
`Stripe` API.""" def __init__(self, resource: str, access_token: Optional[str] = None) -> None: """Initialize with a resource and an access token. diff --git a/libs/langchain/langchain/document_loaders/telegram.py b/libs/langchain/langchain/document_loaders/telegram.py index 88225ecc44..9b4f81f4ad 100644 --- a/libs/langchain/langchain/document_loaders/telegram.py +++ b/libs/langchain/langchain/document_loaders/telegram.py @@ -1,4 +1,3 @@ -"""Loads Telegram chat json dump.""" from __future__ import annotations import asyncio @@ -24,7 +23,7 @@ def concatenate_rows(row: dict) -> str: class TelegramChatFileLoader(BaseLoader): - """Loads Telegram chat json directory dump.""" + """Load from `Telegram chat` dump.""" def __init__(self, path: str): """Initialize with a path.""" diff --git a/libs/langchain/langchain/document_loaders/tencent_cos_directory.py b/libs/langchain/langchain/document_loaders/tencent_cos_directory.py index 14a249f8e1..e4a8ded85f 100644 --- a/libs/langchain/langchain/document_loaders/tencent_cos_directory.py +++ b/libs/langchain/langchain/document_loaders/tencent_cos_directory.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from Tencent Cloud COS directory.""" from typing import Any, Iterator, List from langchain.docstore.document import Document @@ -7,7 +6,7 @@ from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader class TencentCOSDirectoryLoader(BaseLoader): - """Loader for Tencent Cloud COS directory.""" + """Load from `Tencent Cloud COS` directory.""" def __init__(self, conf: Any, bucket: str, prefix: str = ""): """Initialize with COS config, bucket and prefix. 
diff --git a/libs/langchain/langchain/document_loaders/tencent_cos_file.py b/libs/langchain/langchain/document_loaders/tencent_cos_file.py index 4d96596716..664efde2cf 100644 --- a/libs/langchain/langchain/document_loaders/tencent_cos_file.py +++ b/libs/langchain/langchain/document_loaders/tencent_cos_file.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from Tencent Cloud COS file.""" import os import tempfile from typing import Any, Iterator, List @@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class TencentCOSFileLoader(BaseLoader): - """Loader for Tencent Cloud COS file.""" + """Load from `Tencent Cloud COS` file.""" def __init__(self, conf: Any, bucket: str, key: str): """Initialize with COS config, bucket and key name. diff --git a/libs/langchain/langchain/document_loaders/tensorflow_datasets.py b/libs/langchain/langchain/document_loaders/tensorflow_datasets.py index e908aac873..82b59d8004 100644 --- a/libs/langchain/langchain/document_loaders/tensorflow_datasets.py +++ b/libs/langchain/langchain/document_loaders/tensorflow_datasets.py @@ -6,7 +6,7 @@ from langchain.utilities.tensorflow_datasets import TensorflowDatasets class TensorflowDatasetLoader(BaseLoader): - """Loads from TensorFlow Datasets into a list of Documents. + """Load from `TensorFlow Dataset`. Attributes: dataset_name: the name of the dataset to load diff --git a/libs/langchain/langchain/document_loaders/text.py b/libs/langchain/langchain/document_loaders/text.py index e148e1d4ee..2bc23c01bb 100644 --- a/libs/langchain/langchain/document_loaders/text.py +++ b/libs/langchain/langchain/document_loaders/text.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) class TextLoader(BaseLoader): - """Load text files. + """Load text file. 
Args: diff --git a/libs/langchain/langchain/document_loaders/tomarkdown.py b/libs/langchain/langchain/document_loaders/tomarkdown.py index 00ba512dd2..1ffbf3ed00 100644 --- a/libs/langchain/langchain/document_loaders/tomarkdown.py +++ b/libs/langchain/langchain/document_loaders/tomarkdown.py @@ -1,4 +1,3 @@ -"""Loads HTML to markdown using 2markdown.""" from __future__ import annotations from typing import Iterator, List @@ -10,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader class ToMarkdownLoader(BaseLoader): - """Loads HTML to markdown using 2markdown.""" + """Load `HTML` using `2markdown API`.""" def __init__(self, url: str, api_key: str): """Initialize with url and api key.""" diff --git a/libs/langchain/langchain/document_loaders/toml.py b/libs/langchain/langchain/document_loaders/toml.py index 0f52d314f9..b0916508d6 100644 --- a/libs/langchain/langchain/document_loaders/toml.py +++ b/libs/langchain/langchain/document_loaders/toml.py @@ -7,11 +7,10 @@ from langchain.document_loaders.base import BaseLoader class TomlLoader(BaseLoader): - """ - A TOML document loader that inherits from the BaseLoader class. + """Load `TOML` files. - This class can be initialized with either a single source file or a source - directory containing TOML files. + It can load a single source file or several files in a single + directory. """ def __init__(self, source: Union[str, Path]): diff --git a/libs/langchain/langchain/document_loaders/trello.py b/libs/langchain/langchain/document_loaders/trello.py index 11a59a4f5c..f199a68295 100644 --- a/libs/langchain/langchain/document_loaders/trello.py +++ b/libs/langchain/langchain/document_loaders/trello.py @@ -1,4 +1,3 @@ -"""Loads cards from Trello""" from __future__ import annotations from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple @@ -12,7 +11,7 @@ if TYPE_CHECKING: class TrelloLoader(BaseLoader): - """Trello loader. 
Reads all cards from a Trello board.""" + """Load cards from a `Trello` board.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/tsv.py b/libs/langchain/langchain/document_loaders/tsv.py index 5a5c7b6d7c..1de4d9895c 100644 --- a/libs/langchain/langchain/document_loaders/tsv.py +++ b/libs/langchain/langchain/document_loaders/tsv.py @@ -7,7 +7,9 @@ from langchain.document_loaders.unstructured import ( class UnstructuredTSVLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load TSV files. Like other + """Load `TSV` files using `Unstructured`. + + Like other Unstructured loaders, UnstructuredTSVLoader can be used in both "single" and "elements" mode. If you use the loader in "elements" mode, the TSV file will be a single Unstructured Table element. diff --git a/libs/langchain/langchain/document_loaders/twitter.py b/libs/langchain/langchain/document_loaders/twitter.py index 1cf93321a1..3c681dcf93 100644 --- a/libs/langchain/langchain/document_loaders/twitter.py +++ b/libs/langchain/langchain/document_loaders/twitter.py @@ -1,4 +1,3 @@ -"""Twitter document loader.""" from __future__ import annotations from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union @@ -22,8 +21,9 @@ def _dependable_tweepy_import() -> tweepy: class TwitterTweetLoader(BaseLoader): - """Twitter tweets loader. - Read tweets of user twitter handle. + """Load `Twitter` tweets. + + Read tweets of the user's Twitter handle. 
First you need to go to `https://developer.twitter.com/en/docs/twitter-api diff --git a/libs/langchain/langchain/document_loaders/unstructured.py b/libs/langchain/langchain/document_loaders/unstructured.py index b9ed6f8a5d..748a29d344 100644 --- a/libs/langchain/langchain/document_loaders/unstructured.py +++ b/libs/langchain/langchain/document_loaders/unstructured.py @@ -130,7 +130,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC): class UnstructuredFileLoader(UnstructuredBaseLoader): - """Loader that uses Unstructured to load files. + """Load files using `Unstructured`. The file loader uses the unstructured partition function and will automatically detect the file @@ -211,7 +211,7 @@ def get_elements_from_api( class UnstructuredAPIFileLoader(UnstructuredFileLoader): - """Loader that uses the Unstructured API to load files. + """Load files using `Unstructured` API. By default, the loader makes a call to the hosted Unstructured API. If you are running the unstructured API locally, you can change the @@ -275,7 +275,7 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader): class UnstructuredFileIOLoader(UnstructuredBaseLoader): - """Loader that uses Unstructured to load files. + """Load files using `Unstructured`. The file loader uses the unstructured partition function and will automatically detect the file @@ -322,7 +322,7 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader): class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): - """Loader that uses the Unstructured API to load files. + """Load files using `Unstructured` API. By default, the loader makes a call to the hosted Unstructured API. 
If you are running the unstructured API locally, you can change the diff --git a/libs/langchain/langchain/document_loaders/url.py b/libs/langchain/langchain/document_loaders/url.py index 013ec108f2..e2c48b2d65 100644 --- a/libs/langchain/langchain/document_loaders/url.py +++ b/libs/langchain/langchain/document_loaders/url.py @@ -9,7 +9,8 @@ logger = logging.getLogger(__name__) class UnstructuredURLLoader(BaseLoader): - """Loader that use Unstructured to load files from remote URLs. + """Load files from remote URLs using `Unstructured`. + Use the unstructured partition function to detect the MIME type and route the file to the appropriate partitioner. diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index ee4a47f251..0db4168e5c 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -10,7 +10,8 @@ logger = logging.getLogger(__name__) class PlaywrightURLLoader(BaseLoader): - """Loader that uses Playwright and to load a page and unstructured to load the html. + """Load `HTML` pages with `Playwright` and parse with `Unstructured`. + This is useful for loading pages that require javascript to render. Attributes: diff --git a/libs/langchain/langchain/document_loaders/url_selenium.py b/libs/langchain/langchain/document_loaders/url_selenium.py index 5cc3f0ce09..e47da8b5fc 100644 --- a/libs/langchain/langchain/document_loaders/url_selenium.py +++ b/libs/langchain/langchain/document_loaders/url_selenium.py @@ -13,7 +13,8 @@ logger = logging.getLogger(__name__) class SeleniumURLLoader(BaseLoader): - """Loader that uses Selenium and to load a page and unstructured to load the html. + """Load `HTML` pages with `Selenium` and parse with `Unstructured`. + This is useful for loading pages that require javascript to render. 
Attributes: diff --git a/libs/langchain/langchain/document_loaders/weather.py b/libs/langchain/langchain/document_loaders/weather.py index 958b6f39c0..4481cced78 100644 --- a/libs/langchain/langchain/document_loaders/weather.py +++ b/libs/langchain/langchain/document_loaders/weather.py @@ -10,7 +10,7 @@ from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper class WeatherDataLoader(BaseLoader): - """Weather Reader. + """Load weather data with `Open Weather Map` API. Reads the forecast & current weather of any location using OpenWeatherMap's free API. Checkout 'https://openweathermap.org/appid' for more on how to generate a free diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py index 5ae9482bce..e6ee1db045 100644 --- a/libs/langchain/langchain/document_loaders/web_base.py +++ b/libs/langchain/langchain/document_loaders/web_base.py @@ -37,7 +37,7 @@ def _build_metadata(soup: Any, url: str) -> dict: class WebBaseLoader(BaseLoader): - """Loader that uses urllib and beautiful soup to load webpages.""" + """Load HTML pages using `urllib` and parse them with `BeautifulSoup`.""" web_paths: List[str] diff --git a/libs/langchain/langchain/document_loaders/whatsapp_chat.py b/libs/langchain/langchain/document_loaders/whatsapp_chat.py index cad93ac8d0..561f62b743 100644 --- a/libs/langchain/langchain/document_loaders/whatsapp_chat.py +++ b/libs/langchain/langchain/document_loaders/whatsapp_chat.py @@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str: class WhatsAppChatLoader(BaseLoader): - """Loads WhatsApp messages text file.""" + """Load `WhatsApp` messages text file.""" def __init__(self, path: str): """Initialize with path.""" diff --git a/libs/langchain/langchain/document_loaders/wikipedia.py b/libs/langchain/langchain/document_loaders/wikipedia.py index 5f0bbb73cd..7140a82474 100644 --- a/libs/langchain/langchain/document_loaders/wikipedia.py +++
b/libs/langchain/langchain/document_loaders/wikipedia.py @@ -6,7 +6,8 @@ from langchain.utilities.wikipedia import WikipediaAPIWrapper class WikipediaLoader(BaseLoader): - """Loads a query result from www.wikipedia.org into a list of Documents. + """Load from `Wikipedia`. + The hard limit on the number of downloaded Documents is 300 for now. Each wiki page represents one Document. diff --git a/libs/langchain/langchain/document_loaders/word_document.py b/libs/langchain/langchain/document_loaders/word_document.py index 66ccfbd090..3a2ae3a6ae 100644 --- a/libs/langchain/langchain/document_loaders/word_document.py +++ b/libs/langchain/langchain/document_loaders/word_document.py @@ -13,7 +13,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class Docx2txtLoader(BaseLoader, ABC): - """Loads a DOCX with docx2txt and chunks at character level. + """Load `DOCX` file using `docx2txt` and chunks at character level. Defaults to check for local file, but if the file is a web path, it will download it to a temporary file, and use that, then clean up the temporary file after completion @@ -65,7 +65,8 @@ class Docx2txtLoader(BaseLoader, ABC): class UnstructuredWordDocumentLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load word documents. + """Load `Microsoft Word` file using `Unstructured`. + Works with both .docx and .doc files. You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/xml.py b/libs/langchain/langchain/document_loaders/xml.py index ac6ceeea9a..df3d364c78 100644 --- a/libs/langchain/langchain/document_loaders/xml.py +++ b/libs/langchain/langchain/document_loaders/xml.py @@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import ( class UnstructuredXMLLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load XML files.
+ """Load `XML` file using `Unstructured`. + You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single langchain Document object. If you use "elements" mode, the unstructured diff --git a/libs/langchain/langchain/document_loaders/xorbits.py b/libs/langchain/langchain/document_loaders/xorbits.py index e8259d8f0a..bcc4e680f6 100644 --- a/libs/langchain/langchain/document_loaders/xorbits.py +++ b/libs/langchain/langchain/document_loaders/xorbits.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class XorbitsLoader(BaseLoader): - """Load Xorbits DataFrame.""" + """Load `Xorbits` DataFrame.""" def __init__(self, data_frame: Any, page_content_column: str = "text"): """Initialize with dataframe object. diff --git a/libs/langchain/langchain/document_loaders/youtube.py b/libs/langchain/langchain/document_loaders/youtube.py index 86d7c42a8e..99f2df56c6 100644 --- a/libs/langchain/langchain/document_loaders/youtube.py +++ b/libs/langchain/langchain/document_loaders/youtube.py @@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]: class YoutubeLoader(BaseLoader): - """Loads Youtube transcripts.""" + """Load `YouTube` transcripts.""" def __init__( self, @@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader): @dataclass class GoogleApiYoutubeLoader(BaseLoader): - """Loads all Videos from a Channel + """Load all Videos from a `YouTube` Channel. To use, you should have the ``googleapiclient,youtube_transcript_api`` python package installed.