From edb585228d212c49951504fb68c9a8639288d621 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Fri, 11 Aug 2023 13:09:31 -0700 Subject: [PATCH] docstrings: document_loaders consistency (#9139) Formatted docstrings from different formats to a consistent format, like: >Loads processed docs from Docugami. "Load from `Docugami`." >Loader that uses Unstructured to load HTML files. "Load `HTML` files using `Unstructured`." >Load documents from a directory. "Load from a directory." - `Load` - no `Loads` - DocumentLoader always loads Documents, so no more "documents/docs/texts/ etc" - integrated systems and APIs are enclosed in backticks. --- libs/langchain/langchain/document_loaders/acreom.py | 4 ++-- libs/langchain/langchain/document_loaders/airbyte.py | 3 +-- libs/langchain/langchain/document_loaders/airbyte_json.py | 3 +-- libs/langchain/langchain/document_loaders/airtable.py | 2 +- libs/langchain/langchain/document_loaders/apify_dataset.py | 3 ++- libs/langchain/langchain/document_loaders/arxiv.py | 2 +- libs/langchain/langchain/document_loaders/async_html.py | 4 ++-- libs/langchain/langchain/document_loaders/azlyrics.py | 3 +-- .../document_loaders/azure_blob_storage_container.py | 3 +-- .../langchain/document_loaders/azure_blob_storage_file.py | 2 +- libs/langchain/langchain/document_loaders/base.py | 2 +- libs/langchain/langchain/document_loaders/bibtex.py | 2 +- libs/langchain/langchain/document_loaders/bigquery.py | 2 +- libs/langchain/langchain/document_loaders/bilibili.py | 2 +- libs/langchain/langchain/document_loaders/blackboard.py | 3 +-- libs/langchain/langchain/document_loaders/blockchain.py | 2 +- libs/langchain/langchain/document_loaders/brave_search.py | 2 +- libs/langchain/langchain/document_loaders/browserless.py | 2 +- libs/langchain/langchain/document_loaders/chatgpt.py | 3 +-- .../langchain/document_loaders/college_confidential.py | 3 +-- libs/langchain/langchain/document_loaders/concurrent.py | 4 +--- libs/langchain/langchain/document_loaders/confluence.py | 3 +-- libs/langchain/langchain/document_loaders/conllu.py | 3 +-- libs/langchain/langchain/document_loaders/csv_loader.py | 2 +- libs/langchain/langchain/document_loaders/cube_semantic.py | 2 +- libs/langchain/langchain/document_loaders/datadog_logs.py | 3 +-- libs/langchain/langchain/document_loaders/dataframe.py | 3 +-- libs/langchain/langchain/document_loaders/diffbot.py | 3 +-- libs/langchain/langchain/document_loaders/directory.py | 3 +-- libs/langchain/langchain/document_loaders/discord.py | 3 +-- libs/langchain/langchain/document_loaders/docugami.py | 4 +--- libs/langchain/langchain/document_loaders/dropbox.py | 4 +--- libs/langchain/langchain/document_loaders/duckdb_loader.py | 2 +- libs/langchain/langchain/document_loaders/email.py | 5 +++-- libs/langchain/langchain/document_loaders/embaas.py | 6 +++--- libs/langchain/langchain/document_loaders/epub.py | 3 +-- libs/langchain/langchain/document_loaders/etherscan.py | 3 +-- libs/langchain/langchain/document_loaders/evernote.py | 2 +- libs/langchain/langchain/document_loaders/excel.py | 4 +++- libs/langchain/langchain/document_loaders/facebook_chat.py | 3 +-- libs/langchain/langchain/document_loaders/fauna.py | 2 +- libs/langchain/langchain/document_loaders/figma.py | 3 +-- libs/langchain/langchain/document_loaders/gcs_directory.py | 3 +-- libs/langchain/langchain/document_loaders/gcs_file.py | 3 +-- libs/langchain/langchain/document_loaders/generic.py | 2 +- libs/langchain/langchain/document_loaders/geodataframe.py | 3 +--
libs/langchain/langchain/document_loaders/git.py | 3 ++- libs/langchain/langchain/document_loaders/gitbook.py | 3 +-- libs/langchain/langchain/document_loaders/github.py | 2 +- libs/langchain/langchain/document_loaders/googledrive.py | 4 +--- libs/langchain/langchain/document_loaders/gutenberg.py | 3 +-- libs/langchain/langchain/document_loaders/hn.py | 5 +++-- libs/langchain/langchain/document_loaders/html.py | 3 +-- libs/langchain/langchain/document_loaders/html_bs.py | 4 +--- 54 files changed, 66 insertions(+), 94 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/acreom.py b/libs/langchain/langchain/document_loaders/acreom.py index b157c4d4e7..c69ba37d0b 100644 --- a/libs/langchain/langchain/document_loaders/acreom.py +++ b/libs/langchain/langchain/document_loaders/acreom.py @@ -1,4 +1,3 @@ -"""Loads acreom vault from a directory.""" import re from pathlib import Path from typing import Iterator, List @@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class AcreomLoader(BaseLoader): - """Loader that loads acreom vault from a directory.""" + """Load `acreom` vault from a directory.""" FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL) """Regex to match front matter metadata in markdown files.""" @@ -16,6 +15,7 @@ class AcreomLoader(BaseLoader): def __init__( self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True ): + """Initialize the loader.""" self.file_path = path """Path to the directory containing the markdown files.""" self.encoding = encoding diff --git a/libs/langchain/langchain/document_loaders/airbyte.py b/libs/langchain/langchain/document_loaders/airbyte.py index 51411f0120..05f3ca62ce 100644 --- a/libs/langchain/langchain/document_loaders/airbyte.py +++ b/libs/langchain/langchain/document_loaders/airbyte.py @@ -1,4 +1,3 @@ -"""Loads local airbyte json files.""" from typing import Any, Callable, Iterator, List, Mapping, Optional from langchain.docstore.document import Document @@ -9,7 +8,7 @@ RecordHandler = Callable[[Any, Optional[str]], Document] class AirbyteCDKLoader(BaseLoader): - """Loads records using an Airbyte source connector implemented using the CDK.""" + """Load with an `Airbyte` source connector implemented using the `CDK`.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/airbyte_json.py b/libs/langchain/langchain/document_loaders/airbyte_json.py index b9033b39f2..02bcb9e50e 100644 --- a/libs/langchain/langchain/document_loaders/airbyte_json.py +++ b/libs/langchain/langchain/document_loaders/airbyte_json.py @@ -1,4 +1,3 @@ -"""Loads local airbyte json files.""" import json from typing import List @@ -8,7 +7,7 @@ from langchain.utils import stringify_dict class AirbyteJSONLoader(BaseLoader): - """Loads local airbyte json files.""" + """Load local `Airbyte` json files.""" def __init__(self, file_path: str): """Initialize with a file path. 
This should start with '/tmp/airbyte_local/'.""" diff --git a/libs/langchain/langchain/document_loaders/airtable.py b/libs/langchain/langchain/document_loaders/airtable.py index 824799b284..0bec883597 100644 --- a/libs/langchain/langchain/document_loaders/airtable.py +++ b/libs/langchain/langchain/document_loaders/airtable.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class AirtableLoader(BaseLoader): - """Loader for Airtable tables.""" + """Load `Airtable` tables.""" def __init__(self, api_token: str, table_id: str, base_id: str): """Initialize with API token and the IDs for table and base""" diff --git a/libs/langchain/langchain/document_loaders/apify_dataset.py b/libs/langchain/langchain/document_loaders/apify_dataset.py index ca3ae6f995..2a155424e3 100644 --- a/libs/langchain/langchain/document_loaders/apify_dataset.py +++ b/libs/langchain/langchain/document_loaders/apify_dataset.py @@ -7,7 +7,8 @@ from langchain.document_loaders.base import BaseLoader class ApifyDatasetLoader(BaseLoader, BaseModel): - """Loads datasets from Apify-a web scraping, crawling, and data extraction platform. + """Load datasets from `Apify`, a web scraping, crawling, and data extraction platform. + For details, see https://docs.apify.com/platform/integrations/langchain Example: diff --git a/libs/langchain/langchain/document_loaders/arxiv.py b/libs/langchain/langchain/document_loaders/arxiv.py index 6a7e139ca4..0cfde95afa 100644 --- a/libs/langchain/langchain/document_loaders/arxiv.py +++ b/libs/langchain/langchain/document_loaders/arxiv.py @@ -6,7 +6,7 @@ from langchain.utilities.arxiv import ArxivAPIWrapper class ArxivLoader(BaseLoader): - """Loads a query result from arxiv.org into a list of Documents. + """Load a query result from `Arxiv`. The loader converts the original PDF format into the text.
""" diff --git a/libs/langchain/langchain/document_loaders/async_html.py b/libs/langchain/langchain/document_loaders/async_html.py index 9ba8a94473..ce54573ff9 100644 --- a/libs/langchain/langchain/document_loaders/async_html.py +++ b/libs/langchain/langchain/document_loaders/async_html.py @@ -24,7 +24,7 @@ default_header_template = { class AsyncHtmlLoader(BaseLoader): - """Loads HTML asynchronously.""" + """Load `HTML` asynchronously.""" def __init__( self, @@ -36,7 +36,7 @@ class AsyncHtmlLoader(BaseLoader): requests_kwargs: Dict[str, Any] = {}, raise_for_status: bool = False, ): - """Initialize with webpage path.""" + """Initialize with a webpage path.""" # TODO: Deprecate web_path in favor of web_paths, and remove this # left like this because there are a number of loaders that expect single diff --git a/libs/langchain/langchain/document_loaders/azlyrics.py b/libs/langchain/langchain/document_loaders/azlyrics.py index d7b6fe28ba..df355899e8 100644 --- a/libs/langchain/langchain/document_loaders/azlyrics.py +++ b/libs/langchain/langchain/document_loaders/azlyrics.py @@ -1,4 +1,3 @@ -"""Loads AZLyrics.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class AZLyricsLoader(WebBaseLoader): - """Loads AZLyrics webpages.""" + """Load `AZLyrics` webpages.""" def load(self) -> List[Document]: """Load webpages into Documents.""" diff --git a/libs/langchain/langchain/document_loaders/azure_blob_storage_container.py b/libs/langchain/langchain/document_loaders/azure_blob_storage_container.py index 12155d7fd3..da3542a1d8 100644 --- a/libs/langchain/langchain/document_loaders/azure_blob_storage_container.py +++ b/libs/langchain/langchain/document_loaders/azure_blob_storage_container.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from an Azure Blob Storage container.""" from typing import List from langchain.docstore.document import Document @@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader class AzureBlobStorageContainerLoader(BaseLoader): - """Loading Documents from Azure Blob Storage.""" + """Load from `Azure Blob Storage` container.""" def __init__(self, conn_str: str, container: str, prefix: str = ""): """Initialize with connection string, container and blob prefix.""" diff --git a/libs/langchain/langchain/document_loaders/azure_blob_storage_file.py b/libs/langchain/langchain/document_loaders/azure_blob_storage_file.py index 64b7e2d677..264e7f39ed 100644 --- a/libs/langchain/langchain/document_loaders/azure_blob_storage_file.py +++ b/libs/langchain/langchain/document_loaders/azure_blob_storage_file.py @@ -8,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class AzureBlobStorageFileLoader(BaseLoader): - """Loading Documents from Azure Blob Storage.""" + """Load from `Azure Blob Storage` files.""" def __init__(self, conn_str: str, container: str, blob_name: str): """Initialize with connection string, container and blob name.""" diff --git a/libs/langchain/langchain/document_loaders/base.py b/libs/langchain/langchain/document_loaders/base.py index b41f985d51..448f79a96d 100644 --- a/libs/langchain/langchain/document_loaders/base.py +++ b/libs/langchain/langchain/document_loaders/base.py @@ -8,7 +8,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter class BaseLoader(ABC): - """Interface for loading Documents. + """Interface for Document Loader. 
Implementations should implement the lazy-loading method using generators to avoid loading all Documents into memory at once. diff --git a/libs/langchain/langchain/document_loaders/bibtex.py b/libs/langchain/langchain/document_loaders/bibtex.py index d538b1d44c..1e32ff6b17 100644 --- a/libs/langchain/langchain/document_loaders/bibtex.py +++ b/libs/langchain/langchain/document_loaders/bibtex.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) class BibtexLoader(BaseLoader): - """Loads a bibtex file into a list of Documents. + """Load a `bibtex` file. Each document represents one entry from the bibtex file. diff --git a/libs/langchain/langchain/document_loaders/bigquery.py b/libs/langchain/langchain/document_loaders/bigquery.py index b49b1542f2..abfb171e5c 100644 --- a/libs/langchain/langchain/document_loaders/bigquery.py +++ b/libs/langchain/langchain/document_loaders/bigquery.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: class BigQueryLoader(BaseLoader): - """Loads a query result from BigQuery into a list of documents. + """Load from the Google Cloud Platform `BigQuery`. Each document represents one row of the result. The `page_content_columns` are written into the `page_content` of the document. The `metadata_columns` diff --git a/libs/langchain/langchain/document_loaders/bilibili.py b/libs/langchain/langchain/document_loaders/bilibili.py index 0c1c815180..e22ac20f8d 100644 --- a/libs/langchain/langchain/document_loaders/bilibili.py +++ b/libs/langchain/langchain/document_loaders/bilibili.py @@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader class BiliBiliLoader(BaseLoader): - """Loads bilibili transcripts.""" + """Load `BiliBili` video transcripts.""" def __init__(self, video_urls: List[str]): """Initialize with bilibili url. diff --git a/libs/langchain/langchain/document_loaders/blackboard.py b/libs/langchain/langchain/document_loaders/blackboard.py index b21db2dfd1..91bc72d26e 100644 --- a/libs/langchain/langchain/document_loaders/blackboard.py +++ b/libs/langchain/langchain/document_loaders/blackboard.py @@ -1,4 +1,3 @@ -"""Loads all documents from a blackboard course.""" import contextlib import re from pathlib import Path @@ -12,7 +11,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class BlackboardLoader(WebBaseLoader): - """Loads all documents from a Blackboard course. + """Load a `Blackboard` course. This loader is not compatible with all Blackboard courses. It is only compatible with courses that use the new Blackboard interface. diff --git a/libs/langchain/langchain/document_loaders/blockchain.py b/libs/langchain/langchain/document_loaders/blockchain.py index 6b103fe1c8..c79f67d96a 100644 --- a/libs/langchain/langchain/document_loaders/blockchain.py +++ b/libs/langchain/langchain/document_loaders/blockchain.py @@ -20,7 +20,7 @@ class BlockchainType(Enum): class BlockchainDocumentLoader(BaseLoader): - """Loads elements from a blockchain smart contract into Langchain documents. + """Load elements from a blockchain smart contract. The supported blockchains are: Ethereum mainnet, Ethereum Goerli testnet, Polygon mainnet, and Polygon Mumbai testnet. 
diff --git a/libs/langchain/langchain/document_loaders/brave_search.py b/libs/langchain/langchain/document_loaders/brave_search.py index 2887256e1e..e3a4821457 100644 --- a/libs/langchain/langchain/document_loaders/brave_search.py +++ b/libs/langchain/langchain/document_loaders/brave_search.py @@ -6,7 +6,7 @@ from langchain.utilities.brave_search import BraveSearchWrapper class BraveSearchLoader(BaseLoader): - """Loads a query result from Brave Search engine into a list of Documents.""" + """Load with `Brave Search` engine.""" def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None): """Initializes the BraveLoader. diff --git a/libs/langchain/langchain/document_loaders/browserless.py b/libs/langchain/langchain/document_loaders/browserless.py index 1cb7cb6727..326e7e303a 100644 --- a/libs/langchain/langchain/document_loaders/browserless.py +++ b/libs/langchain/langchain/document_loaders/browserless.py @@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader class BrowserlessLoader(BaseLoader): - """Loads the content of webpages using Browserless' /content endpoint""" + """Load webpages with `Browserless` /content endpoint.""" def __init__( self, api_token: str, urls: Union[str, List[str]], text_content: bool = True diff --git a/libs/langchain/langchain/document_loaders/chatgpt.py b/libs/langchain/langchain/document_loaders/chatgpt.py index 5dbb90585f..35d638d6b3 100644 --- a/libs/langchain/langchain/document_loaders/chatgpt.py +++ b/libs/langchain/langchain/document_loaders/chatgpt.py @@ -1,4 +1,3 @@ -"""Load conversations from ChatGPT data export""" import datetime import json from typing import List @@ -29,7 +28,7 @@ def concatenate_rows(message: dict, title: str) -> str: class ChatGPTLoader(BaseLoader): - """Load conversations from exported ChatGPT data.""" + """Load conversations from exported `ChatGPT` data.""" def __init__(self, log_file: str, num_logs: int = -1): """Initialize a class object. diff --git a/libs/langchain/langchain/document_loaders/college_confidential.py b/libs/langchain/langchain/document_loaders/college_confidential.py index 5763fdcdf0..d93a53c846 100644 --- a/libs/langchain/langchain/document_loaders/college_confidential.py +++ b/libs/langchain/langchain/document_loaders/college_confidential.py @@ -1,4 +1,3 @@ -"""Loads College Confidential.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class CollegeConfidentialLoader(WebBaseLoader): - """Loads College Confidential webpages.""" + """Load `College Confidential` webpages.""" def load(self) -> List[Document]: """Load webpages as Documents.""" diff --git a/libs/langchain/langchain/document_loaders/concurrent.py b/libs/langchain/langchain/document_loaders/concurrent.py index 545449527c..2044dcc7fb 100644 --- a/libs/langchain/langchain/document_loaders/concurrent.py +++ b/libs/langchain/langchain/document_loaders/concurrent.py @@ -16,9 +16,7 @@ DEFAULT = Literal["default"] class ConcurrentLoader(GenericLoader): - """ - A generic document loader that loads and parses documents concurrently. 
- """ + """Load and pars Documents concurrently.""" def __init__( self, blob_loader: BlobLoader, blob_parser: BaseBlobParser, num_workers: int = 4 diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 8615faa7d8..98d63cfc18 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -1,4 +1,3 @@ -"""Load Data from a Confluence Space""" import logging from enum import Enum from io import BytesIO @@ -33,7 +32,7 @@ class ContentFormat(str, Enum): class ConfluenceLoader(BaseLoader): - """Load Confluence pages. + """Load `Confluence` pages. Port of https://llamahub.ai/l/confluence This currently supports username/api_key, Oauth2 login or personal access token diff --git a/libs/langchain/langchain/document_loaders/conllu.py b/libs/langchain/langchain/document_loaders/conllu.py index d22c8500d2..a9170c932d 100644 --- a/libs/langchain/langchain/document_loaders/conllu.py +++ b/libs/langchain/langchain/document_loaders/conllu.py @@ -1,4 +1,3 @@ -"""Load CoNLL-U files.""" import csv from typing import List @@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class CoNLLULoader(BaseLoader): - """Load CoNLL-U files.""" + """Load `CoNLL-U` files.""" def __init__(self, file_path: str): """Initialize with a file path.""" diff --git a/libs/langchain/langchain/document_loaders/csv_loader.py b/libs/langchain/langchain/document_loaders/csv_loader.py index f6d9314c9b..45133786fa 100644 --- a/libs/langchain/langchain/document_loaders/csv_loader.py +++ b/libs/langchain/langchain/document_loaders/csv_loader.py @@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import ( class CSVLoader(BaseLoader): - """Loads a CSV file into a list of documents. + """Load a `CSV` file into a list of Documents. Each document represents one row of the CSV file. Every row is converted into a key/value pair and outputted to a new line in the document's page_content. diff --git a/libs/langchain/langchain/document_loaders/cube_semantic.py b/libs/langchain/langchain/document_loaders/cube_semantic.py index 2d645e9e5c..a29a41f02f 100644 --- a/libs/langchain/langchain/document_loaders/cube_semantic.py +++ b/libs/langchain/langchain/document_loaders/cube_semantic.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) class CubeSemanticLoader(BaseLoader): - """Load Cube semantic layer metadata. + """Load `Cube semantic layer` metadata. Args: cube_api_url: REST API endpoint. diff --git a/libs/langchain/langchain/document_loaders/datadog_logs.py b/libs/langchain/langchain/document_loaders/datadog_logs.py index 613288a99e..a79d7975c0 100644 --- a/libs/langchain/langchain/document_loaders/datadog_logs.py +++ b/libs/langchain/langchain/document_loaders/datadog_logs.py @@ -1,4 +1,3 @@ -"""Load Datadog logs.""" from datetime import datetime, timedelta from typing import List, Optional @@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class DatadogLogsLoader(BaseLoader): - """Loads a query result from Datadog into a list of documents. + """Load `Datadog` logs. Logs are written into the `page_content` and into the `metadata`. 
""" diff --git a/libs/langchain/langchain/document_loaders/dataframe.py b/libs/langchain/langchain/document_loaders/dataframe.py index b1a9194c90..0476f6a298 100644 --- a/libs/langchain/langchain/document_loaders/dataframe.py +++ b/libs/langchain/langchain/document_loaders/dataframe.py @@ -1,4 +1,3 @@ -"""Load from a Dataframe object""" from typing import Any, Iterator, List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class DataFrameLoader(BaseLoader): - """Load Pandas DataFrame.""" + """Load `Pandas` DataFrame.""" def __init__(self, data_frame: Any, page_content_column: str = "text"): """Initialize with dataframe object. diff --git a/libs/langchain/langchain/document_loaders/diffbot.py b/libs/langchain/langchain/document_loaders/diffbot.py index e5ef3d5f7f..3c2bf4c7aa 100644 --- a/libs/langchain/langchain/document_loaders/diffbot.py +++ b/libs/langchain/langchain/document_loaders/diffbot.py @@ -1,4 +1,3 @@ -"""Loader that uses Diffbot to load webpages in text format.""" import logging from typing import Any, List @@ -11,7 +10,7 @@ logger = logging.getLogger(__name__) class DiffbotLoader(BaseLoader): - """Loads Diffbot file json.""" + """Load `Diffbot` json file.""" def __init__( self, api_token: str, urls: List[str], continue_on_failure: bool = True diff --git a/libs/langchain/langchain/document_loaders/directory.py b/libs/langchain/langchain/document_loaders/directory.py index 729c953236..bd6ae0bbb1 100644 --- a/libs/langchain/langchain/document_loaders/directory.py +++ b/libs/langchain/langchain/document_loaders/directory.py @@ -1,4 +1,3 @@ -"""Load documents from a directory.""" import concurrent import logging import random @@ -26,7 +25,7 @@ def _is_visible(p: Path) -> bool: class DirectoryLoader(BaseLoader): - """Load documents from a directory.""" + """Load from a directory.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/discord.py b/libs/langchain/langchain/document_loaders/discord.py index be6f290d70..a9b5b93570 100644 --- a/libs/langchain/langchain/document_loaders/discord.py +++ b/libs/langchain/langchain/document_loaders/discord.py @@ -1,4 +1,3 @@ -"""Load from Discord chat dump""" from __future__ import annotations from typing import TYPE_CHECKING, List @@ -11,7 +10,7 @@ if TYPE_CHECKING: class DiscordChatLoader(BaseLoader): - """Load Discord chat logs.""" + """Load `Discord` chat logs.""" def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"): """Initialize with a Pandas DataFrame containing chat logs. diff --git a/libs/langchain/langchain/document_loaders/docugami.py b/libs/langchain/langchain/document_loaders/docugami.py index b60326f24c..23f293010b 100644 --- a/libs/langchain/langchain/document_loaders/docugami.py +++ b/libs/langchain/langchain/document_loaders/docugami.py @@ -1,5 +1,3 @@ -"""Loads processed documents from Docugami.""" - import io import logging import os @@ -29,7 +27,7 @@ logger = logging.getLogger(__name__) class DocugamiLoader(BaseLoader, BaseModel): - """Loads processed docs from Docugami. + """Load from `Docugami`. To use, you should have the ``lxml`` python package installed. """ diff --git a/libs/langchain/langchain/document_loaders/dropbox.py b/libs/langchain/langchain/document_loaders/dropbox.py index 1197ef59c0..123956274e 100644 --- a/libs/langchain/langchain/document_loaders/dropbox.py +++ b/libs/langchain/langchain/document_loaders/dropbox.py @@ -1,5 +1,3 @@ -"""Loads data from Dropbox.""" - # Prerequisites: # 1. 
Create a Dropbox app. # 2. Give the app these scope permissions: `files.metadata.read` @@ -20,7 +18,7 @@ from langchain.document_loaders.base import BaseLoader class DropboxLoader(BaseLoader, BaseModel): - """Loads files from Dropbox. + """Load files from `Dropbox`. In addition to common files such as text and PDF files, it also supports *Dropbox Paper* files. diff --git a/libs/langchain/langchain/document_loaders/duckdb_loader.py b/libs/langchain/langchain/document_loaders/duckdb_loader.py index f1805f69e1..11d64ce969 100644 --- a/libs/langchain/langchain/document_loaders/duckdb_loader.py +++ b/libs/langchain/langchain/document_loaders/duckdb_loader.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class DuckDBLoader(BaseLoader): - """Loads a query result from DuckDB into a list of documents. + """Load from `DuckDB`. Each document represents one row of the result. The `page_content_columns` are written into the `page_content` of the document. The `metadata_columns` diff --git a/libs/langchain/langchain/document_loaders/email.py b/libs/langchain/langchain/document_loaders/email.py index 968502b153..f23ef88d39 100644 --- a/libs/langchain/langchain/document_loaders/email.py +++ b/libs/langchain/langchain/document_loaders/email.py @@ -1,4 +1,3 @@ -"""Loads email files.""" import os from typing import Any, List @@ -11,7 +10,9 @@ from langchain.document_loaders.unstructured import ( class UnstructuredEmailLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load email files. Works with both + """Load email files with `unstructured`. + + Works with both .eml and .msg files. You can process attachments in addition to the e-mail message itself by passing process_attachments=True into the constructor for the loader. By default, attachments will be processed diff --git a/libs/langchain/langchain/document_loaders/embaas.py b/libs/langchain/langchain/document_loaders/embaas.py index 89959f9a59..da8b2afe6d 100644 --- a/libs/langchain/langchain/document_loaders/embaas.py +++ b/libs/langchain/langchain/document_loaders/embaas.py @@ -52,7 +52,7 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters): class BaseEmbaasLoader(BaseModel): - """Base class for embedding a model into an Embaas document extraction API.""" + """Base loader for embedding a model into an `Embaas` document extraction API.""" embaas_api_key: Optional[str] = None """The API key for the embaas document extraction API.""" @@ -72,7 +72,7 @@ class BaseEmbaasLoader(BaseModel): class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): - """Embaas's document byte loader. + """Load `Embaas` blob. To use, you should have the environment variable ``EMBAAS_API_KEY`` set with your API key, or pass @@ -178,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): class EmbaasLoader(BaseEmbaasLoader, BaseLoader): - """Embaas's document loader. + """Load from `Embaas`. 
To use, you should have the environment variable ``EMBAAS_API_KEY`` set with your API key, or pass diff --git a/libs/langchain/langchain/document_loaders/epub.py b/libs/langchain/langchain/document_loaders/epub.py index 2c2b49721e..449f0221a0 100644 --- a/libs/langchain/langchain/document_loaders/epub.py +++ b/libs/langchain/langchain/document_loaders/epub.py @@ -1,4 +1,3 @@ -"""Loads EPub files.""" from typing import List from langchain.document_loaders.unstructured import ( @@ -8,7 +7,7 @@ from langchain.document_loaders.unstructured import ( class UnstructuredEPubLoader(UnstructuredFileLoader): - """Loader that uses Unstructured to load EPUB files. + """Load `EPub` files using `Unstructured`. You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/etherscan.py b/libs/langchain/langchain/document_loaders/etherscan.py index d3fa00368d..eed6df98e2 100644 --- a/libs/langchain/langchain/document_loaders/etherscan.py +++ b/libs/langchain/langchain/document_loaders/etherscan.py @@ -9,8 +9,7 @@ from langchain.document_loaders.base import BaseLoader class EtherscanLoader(BaseLoader): - """ - Load transactions from an account on Ethereum mainnet. + """Load transactions from `Ethereum` mainnet. The Loader use Etherscan API to interact with Ethereum mainnet. diff --git a/libs/langchain/langchain/document_loaders/evernote.py b/libs/langchain/langchain/document_loaders/evernote.py index d9765e55ca..0f0f674342 100644 --- a/libs/langchain/langchain/document_loaders/evernote.py +++ b/libs/langchain/langchain/document_loaders/evernote.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) class EverNoteLoader(BaseLoader): - """EverNote Loader. + """Load from `EverNote`. Loads an EverNote notebook export file e.g. my_notebook.enex into Documents. Instructions on producing this file can be found at diff --git a/libs/langchain/langchain/document_loaders/excel.py b/libs/langchain/langchain/document_loaders/excel.py index 619a082f5b..19b3dece87 100644 --- a/libs/langchain/langchain/document_loaders/excel.py +++ b/libs/langchain/langchain/document_loaders/excel.py @@ -8,7 +8,9 @@ from langchain.document_loaders.unstructured import ( class UnstructuredExcelLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load Excel files. Like other + """Load Microsoft Excel files using `Unstructured`. + + Like other Unstructured loaders, UnstructuredExcelLoader can be used in both "single" and "elements" mode. 
If you use the loader in "elements" mode, each sheet in the Excel file will be a an Unstructured Table diff --git a/libs/langchain/langchain/document_loaders/facebook_chat.py b/libs/langchain/langchain/document_loaders/facebook_chat.py index 58de558a81..fa5a5ba468 100644 --- a/libs/langchain/langchain/document_loaders/facebook_chat.py +++ b/libs/langchain/langchain/document_loaders/facebook_chat.py @@ -1,4 +1,3 @@ -"""Loads Facebook chat json dump.""" import datetime import json from pathlib import Path @@ -23,7 +22,7 @@ def concatenate_rows(row: dict) -> str: class FacebookChatLoader(BaseLoader): - """Loads Facebook messages json directory dump.""" + """Load `Facebook Chat` messages directory dump.""" def __init__(self, path: str): """Initialize with a path.""" diff --git a/libs/langchain/langchain/document_loaders/fauna.py b/libs/langchain/langchain/document_loaders/fauna.py index 3782fb97ee..3c949a7063 100644 --- a/libs/langchain/langchain/document_loaders/fauna.py +++ b/libs/langchain/langchain/document_loaders/fauna.py @@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class FaunaLoader(BaseLoader): - """FaunaDB Loader. + """Load from `FaunaDB`. Attributes: query (str): The FQL query string to execute. diff --git a/libs/langchain/langchain/document_loaders/figma.py b/libs/langchain/langchain/document_loaders/figma.py index ff67c9c45c..5f424be4f7 100644 --- a/libs/langchain/langchain/document_loaders/figma.py +++ b/libs/langchain/langchain/document_loaders/figma.py @@ -1,4 +1,3 @@ -"""Loads Figma files json dump.""" import json import urllib.request from typing import Any, List @@ -9,7 +8,7 @@ from langchain.utils import stringify_dict class FigmaFileLoader(BaseLoader): - """Loads Figma file json.""" + """Load `Figma` file.""" def __init__(self, access_token: str, ids: str, key: str): """Initialize with access token, ids, and key. 
diff --git a/libs/langchain/langchain/document_loaders/gcs_directory.py b/libs/langchain/langchain/document_loaders/gcs_directory.py index 1820f198b5..a990662897 100644 --- a/libs/langchain/langchain/document_loaders/gcs_directory.py +++ b/libs/langchain/langchain/document_loaders/gcs_directory.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from an GCS directory.""" from typing import Callable, List, Optional from langchain.docstore.document import Document @@ -7,7 +6,7 @@ from langchain.document_loaders.gcs_file import GCSFileLoader class GCSDirectoryLoader(BaseLoader): - """Loads Documents from GCS.""" + """Load from GCS directory.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/gcs_file.py b/libs/langchain/langchain/document_loaders/gcs_file.py index 3dd6950393..1e6a6da767 100644 --- a/libs/langchain/langchain/document_loaders/gcs_file.py +++ b/libs/langchain/langchain/document_loaders/gcs_file.py @@ -1,4 +1,3 @@ -"""Load documents from a GCS file.""" import os import tempfile from typing import Callable, List, Optional @@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class GCSFileLoader(BaseLoader): - """Load Documents from a GCS file.""" + """Load from GCS file.""" def __init__( self, diff --git a/libs/langchain/langchain/document_loaders/generic.py b/libs/langchain/langchain/document_loaders/generic.py index 7dbfc6e1e6..2728e0ae9f 100644 --- a/libs/langchain/langchain/document_loaders/generic.py +++ b/libs/langchain/langchain/document_loaders/generic.py @@ -15,7 +15,7 @@ DEFAULT = Literal["default"] class GenericLoader(BaseLoader): - """A generic document loader. + """Generic Document Loader. A generic document loader that allows combining an arbitrary blob loader with a blob parser. diff --git a/libs/langchain/langchain/document_loaders/geodataframe.py b/libs/langchain/langchain/document_loaders/geodataframe.py index 68998212a1..70f5fa626e 100644 --- a/libs/langchain/langchain/document_loaders/geodataframe.py +++ b/libs/langchain/langchain/document_loaders/geodataframe.py @@ -1,4 +1,3 @@ -"""Load from Dataframe object""" from typing import Any, Iterator, List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class GeoDataFrameLoader(BaseLoader): - """Load geopandas Dataframe.""" + """Load `geopandas` Dataframe.""" def __init__(self, data_frame: Any, page_content_column: str = "geometry"): """Initialize with geopandas Dataframe. diff --git a/libs/langchain/langchain/document_loaders/git.py b/libs/langchain/langchain/document_loaders/git.py index 14b2676a45..14a8849c7e 100644 --- a/libs/langchain/langchain/document_loaders/git.py +++ b/libs/langchain/langchain/document_loaders/git.py @@ -6,7 +6,8 @@ from langchain.document_loaders.base import BaseLoader class GitLoader(BaseLoader): - """Loads files from a Git repository into a list of documents. + """Load `Git` repository files. + The Repository can be local on disk available at `repo_path`, or remote at `clone_url` that will be cloned to `repo_path`. Currently, supports only text files. 
diff --git a/libs/langchain/langchain/document_loaders/gitbook.py b/libs/langchain/langchain/document_loaders/gitbook.py index 1fcec22922..996a5e12d2 100644 --- a/libs/langchain/langchain/document_loaders/gitbook.py +++ b/libs/langchain/langchain/document_loaders/gitbook.py @@ -1,4 +1,3 @@ -"""Loads GitBook.""" from typing import Any, List, Optional from urllib.parse import urljoin, urlparse @@ -7,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class GitbookLoader(WebBaseLoader): - """Load GitBook data. + """Load `GitBook` data. 1. load from either a single page, or 2. load all (relative) paths in the navbar. diff --git a/libs/langchain/langchain/document_loaders/github.py b/libs/langchain/langchain/document_loaders/github.py index 2a55e3d002..eaa10f0955 100644 --- a/libs/langchain/langchain/document_loaders/github.py +++ b/libs/langchain/langchain/document_loaders/github.py @@ -11,7 +11,7 @@ from langchain.utils import get_from_dict_or_env class BaseGitHubLoader(BaseLoader, BaseModel, ABC): - """Load issues of a GitHub repository.""" + """Load `GitHub` repository Issues.""" repo: str """Name of repository""" diff --git a/libs/langchain/langchain/document_loaders/googledrive.py b/libs/langchain/langchain/document_loaders/googledrive.py index 9a0290a35e..dd0dde3fc3 100644 --- a/libs/langchain/langchain/document_loaders/googledrive.py +++ b/libs/langchain/langchain/document_loaders/googledrive.py @@ -1,5 +1,3 @@ -"""Loads data from Google Drive.""" - # Prerequisites: # 1. Create a Google Cloud project # 2. Enable the Google Drive API: @@ -22,7 +20,7 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] class GoogleDriveLoader(BaseLoader, BaseModel): - """Loads Google Docs from Google Drive.""" + """Load Google Docs from `Google Drive`.""" service_account_key: Path = Path.home() / ".credentials" / "keys.json" """Path to the service account key file.""" diff --git a/libs/langchain/langchain/document_loaders/gutenberg.py b/libs/langchain/langchain/document_loaders/gutenberg.py index bcf1370dd8..f1280ef651 100644 --- a/libs/langchain/langchain/document_loaders/gutenberg.py +++ b/libs/langchain/langchain/document_loaders/gutenberg.py @@ -1,4 +1,3 @@ -"""Loads .txt web files.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader class GutenbergLoader(BaseLoader): - """Loader that uses urllib to load .txt web files.""" + """Load from `Gutenberg.org`.""" def __init__(self, file_path: str): """Initialize with a file path.""" diff --git a/libs/langchain/langchain/document_loaders/hn.py b/libs/langchain/langchain/document_loaders/hn.py index 34b284648b..a5f21b1f6f 100644 --- a/libs/langchain/langchain/document_loaders/hn.py +++ b/libs/langchain/langchain/document_loaders/hn.py @@ -1,4 +1,3 @@ -"""Loads HN.""" from typing import Any, List from langchain.docstore.document import Document @@ -6,7 +5,9 @@ from langchain.document_loaders.web_base import WebBaseLoader class HNLoader(WebBaseLoader): - """Load Hacker News data from either main page results or the comments page.""" + """Load `Hacker News` data. + + It loads data from either main page results or the comments page.""" def load(self) -> List[Document]: """Get important HN webpage information. 
diff --git a/libs/langchain/langchain/document_loaders/html.py b/libs/langchain/langchain/document_loaders/html.py index dce2697076..4cca1bfd1e 100644 --- a/libs/langchain/langchain/document_loaders/html.py +++ b/libs/langchain/langchain/document_loaders/html.py @@ -1,11 +1,10 @@ -"""Loader that uses unstructured to load HTML files.""" from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredHTMLLoader(UnstructuredFileLoader): - """Loader that uses Unstructured to load HTML files. + """Load `HTML` files using `Unstructured`. You can run the loader in one of two modes: "single" and "elements". If you use "single" mode, the document will be returned as a single diff --git a/libs/langchain/langchain/document_loaders/html_bs.py b/libs/langchain/langchain/document_loaders/html_bs.py index cebf3c1baf..85a5d7ad8f 100644 --- a/libs/langchain/langchain/document_loaders/html_bs.py +++ b/libs/langchain/langchain/document_loaders/html_bs.py @@ -1,5 +1,3 @@ -"""Loader that uses bs4 to load HTML files, enriching metadata with page title.""" - import logging from typing import Dict, List, Union @@ -10,7 +8,7 @@ logger = logging.getLogger(__name__) class BSHTMLLoader(BaseLoader): - """Loader that uses beautiful soup to parse HTML files.""" + """Load `HTML` files and parse them with `beautiful soup`.""" def __init__( self,