diff --git a/langchain/document_loaders/college_confidential.py b/langchain/document_loaders/college_confidential.py
index 1eaa64bcb2..6094046984 100644
--- a/langchain/document_loaders/college_confidential.py
+++ b/langchain/document_loaders/college_confidential.py
@@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
     """Loader that loads College Confidential webpages."""
 
     def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages as Documents."""
         soup = self.scrape()
         text = soup.select_one("main[class='skin-handler']").text
         metadata = {"source": self.web_path}
diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py
index f792238ff3..b7608f23e7 100644
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@@ -33,8 +33,9 @@ class ContentFormat(str, Enum):
 
 
 class ConfluenceLoader(BaseLoader):
-    """
-    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    """Load Confluence pages.
+
+    Port of https://llamahub.ai/l/confluence
 
     This currently supports username/api_key, Oauth2 login or personal access token
     authentication.
@@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
                 "key_cert",
             ]:
                 errors.append(
-                    "You have either ommited require keys or added extra "
+                    "You have either omitted required keys or added extra "
                     "keys to the oauth2 dictionary. key values should be "
                    "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
                 )
@@ -340,10 +341,10 @@ class ConfluenceLoader(BaseLoader):
         """Paginate the various methods to retrieve groups of pages.
 
         Unfortunately, due to page size, sometimes the Confluence API
-        doesn't match the limit value.  If `limit` is >100 confluence
+        doesn't match the limit value. If `limit` is >100, Confluence
         seems to cap the response to 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
-        they only return the value from the results key. So here, the pagination
+        they only return the values from the "results" key. So here, the pagination
         starts from 0 and goes until the max_pages, getting the `limit` number of
         pages with each request. We have to manually check if there are more docs
         based on the length of the returned list of pages, rather than
diff --git a/langchain/document_loaders/conllu.py b/langchain/document_loaders/conllu.py
index f1fc12da40..939ada2075 100644
--- a/langchain/document_loaders/conllu.py
+++ b/langchain/document_loaders/conllu.py
@@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
     """Load CoNLL-U files."""
 
     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         self.file_path = file_path
 
     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         with open(self.file_path, encoding="utf8") as f:
             tsv = list(csv.reader(f, delimiter="\t"))
 
diff --git a/langchain/document_loaders/csv_loader.py b/langchain/document_loaders/csv_loader.py
index 3d5e47b1a6..17bb84df26 100644
--- a/langchain/document_loaders/csv_loader.py
+++ b/langchain/document_loaders/csv_loader.py
@@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
         csv_args: Optional[Dict] = None,
         encoding: Optional[str] = None,
     ):
+        """Initialize with a file path and optional CSV parsing arguments.
+
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the
+                source. Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+                Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
         self.file_path = file_path
         self.source_column = source_column
         self.encoding = encoding
@@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """Initialize with a file path and partition mode.
+
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+                Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.8")
 
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
diff --git a/langchain/document_loaders/dataframe.py b/langchain/document_loaders/dataframe.py
index e0218a3978..b1a9194c90 100644
--- a/langchain/document_loaders/dataframe.py
+++ b/langchain/document_loaders/dataframe.py
@@ -1,4 +1,4 @@
-"""Load from Dataframe object"""
+"""Load from a DataFrame object."""
 from typing import Any, Iterator, List
 
 from langchain.docstore.document import Document
@@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class DataFrameLoader(BaseLoader):
-    """Load Pandas DataFrames."""
+    """Load a Pandas DataFrame."""
 
     def __init__(self, data_frame: Any, page_content_column: str = "text"):
-        """Initialize with dataframe object."""
+        """Initialize with a DataFrame object.
+
+        Args:
+            data_frame: Pandas DataFrame object.
+            page_content_column: Name of the column containing the page content.
+                Defaults to "text".
+        """
         import pandas as pd
 
         if not isinstance(data_frame, pd.DataFrame):
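For reference, a minimal usage sketch of the CSVLoader arguments documented above (the file name and column names are hypothetical):

    from langchain.document_loaders import CSVLoader

    # Hypothetical CSV with a "website" column to use as each row's source.
    loader = CSVLoader(
        file_path="data.csv",            # path to the CSV file
        source_column="website",         # optional; defaults to None
        csv_args={"delimiter": ";"},     # passed through to csv.DictReader
        encoding="utf-8",                # optional; defaults to None
    )
    docs = loader.load()                 # one Document per CSV row
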
+ """ self.api_token = api_token self.urls = urls self.continue_on_failure = continue_on_failure @@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader): return response.json() if response.ok else {} def load(self) -> List[Document]: - """Extract text from Diffbot on all the URLs and return Document instances""" + """Extract text from Diffbot on all the URLs and return Documents""" docs: List[Document] = list() for url in self.urls: diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py index 003d6f0122..6da941a05c 100644 --- a/langchain/document_loaders/directory.py +++ b/langchain/document_loaders/directory.py @@ -1,4 +1,4 @@ -"""Loading logic for loading documents from a directory.""" +"""Load documents from a directory.""" import concurrent import logging from pathlib import Path @@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool: class DirectoryLoader(BaseLoader): - """Loading logic for loading documents from a directory.""" + """Load documents from a directory.""" def __init__( self, @@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader): use_multithreading: bool = False, max_concurrency: int = 4, ): - """Initialize with path to directory and how to glob over it.""" + """Initialize with a path to directory and how to glob over it. + + Args: + path: Path to directory. + glob: Glob pattern to use to find files. Defaults to "**/[!.]*" + (all files except hidden). + silent_errors: Whether to silently ignore errors. Defaults to False. + load_hidden: Whether to load hidden files. Defaults to False. + loader_cls: Loader class to use for loading files. + Defaults to UnstructuredFileLoader. + loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None. + recursive: Whether to recursively search for files. Defaults to False. + show_progress: Whether to show a progress bar. Defaults to False. + use_multithreading: Whether to use multithreading. Defaults to False. + max_concurrency: The maximum number of threads to use. Defaults to 4. + """ if loader_kwargs is None: loader_kwargs = {} self.path = path @@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader): def load_file( self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any] ) -> None: + """Load a file. + + Args: + item: File path. + path: Directory path. + docs: List of documents to append to. + pbar: Progress bar. Defaults to None. + """ if item.is_file(): if _is_visible(item.relative_to(path)) or self.load_hidden: try: diff --git a/langchain/document_loaders/discord.py b/langchain/document_loaders/discord.py index 4f4da44036..be6f290d70 100644 --- a/langchain/document_loaders/discord.py +++ b/langchain/document_loaders/discord.py @@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader): """Load Discord chat logs.""" def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"): - """Initialize with a Pandas DataFrame containing chat logs.""" + """Initialize with a Pandas DataFrame containing chat logs. + + Args: + chat_log: Pandas DataFrame containing chat logs. + user_id_col: Name of the column containing the user ID. Defaults to "ID". 
+ """ if not isinstance(chat_log, pd.DataFrame): raise ValueError( f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}" diff --git a/langchain/document_loaders/docugami.py b/langchain/document_loaders/docugami.py index 449b968744..b60326f24c 100644 --- a/langchain/document_loaders/docugami.py +++ b/langchain/document_loaders/docugami.py @@ -1,4 +1,4 @@ -"""Loader that loads processed documents from Docugami.""" +"""Loads processed documents from Docugami.""" import io import logging @@ -29,22 +29,35 @@ logger = logging.getLogger(__name__) class DocugamiLoader(BaseLoader, BaseModel): - """Loader that loads processed docs from Docugami. + """Loads processed docs from Docugami. To use, you should have the ``lxml`` python package installed. """ api: str = DEFAULT_API_ENDPOINT + """The Docugami API endpoint to use.""" access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY") + """The Docugami API access token to use.""" docset_id: Optional[str] + """The Docugami API docset ID to use.""" document_ids: Optional[Sequence[str]] + """The Docugami API document IDs to use.""" file_paths: Optional[Sequence[Union[Path, str]]] + """The local file paths to use.""" min_chunk_size: int = 32 # appended to the next chunk to avoid over-chunking + """The minimum chunk size to use when parsing DGML. Defaults to 32.""" @root_validator def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]: - """Validate that either local file paths are given, or remote API docset ID.""" + """Validate that either local file paths are given, or remote API docset ID. + + Args: + values: The values to validate. + + Returns: + The validated values. + """ if values.get("file_paths") and values.get("docset_id"): raise ValueError("Cannot specify both file_paths and remote API docset_id") diff --git a/langchain/document_loaders/duckdb_loader.py b/langchain/document_loaders/duckdb_loader.py index 43948fe37d..f1805f69e1 100644 --- a/langchain/document_loaders/duckdb_loader.py +++ b/langchain/document_loaders/duckdb_loader.py @@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader): page_content_columns: Optional[List[str]] = None, metadata_columns: Optional[List[str]] = None, ): + """ + + Args: + query: The query to execute. + database: The database to connect to. Defaults to ":memory:". + read_only: Whether to open the database in read-only mode. + Defaults to False. + config: A dictionary of configuration options to pass to the database. + Optional. + page_content_columns: The columns to write into the `page_content` + of the document. Optional. + metadata_columns: The columns to write into the `metadata` of the document. + Optional. + """ self.query = query self.database = database self.read_only = read_only diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py index f68fcbec36..968502b153 100644 --- a/langchain/document_loaders/email.py +++ b/langchain/document_loaders/email.py @@ -1,4 +1,4 @@ -"""Loader that loads email files.""" +"""Loads email files.""" import os from typing import Any, List @@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader): class OutlookMessageLoader(BaseLoader): """ - Loader that loads Outlook Message files using extract_msg. + Loads Outlook Message files using extract_msg. + https://github.com/TeamMsgExtractor/msg-extractor """ def __init__(self, file_path: str): - """Initialize with file path.""" + """Initialize with a file path. + + Args: + file_path: The path to the Outlook Message file. 
+ """ self.file_path = file_path diff --git a/langchain/document_loaders/embaas.py b/langchain/document_loaders/embaas.py index 34bad0bcfa..89959f9a59 100644 --- a/langchain/document_loaders/embaas.py +++ b/langchain/document_loaders/embaas.py @@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters): class BaseEmbaasLoader(BaseModel): + """Base class for embedding a model into an Embaas document extraction API.""" + embaas_api_key: Optional[str] = None + """The API key for the embaas document extraction API.""" api_url: str = EMBAAS_DOC_API_URL """The URL of the embaas document extraction API.""" params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters() @@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel): class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): - """Wrapper around embaas's document byte loader service. + """Embaas's document byte loader. To use, you should have the environment variable ``EMBAAS_API_KEY`` set with your API key, or pass @@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): """ def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Parses the blob lazily. + + Args: + blob: The blob to parse. + """ yield from self._get_documents(blob=blob) @staticmethod @@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): class EmbaasLoader(BaseEmbaasLoader, BaseLoader): - """Wrapper around embaas's document loader service. + """Embaas's document loader. To use, you should have the environment variable ``EMBAAS_API_KEY`` set with your API key, or pass diff --git a/langchain/document_loaders/evernote.py b/langchain/document_loaders/evernote.py index be9a05ac32..40056924e3 100644 --- a/langchain/document_loaders/evernote.py +++ b/langchain/document_loaders/evernote.py @@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader class EverNoteLoader(BaseLoader): """EverNote Loader. + Loads an EverNote notebook export file e.g. my_notebook.enex into Documents. Instructions on producing this file can be found at https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML diff --git a/langchain/document_loaders/excel.py b/langchain/document_loaders/excel.py index 94e6fb1bd9..54e96bf269 100644 --- a/langchain/document_loaders/excel.py +++ b/langchain/document_loaders/excel.py @@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader): def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any ): + """ + + Args: + file_path: The path to the Microsoft Excel file. + mode: The mode to use when partitioning the file. See unstructured docs + for more info. Optional. Defaults to "single". + **unstructured_kwargs: Keyword arguments to pass to unstructured. + """ validate_unstructured_version(min_unstructured_version="0.6.7") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/langchain/document_loaders/facebook_chat.py b/langchain/document_loaders/facebook_chat.py index 4063619845..53ec5b216a 100644 --- a/langchain/document_loaders/facebook_chat.py +++ b/langchain/document_loaders/facebook_chat.py @@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader def concatenate_rows(row: dict) -> str: - """Combine message information in a readable format ready to be used.""" + """Combine message information in a readable format ready to be used. + + Args: + row: dictionary containing message information. 
+ """ sender = row["sender_name"] text = row["content"] date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime( @@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str: class FacebookChatLoader(BaseLoader): - """Loader that loads Facebook messages json directory dump.""" + """Loads Facebook messages json directory dump.""" def __init__(self, path: str): - """Initialize with path.""" + """Initialize with a path.""" self.file_path = path def load(self) -> List[Document]: diff --git a/langchain/document_loaders/figma.py b/langchain/document_loaders/figma.py index 8a1a4722d3..37b842719b 100644 --- a/langchain/document_loaders/figma.py +++ b/langchain/document_loaders/figma.py @@ -9,10 +9,16 @@ from langchain.utils import stringify_dict class FigmaFileLoader(BaseLoader): - """Loader that loads Figma file json.""" + """Loads Figma file json.""" def __init__(self, access_token: str, ids: str, key: str): - """Initialize with access token, ids, and key.""" + """Initialize with access token, ids, and key. + + Args: + access_token: The access token for the Figma REST API. + ids: The ids of the Figma file. + key: The key for the Figma file + """ self.access_token = access_token self.ids = ids self.key = key diff --git a/langchain/document_loaders/gcs_directory.py b/langchain/document_loaders/gcs_directory.py index 4b81012b2a..469723f565 100644 --- a/langchain/document_loaders/gcs_directory.py +++ b/langchain/document_loaders/gcs_directory.py @@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader class GCSDirectoryLoader(BaseLoader): - """Loading logic for loading documents from GCS.""" + """Loads Documents from GCS.""" def __init__(self, project_name: str, bucket: str, prefix: str = ""): - """Initialize with bucket and key name.""" + """Initialize with bucket and key name. + + Args: + project_name: The name of the project for the GCS bucket. + bucket: The name of the GCS bucket. + prefix: The prefix of the GCS bucket. + """ self.project_name = project_name self.bucket = bucket self.prefix = prefix @@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader): try: from google.cloud import storage except ImportError: - raise ValueError( + raise ImportError( "Could not import google-cloud-storage python package. " "Please install it with `pip install google-cloud-storage`." ) diff --git a/langchain/document_loaders/gcs_file.py b/langchain/document_loaders/gcs_file.py index b1dc43e383..6d41d2c410 100644 --- a/langchain/document_loaders/gcs_file.py +++ b/langchain/document_loaders/gcs_file.py @@ -1,4 +1,4 @@ -"""Loading logic for loading documents from a GCS file.""" +"""Load documents from a GCS file.""" import os import tempfile from typing import List @@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class GCSFileLoader(BaseLoader): - """Loading logic for loading documents from GCS.""" + """Load Documents from a GCS file.""" def __init__(self, project_name: str, bucket: str, blob: str): - """Initialize with bucket and key name.""" + """Initialize with bucket and key name. + + Args: + project_name: The name of the project to load + bucket: The name of the GCS bucket. + blob: The name of the GCS blob to load. + """ self.bucket = bucket self.blob = blob self.project_name = project_name @@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader): try: from google.cloud import storage except ImportError: - raise ValueError( + raise ImportError( "Could not import google-cloud-storage python package. 
" "Please install it with `pip install google-cloud-storage`." ) diff --git a/langchain/document_loaders/git.py b/langchain/document_loaders/git.py index 3bf6efaa53..3898381170 100644 --- a/langchain/document_loaders/git.py +++ b/langchain/document_loaders/git.py @@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader class GitLoader(BaseLoader): """Loads files from a Git repository into a list of documents. - Repository can be local on disk available at `repo_path`, + The Repository can be local on disk available at `repo_path`, or remote at `clone_url` that will be cloned to `repo_path`. - Currently supports only text files. + Currently, supports only text files. Each document represents one file in the repository. The `path` points to the local Git repository, and the `branch` specifies the branch to load @@ -23,6 +23,15 @@ class GitLoader(BaseLoader): branch: Optional[str] = "main", file_filter: Optional[Callable[[str], bool]] = None, ): + """ + + Args: + repo_path: The path to the Git repository. + clone_url: Optional. The URL to clone the repository from. + branch: Optional. The branch to load files from. Defaults to `main`. + file_filter: Optional. A function that takes a file path and returns + a boolean indicating whether to load the file. Defaults to None. + """ self.repo_path = repo_path self.clone_url = clone_url self.branch = branch diff --git a/langchain/document_loaders/gitbook.py b/langchain/document_loaders/gitbook.py index f47c9dc1a2..aa286da485 100644 --- a/langchain/document_loaders/gitbook.py +++ b/langchain/document_loaders/gitbook.py @@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader): load_all_paths: If set to True, all relative paths in the navbar are loaded instead of only `web_page`. base_url: If `load_all_paths` is True, the relative paths are - appended to this base url. Defaults to `web_page` if not set. + appended to this base url. Defaults to `web_page`. + content_selector: The CSS selector for the content to load. + Defaults to "main". 
""" self.base_url = base_url or web_page if self.base_url.endswith("/"): diff --git a/langchain/document_loaders/github.py b/langchain/document_loaders/github.py index 0dd8389477..2a55e3d002 100644 --- a/langchain/document_loaders/github.py +++ b/langchain/document_loaders/github.py @@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC): class GitHubIssuesLoader(BaseGitHubLoader): + """Load issues of a GitHub repository.""" + include_prs: bool = True """If True include Pull Requests in results, otherwise ignore them.""" milestone: Union[int, Literal["*", "none"], None] = None @@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader): @property def query_params(self) -> str: + """Create query parameters for GitHub API.""" labels = ",".join(self.labels) if self.labels else self.labels query_params_dict = { "milestone": self.milestone, @@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader): @property def url(self) -> str: + """Create URL for GitHub API.""" return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}" diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index 56c4ae4535..cc4b3f1d6b 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] class GoogleDriveLoader(BaseLoader, BaseModel): - """Loader that loads Google Docs from Google Drive.""" + """Loads Google Docs from Google Drive.""" service_account_key: Path = Path.home() / ".credentials" / "keys.json" + """Path to the service account key file.""" credentials_path: Path = Path.home() / ".credentials" / "credentials.json" + """Path to the credentials file.""" token_path: Path = Path.home() / ".credentials" / "token.json" + """Path to the token file.""" folder_id: Optional[str] = None + """The folder id to load from.""" document_ids: Optional[List[str]] = None + """The document ids to load from.""" file_ids: Optional[List[str]] = None + """The file ids to load from.""" recursive: bool = False + """Whether to load recursively. Only applies when folder_id is given.""" file_types: Optional[Sequence[str]] = None + """The file types to load. Only applies when folder_id is given.""" load_trashed_files: bool = False + """Whether to load trashed files. 
Only applies when folder_id is given.""" # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently # results in pydantic validation errors file_loader_cls: Any = None + """The file loader class to use.""" file_loader_kwargs: Dict["str", Any] = {} + """The file loader kwargs to use.""" @root_validator def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]: diff --git a/langchain/document_loaders/gutenberg.py b/langchain/document_loaders/gutenberg.py index 41a0a5f55a..bcf1370dd8 100644 --- a/langchain/document_loaders/gutenberg.py +++ b/langchain/document_loaders/gutenberg.py @@ -1,4 +1,4 @@ -"""Loader that loads .txt web files.""" +"""Loads .txt web files.""" from typing import List from langchain.docstore.document import Document @@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader): """Loader that uses urllib to load .txt web files.""" def __init__(self, file_path: str): - """Initialize with file path.""" + """Initialize with a file path.""" if not file_path.startswith("https://www.gutenberg.org"): raise ValueError("file path must start with 'https://www.gutenberg.org'") diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py index 3ccf4f7d77..c48d0b8eb1 100644 --- a/langchain/document_loaders/helpers.py +++ b/langchain/document_loaders/helpers.py @@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast class FileEncoding(NamedTuple): + """A file encoding as the NamedTuple.""" + encoding: Optional[str] + """The encoding of the file.""" confidence: float + """The confidence of the encoding.""" language: Optional[str] + """The language of the file.""" def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]: @@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding Returns a list of `FileEncoding` tuples with the detected encodings ordered by confidence. + + Args: + file_path: The path to the file to detect the encoding for. + timeout: The timeout in seconds for the encoding detection. """ import chardet diff --git a/langchain/document_loaders/hn.py b/langchain/document_loaders/hn.py index 91ff8d9d5e..d786285f70 100644 --- a/langchain/document_loaders/hn.py +++ b/langchain/document_loaders/hn.py @@ -1,4 +1,4 @@ -"""Loader that loads HN.""" +"""Loader that loads Hacker News.""" from typing import Any, List from langchain.docstore.document import Document @@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader): def load(self) -> List[Document]: """Get important HN webpage information. - Components are: + HN webpage components are: - title - content - source url, diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py index 4a73187ad3..cebf3c1baf 100644 --- a/langchain/document_loaders/html_bs.py +++ b/langchain/document_loaders/html_bs.py @@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader): get_text_separator: str = "", ) -> None: """Initialise with path, and optionally, file encoding to use, and any kwargs - to pass to the BeautifulSoup object.""" + to pass to the BeautifulSoup object. + + Args: + file_path: The path to the file to load. + open_encoding: The encoding to use when opening the file. + bs_kwargs: Any kwargs to pass to the BeautifulSoup object. + get_text_separator: The separator to use when calling get_text on the soup. 
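A minimal usage sketch of the detect_file_encodings helper documented above (file name hypothetical; relies on the chardet package):

    from langchain.document_loaders.helpers import detect_file_encodings

    # Candidate encodings, ordered by confidence.
    encodings = detect_file_encodings("legacy_notes.txt", timeout=5)
    best = encodings[0]
    print(best.encoding, best.confidence, best.language)
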
+ """ try: import bs4 # noqa:F401 except ImportError: - raise ValueError( + raise ImportError( "beautifulsoup4 package not found, please install it with " "`pip install beautifulsoup4`" ) @@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader): self.get_text_separator = get_text_separator def load(self) -> List[Document]: + """Load HTML document into document objects.""" from bs4 import BeautifulSoup - """Load HTML document into document objects.""" with open(self.file_path, "r", encoding=self.open_encoding) as f: soup = BeautifulSoup(f, **self.bs_kwargs) diff --git a/langchain/document_loaders/hugging_face_dataset.py b/langchain/document_loaders/hugging_face_dataset.py index 12b0af92de..17b823dd21 100644 --- a/langchain/document_loaders/hugging_face_dataset.py +++ b/langchain/document_loaders/hugging_face_dataset.py @@ -1,4 +1,4 @@ -"""Loader that loads HuggingFace datasets.""" +"""Loads HuggingFace datasets.""" from typing import Iterator, List, Mapping, Optional, Sequence, Union from langchain.docstore.document import Document @@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class HuggingFaceDatasetLoader(BaseLoader): - """Loading logic for loading documents from the Hugging Face Hub.""" + """Load Documents from the Hugging Face Hub.""" def __init__( self, @@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader): Args: path: Path or name of the dataset. - page_content_column: Page content column name. + page_content_column: Page content column name. Default is "text". name: Name of the dataset configuration. data_dir: Data directory of the dataset configuration. data_files: Path(s) to source data file(s). cache_dir: Directory to read/write data. keep_in_memory: Whether to copy the dataset in-memory. save_infos: Save the dataset information (checksums/size/splits/...). - use_auth_token: Bearer token for remote files on the Datasets Hub. + Default is False. + use_auth_token: Bearer token for remote files on the Dataset Hub. num_proc: Number of processes. """ diff --git a/langchain/document_loaders/ifixit.py b/langchain/document_loaders/ifixit.py index 61169ade60..8e3f42c8cd 100644 --- a/langchain/document_loaders/ifixit.py +++ b/langchain/document_loaders/ifixit.py @@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader): """ def __init__(self, web_path: str): - """Initialize with web path.""" + """Initialize with a web path.""" if not web_path.startswith("https://www.ifixit.com"): raise ValueError("web path must start with 'https://www.ifixit.com'") @@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader): @staticmethod def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]: + """Load suggestions. + + Args: + query: A query string + doc_type: The type of document to search for. Can be one of "all", + "device", "guide", "teardown", "answer", "wiki". + + Returns: + + """ res = requests.get( IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type ) @@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader): def load_questions_and_answers( self, url_override: Optional[str] = None ) -> List[Document]: + """Load a list of questions and answers. + + Args: + url_override: A URL to override the default URL. 
diff --git a/langchain/document_loaders/ifixit.py b/langchain/document_loaders/ifixit.py
index 61169ade60..8e3f42c8cd 100644
--- a/langchain/document_loaders/ifixit.py
+++ b/langchain/document_loaders/ifixit.py
@@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
     """
 
     def __init__(self, web_path: str):
-        """Initialize with web path."""
+        """Initialize with a web path."""
         if not web_path.startswith("https://www.ifixit.com"):
             raise ValueError("web path must start with 'https://www.ifixit.com'")
 
@@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):
 
     @staticmethod
     def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        """Load suggestions.
+
+        Args:
+            query: A query string.
+            doc_type: The type of document to search for. Can be one of "all",
+                "device", "guide", "teardown", "answer", "wiki".
+
+        Returns: List[Document]
+
+        """
         res = requests.get(
             IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
         )
@@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
     def load_questions_and_answers(
         self, url_override: Optional[str] = None
     ) -> List[Document]:
+        """Load a list of questions and answers.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         loader = WebBaseLoader(self.web_path if url_override is None else url_override)
         soup = loader.scrape()
 
@@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
     def load_device(
         self, url_override: Optional[str] = None, include_guides: bool = True
     ) -> List[Document]:
+        """Loads a device.
+
+        Args:
+            url_override: A URL to override the default URL.
+            include_guides: Whether to include guides linked to from the device.
+                Defaults to True.
+
+        Returns: List[Document]
+
+        """
         documents = []
         if url_override is None:
             url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
@@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
         return documents
 
     def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        """Load a guide.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         if url_override is None:
             url = IFIXIT_BASE_URL + "/guides/" + self.id
         else:
diff --git a/langchain/document_loaders/image_captions.py b/langchain/document_loaders/image_captions.py
index 6df4a03d05..53e3c694e2 100644
--- a/langchain/document_loaders/image_captions.py
+++ b/langchain/document_loaders/image_captions.py
@@ -1,5 +1,5 @@
-"""
-Loader that loads image captions
+"""Loads image captions.
+
 By default, the loader utilizes the pre-trained BLIP image captioning model.
 https://huggingface.co/Salesforce/blip-image-captioning-base
 
@@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ImageCaptionLoader(BaseLoader):
-    """Loader that loads the captions of an image"""
+    """Loads the captions of an image."""
 
     def __init__(
         self,
@@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
     ):
         """
         Initialize with a list of image paths
+
+        Args:
+            path_images: A list of image paths.
+            blip_processor: The name of the pre-trained BLIP processor.
+            blip_model: The name of the pre-trained BLIP model.
         """
         if isinstance(path_images, str):
             self.image_paths = [path_images]
diff --git a/langchain/document_loaders/imsdb.py b/langchain/document_loaders/imsdb.py
index 4589553d33..312e25a850 100644
--- a/langchain/document_loaders/imsdb.py
+++ b/langchain/document_loaders/imsdb.py
@@ -1,4 +1,4 @@
-"""Loader that loads IMSDb."""
+"""Loads IMSDb."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
 
 
 class IMSDbLoader(WebBaseLoader):
-    """Loader that loads IMSDb webpages."""
+    """Loads IMSDb webpages."""
 
     def load(self) -> List[Document]:
         """Load webpage."""
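A minimal usage sketch of ImageCaptionLoader (image path hypothetical; requires the transformers package, and the BLIP model is downloaded on first use):

    from langchain.document_loaders import ImageCaptionLoader

    # Each image yields one Document whose page_content is the BLIP caption.
    loader = ImageCaptionLoader(path_images=["photos/cat.png"])
    docs = loader.load()
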
+ """ self.resource = resource api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN") self.headers = {"Authorization": f"Bearer {api_token}"} diff --git a/langchain/document_loaders/joplin.py b/langchain/document_loaders/joplin.py index 0dc6730672..0cce974622 100644 --- a/langchain/document_loaders/joplin.py +++ b/langchain/document_loaders/joplin.py @@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader): port: int = 41184, host: str = "localhost", ) -> None: + """ + + Args: + access_token: The access token to use. + port: The port where the Web Clipper service is running. Default is 41184. + host: The host where the Web Clipper service is running. + Default is localhost. + """ access_token = access_token or get_from_env( "access_token", "JOPLIN_ACCESS_TOKEN" ) diff --git a/langchain/document_loaders/json_loader.py b/langchain/document_loaders/json_loader.py index 153c321b09..c31b9a4848 100644 --- a/langchain/document_loaders/json_loader.py +++ b/langchain/document_loaders/json_loader.py @@ -1,4 +1,4 @@ -"""Loader that loads data from JSON.""" +"""Loads data from JSON.""" import json from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union @@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader class JSONLoader(BaseLoader): - """Loads a JSON file and references a jq schema provided to load the text into - documents. + """Loads a JSON file using a jq schema. Example: [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text @@ -101,7 +100,7 @@ class JSONLoader(BaseLoader): return str(content) if content is not None else "" def _validate_content_key(self, data: Any) -> None: - """Check if content key is valid""" + """Check if a content key is valid""" sample = data.first() if not isinstance(sample, dict): raise ValueError( diff --git a/langchain/document_loaders/larksuite.py b/langchain/document_loaders/larksuite.py index b7263e4470..d57c10b898 100644 --- a/langchain/document_loaders/larksuite.py +++ b/langchain/document_loaders/larksuite.py @@ -1,4 +1,4 @@ -"""Loader that loads LarkSuite (FeiShu) document json dump.""" +"""Loads LarkSuite (FeiShu) document json dump.""" import json import urllib.request from typing import Any, Iterator, List @@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader class LarkSuiteDocLoader(BaseLoader): - """Loader that loads LarkSuite (FeiShu) document.""" + """Loads LarkSuite (FeiShu) document.""" def __init__(self, domain: str, access_token: str, document_id: str): - """Initialize with domain, access_token (tenant / user), and document_id.""" + """Initialize with domain, access_token (tenant / user), and document_id. + + Args: + domain: The domain to load the LarkSuite. + access_token: The access_token to use. + document_id: The document_id to load. 
+ """ self.domain = domain self.access_token = access_token self.document_id = document_id diff --git a/langchain/document_loaders/markdown.py b/langchain/document_loaders/markdown.py index db7b8094d8..3ecad43905 100644 --- a/langchain/document_loaders/markdown.py +++ b/langchain/document_loaders/markdown.py @@ -1,4 +1,4 @@ -"""Loader that loads Markdown files.""" +"""Loads Markdown files.""" from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader diff --git a/langchain/document_loaders/mastodon.py b/langchain/document_loaders/mastodon.py index db4b308ace..ef64cf463a 100644 --- a/langchain/document_loaders/mastodon.py +++ b/langchain/document_loaders/mastodon.py @@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon: try: import mastodon except ImportError: - raise ValueError( + raise ImportError( "Mastodon.py package not found, " "please install it with `pip install Mastodon.py`" ) @@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader): Args: mastodon_accounts: The list of Mastodon accounts to query. - number_toots: How many toots to pull for each account. + number_toots: How many toots to pull for each account. Default is 100. exclude_replies: Whether to exclude reply toots from the load. + Default is False. access_token: An access token if toots are loaded as a Mastodon app. Can also be specified via the environment variables "MASTODON_ACCESS_TOKEN". api_base_url: A Mastodon API base URL to talk to, if not using the default. + Default is "https://mastodon.social". """ mastodon = _dependable_mastodon_import() access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN") diff --git a/langchain/document_loaders/mediawikidump.py b/langchain/document_loaders/mediawikidump.py index 68cd29f32a..56d79bdb4e 100644 --- a/langchain/document_loaders/mediawikidump.py +++ b/langchain/document_loaders/mediawikidump.py @@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader): """ def __init__(self, file_path: str, encoding: Optional[str] = "utf8"): - """Initialize with file path.""" + """Initialize with a file path. + + Args: + file_path: XML local file path + encoding: Charset encoding, defaults to "utf8" + """ self.file_path = file_path self.encoding = encoding def load(self) -> List[Document]: - """Load from file path.""" + """Load from a file path.""" import mwparserfromhell import mwxml diff --git a/langchain/document_loaders/mhtml.py b/langchain/document_loaders/mhtml.py index 27d3eceb12..6f1a4699ea 100644 --- a/langchain/document_loaders/mhtml.py +++ b/langchain/document_loaders/mhtml.py @@ -1,4 +1,4 @@ -"""Loader to load MHTML files, enriching metadata with page title.""" +"""Load MHTML files, enriching metadata with page title.""" import email import logging @@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader): get_text_separator: str = "", ) -> None: """Initialise with path, and optionally, file encoding to use, and any kwargs - to pass to the BeautifulSoup object.""" + to pass to the BeautifulSoup object. + + Args: + file_path: The path to the file to load. + open_encoding: The encoding to use when opening the file. + bs_kwargs: soup kwargs to pass to the BeautifulSoup object. + get_text_separator: The separator to use when getting text from the soup. + """ try: import bs4 # noqa:F401 except ImportError: - raise ValueError( + raise ImportError( "beautifulsoup4 package not found, please install it with " "`pip install beautifulsoup4`" )