docstrings `document_loaders` 2 (#6890)

Updated the docstrings for the `document_loaders` module.

Maintainer responsibilities:
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
pull/6937/head
Leonid Ganeline 1 year ago committed by GitHub
parent 77ae8084a0
commit 1feac83323

@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
"""Loader that loads College Confidential webpages."""
def load(self) -> List[Document]:
"""Load webpage."""
"""Load webpages as Documents."""
soup = self.scrape()
text = soup.select_one("main[class='skin-handler']").text
metadata = {"source": self.web_path}

@ -33,8 +33,9 @@ class ContentFormat(str, Enum):
class ConfluenceLoader(BaseLoader):
"""
Load Confluence pages. Port of https://llamahub.ai/l/confluence
"""Load Confluence pages.
Port of https://llamahub.ai/l/confluence
This currently supports username/api_key, Oauth2 login or personal access token
authentication.
@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
"key_cert",
]:
errors.append(
"You have either ommited require keys or added extra "
"You have either omitted require keys or added extra "
"keys to the oauth2 dictionary. key values should be "
"`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
)
@ -340,10 +341,10 @@ class ConfluenceLoader(BaseLoader):
"""Paginate the various methods to retrieve groups of pages.
Unfortunately, due to page size, sometimes the Confluence API
doesn't match the limit value. If `limit` is >100 confluence
seems to cap the response to 100. Also, due to the Atlassian Python
package, we don't get the "next" values from the "_links" key because
they only return the value from the results key. So here, the pagination
they only return the value from the result key. So here, the pagination
starts from 0 and goes until the max_pages, getting the `limit` number
of pages with each request. We have to manually check if there
are more docs based on the length of the returned list of pages, rather than
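
For context, a minimal usage sketch of the loader documented above; the site URL, credentials, and space key are placeholders, and the keyword names follow the constructor and `load()` arguments referenced in this file.

```python
from langchain.document_loaders import ConfluenceLoader

# Placeholder site and credentials; username/api_key is one of the
# supported auth methods (OAuth2 and personal access tokens also work).
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",
    username="me@example.com",
    api_key="<api-key>",
)

# `limit` is the page size per request; Confluence caps it at 100, so
# pagination runs from 0 up to `max_pages` as the docstring above explains.
docs = loader.load(space_key="SPACE", limit=50, max_pages=200)
```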

@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
"""Load CoNLL-U files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
self.file_path = file_path
def load(self) -> List[Document]:
"""Load from file path."""
"""Load from a file path."""
with open(self.file_path, encoding="utf8") as f:
tsv = list(csv.reader(f, delimiter="\t"))

@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
csv_args: Optional[Dict] = None,
encoding: Optional[str] = None,
):
"""
Args:
file_path: The path to the CSV file.
source_column: The name of the column in the CSV file to use as the source.
Optional. Defaults to None.
csv_args: A dictionary of arguments to pass to the csv.DictReader.
Optional. Defaults to None.
encoding: The encoding of the CSV file. Optional. Defaults to None.
"""
self.file_path = file_path
self.source_column = source_column
self.encoding = encoding
@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Args:
file_path: The path to the CSV file.
mode: The mode to use when loading the CSV file.
Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.6.8")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
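
A short usage sketch for `CSVLoader` with the arguments documented above; the file path and column names are hypothetical.

```python
from langchain.document_loaders.csv_loader import CSVLoader

# "data.csv" and "source_url" are placeholder names for illustration.
loader = CSVLoader(
    file_path="data.csv",
    source_column="source_url",     # becomes the Document's source metadata
    csv_args={"delimiter": ";"},    # passed through to csv.DictReader
    encoding="utf-8",
)
docs = loader.load()                # one Document per CSV row
```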

@ -1,4 +1,4 @@
"""Load from Dataframe object"""
"""Load from a Dataframe object"""
from typing import Any, Iterator, List
from langchain.docstore.document import Document
@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader
class DataFrameLoader(BaseLoader):
"""Load Pandas DataFrames."""
"""Load Pandas DataFrame."""
def __init__(self, data_frame: Any, page_content_column: str = "text"):
"""Initialize with dataframe object."""
"""Initialize with dataframe object.
Args:
data_frame: Pandas DataFrame object.
page_content_column: Name of the column containing the page content.
Defaults to "text".
"""
import pandas as pd
if not isinstance(data_frame, pd.DataFrame):
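
A minimal sketch of `DataFrameLoader`, assuming a small hypothetical frame; the column names are placeholders.

```python
import pandas as pd
from langchain.document_loaders import DataFrameLoader

# The "text" column becomes page_content; the remaining columns
# (here "author") end up in each Document's metadata.
df = pd.DataFrame({"text": ["hello", "world"], "author": ["a", "b"]})
loader = DataFrameLoader(df, page_content_column="text")
docs = loader.load()
```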

@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
class DiffbotLoader(BaseLoader):
"""Loader that loads Diffbot file json."""
"""Loads Diffbot file json."""
def __init__(
self, api_token: str, urls: List[str], continue_on_failure: bool = True
):
"""Initialize with API token, ids, and key."""
"""Initialize with API token, ids, and key.
Args:
api_token: Diffbot API token.
urls: List of URLs to load.
continue_on_failure: Whether to continue loading other URLs if one fails.
Defaults to True.
"""
self.api_token = api_token
self.urls = urls
self.continue_on_failure = continue_on_failure
@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader):
return response.json() if response.ok else {}
def load(self) -> List[Document]:
"""Extract text from Diffbot on all the URLs and return Document instances"""
"""Extract text from Diffbot on all the URLs and return Documents"""
docs: List[Document] = list()
for url in self.urls:

@ -1,4 +1,4 @@
"""Loading logic for loading documents from a directory."""
"""Load documents from a directory."""
import concurrent
import logging
from pathlib import Path
@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool:
class DirectoryLoader(BaseLoader):
"""Loading logic for loading documents from a directory."""
"""Load documents from a directory."""
def __init__(
self,
@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader):
use_multithreading: bool = False,
max_concurrency: int = 4,
):
"""Initialize with path to directory and how to glob over it."""
"""Initialize with a path to directory and how to glob over it.
Args:
path: Path to directory.
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
(all files except hidden).
silent_errors: Whether to silently ignore errors. Defaults to False.
load_hidden: Whether to load hidden files. Defaults to False.
loader_cls: Loader class to use for loading files.
Defaults to UnstructuredFileLoader.
loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
recursive: Whether to recursively search for files. Defaults to False.
show_progress: Whether to show a progress bar. Defaults to False.
use_multithreading: Whether to use multithreading. Defaults to False.
max_concurrency: The maximum number of threads to use. Defaults to 4.
"""
if loader_kwargs is None:
loader_kwargs = {}
self.path = path
@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader):
def load_file(
self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
) -> None:
"""Load a file.
Args:
item: File path.
path: Directory path.
docs: List of documents to append to.
pbar: Progress bar. Defaults to None.
"""
if item.is_file():
if _is_visible(item.relative_to(path)) or self.load_hidden:
try:
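
A minimal usage sketch for `DirectoryLoader` with the arguments documented above; the directory, glob pattern, and the choice of `TextLoader` are illustrative assumptions, not part of this change.

```python
from langchain.document_loaders import DirectoryLoader, TextLoader

# "docs/" and the glob are placeholders; TextLoader is swapped in for the
# default UnstructuredFileLoader to avoid the unstructured dependency.
loader = DirectoryLoader(
    "docs/",
    glob="**/*.md",
    loader_cls=TextLoader,
    show_progress=True,        # requires tqdm
    use_multithreading=True,
    max_concurrency=4,
)
docs = loader.load()
```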

@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader):
"""Load Discord chat logs."""
def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
"""Initialize with a Pandas DataFrame containing chat logs."""
"""Initialize with a Pandas DataFrame containing chat logs.
Args:
chat_log: Pandas DataFrame containing chat logs.
user_id_col: Name of the column containing the user ID. Defaults to "ID".
"""
if not isinstance(chat_log, pd.DataFrame):
raise ValueError(
f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}"

@ -1,4 +1,4 @@
"""Loader that loads processed documents from Docugami."""
"""Loads processed documents from Docugami."""
import io
import logging
@ -29,22 +29,35 @@ logger = logging.getLogger(__name__)
class DocugamiLoader(BaseLoader, BaseModel):
"""Loader that loads processed docs from Docugami.
"""Loads processed docs from Docugami.
To use, you should have the ``lxml`` python package installed.
"""
api: str = DEFAULT_API_ENDPOINT
"""The Docugami API endpoint to use."""
access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
"""The Docugami API access token to use."""
docset_id: Optional[str]
"""The Docugami API docset ID to use."""
document_ids: Optional[Sequence[str]]
"""The Docugami API document IDs to use."""
file_paths: Optional[Sequence[Union[Path, str]]]
"""The local file paths to use."""
min_chunk_size: int = 32 # appended to the next chunk to avoid over-chunking
"""The minimum chunk size to use when parsing DGML. Defaults to 32."""
@root_validator
def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Validate that either local file paths are given, or remote API docset ID."""
"""Validate that either local file paths are given, or remote API docset ID.
Args:
values: The values to validate.
Returns:
The validated values.
"""
if values.get("file_paths") and values.get("docset_id"):
raise ValueError("Cannot specify both file_paths and remote API docset_id")

@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader):
page_content_columns: Optional[List[str]] = None,
metadata_columns: Optional[List[str]] = None,
):
"""
Args:
query: The query to execute.
database: The database to connect to. Defaults to ":memory:".
read_only: Whether to open the database in read-only mode.
Defaults to False.
config: A dictionary of configuration options to pass to the database.
Optional.
page_content_columns: The columns to write into the `page_content`
of the document. Optional.
metadata_columns: The columns to write into the `metadata` of the document.
Optional.
"""
self.query = query
self.database = database
self.read_only = read_only
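
A minimal sketch of `DuckDBLoader` using the arguments documented above; the query and column names are placeholders, and the `duckdb` package is assumed to be installed.

```python
from langchain.document_loaders import DuckDBLoader

loader = DuckDBLoader(
    query="SELECT id, body FROM read_csv_auto('items.csv')",
    database=":memory:",
    page_content_columns=["body"],   # written into page_content
    metadata_columns=["id"],         # written into metadata
)
docs = loader.load()
```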

@ -1,4 +1,4 @@
"""Loader that loads email files."""
"""Loads email files."""
import os
from typing import Any, List
@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
class OutlookMessageLoader(BaseLoader):
"""
Loader that loads Outlook Message files using extract_msg.
Loads Outlook Message files using extract_msg.
https://github.com/TeamMsgExtractor/msg-extractor
"""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path.
Args:
file_path: The path to the Outlook Message file.
"""
self.file_path = file_path

@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
class BaseEmbaasLoader(BaseModel):
"""Base class for embedding a model into an Embaas document extraction API."""
embaas_api_key: Optional[str] = None
"""The API key for the embaas document extraction API."""
api_url: str = EMBAAS_DOC_API_URL
"""The URL of the embaas document extraction API."""
params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
"""Wrapper around embaas's document byte loader service.
"""Embaas's document byte loader.
To use, you should have the
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
"""
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Parses the blob lazily.
Args:
blob: The blob to parse.
"""
yield from self._get_documents(blob=blob)
@staticmethod
@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
"""Wrapper around embaas's document loader service.
"""Embaas's document loader.
To use, you should have the
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass

@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader
class EverNoteLoader(BaseLoader):
"""EverNote Loader.
Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
Instructions on producing this file can be found at
https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Args:
file_path: The path to the Microsoft Excel file.
mode: The mode to use when partitioning the file. See unstructured docs
for more info. Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
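
A minimal sketch of `UnstructuredExcelLoader`; the file name is a placeholder and `unstructured>=0.6.7` is assumed to be installed.

```python
from langchain.document_loaders import UnstructuredExcelLoader

# mode="elements" keeps each partitioned element as its own Document,
# while the default "single" returns one Document per file.
loader = UnstructuredExcelLoader("report.xlsx", mode="elements")
docs = loader.load()
```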

@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row: dict) -> str:
"""Combine message information in a readable format ready to be used."""
"""Combine message information in a readable format ready to be used.
Args:
row: dictionary containing message information.
"""
sender = row["sender_name"]
text = row["content"]
date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str:
class FacebookChatLoader(BaseLoader):
"""Loader that loads Facebook messages json directory dump."""
"""Loads Facebook messages json directory dump."""
def __init__(self, path: str):
"""Initialize with path."""
"""Initialize with a path."""
self.file_path = path
def load(self) -> List[Document]:

@ -9,10 +9,16 @@ from langchain.utils import stringify_dict
class FigmaFileLoader(BaseLoader):
"""Loader that loads Figma file json."""
"""Loads Figma file json."""
def __init__(self, access_token: str, ids: str, key: str):
"""Initialize with access token, ids, and key."""
"""Initialize with access token, ids, and key.
Args:
access_token: The access token for the Figma REST API.
ids: The ids of the Figma file.
key: The key for the Figma file.
"""
self.access_token = access_token
self.ids = ids
self.key = key

@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
class GCSDirectoryLoader(BaseLoader):
"""Loading logic for loading documents from GCS."""
"""Loads Documents from GCS."""
def __init__(self, project_name: str, bucket: str, prefix: str = ""):
"""Initialize with bucket and key name."""
"""Initialize with bucket and key name.
Args:
project_name: The name of the project for the GCS bucket.
bucket: The name of the GCS bucket.
prefix: The prefix of the GCS bucket.
"""
self.project_name = project_name
self.bucket = bucket
self.prefix = prefix
@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader):
try:
from google.cloud import storage
except ImportError:
raise ValueError(
raise ImportError(
"Could not import google-cloud-storage python package. "
"Please install it with `pip install google-cloud-storage`."
)

@ -1,4 +1,4 @@
"""Loading logic for loading documents from a GCS file."""
"""Load documents from a GCS file."""
import os
import tempfile
from typing import List
@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class GCSFileLoader(BaseLoader):
"""Loading logic for loading documents from GCS."""
"""Load Documents from a GCS file."""
def __init__(self, project_name: str, bucket: str, blob: str):
"""Initialize with bucket and key name."""
"""Initialize with bucket and key name.
Args:
project_name: The name of the project for the GCS bucket.
bucket: The name of the GCS bucket.
blob: The name of the GCS blob to load.
"""
self.bucket = bucket
self.blob = blob
self.project_name = project_name
@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader):
try:
from google.cloud import storage
except ImportError:
raise ValueError(
raise ImportError(
"Could not import google-cloud-storage python package. "
"Please install it with `pip install google-cloud-storage`."
)

@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader
class GitLoader(BaseLoader):
"""Loads files from a Git repository into a list of documents.
Repository can be local on disk available at `repo_path`,
The Repository can be local on disk available at `repo_path`,
or remote at `clone_url` that will be cloned to `repo_path`.
Currently supports only text files.
Currently, only text files are supported.
Each document represents one file in the repository. The `path` points to
the local Git repository, and the `branch` specifies the branch to load
@ -23,6 +23,15 @@ class GitLoader(BaseLoader):
branch: Optional[str] = "main",
file_filter: Optional[Callable[[str], bool]] = None,
):
"""
Args:
repo_path: The path to the Git repository.
clone_url: Optional. The URL to clone the repository from.
branch: Optional. The branch to load files from. Defaults to `main`.
file_filter: Optional. A function that takes a file path and returns
a boolean indicating whether to load the file. Defaults to None.
"""
self.repo_path = repo_path
self.clone_url = clone_url
self.branch = branch
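
A minimal sketch of `GitLoader` with the arguments documented above; the repository path, clone URL, and filter are placeholders.

```python
from langchain.document_loaders import GitLoader

# Clones the remote repository into repo_path if it is not already there;
# file_filter keeps only Python files in this example.
loader = GitLoader(
    repo_path="./example_repo",
    clone_url="https://github.com/<org>/<repo>",
    branch="main",
    file_filter=lambda file_path: file_path.endswith(".py"),
)
docs = loader.load()
```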

@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader):
load_all_paths: If set to True, all relative paths in the navbar
are loaded instead of only `web_page`.
base_url: If `load_all_paths` is True, the relative paths are
appended to this base url. Defaults to `web_page` if not set.
appended to this base url. Defaults to `web_page`.
content_selector: The CSS selector for the content to load.
Defaults to "main".
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):

@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
class GitHubIssuesLoader(BaseGitHubLoader):
"""Load issues of a GitHub repository."""
include_prs: bool = True
"""If True include Pull Requests in results, otherwise ignore them."""
milestone: Union[int, Literal["*", "none"], None] = None
@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
@property
def query_params(self) -> str:
"""Create query parameters for GitHub API."""
labels = ",".join(self.labels) if self.labels else self.labels
query_params_dict = {
"milestone": self.milestone,
@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader):
@property
def url(self) -> str:
"""Create URL for GitHub API."""
return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"

@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
class GoogleDriveLoader(BaseLoader, BaseModel):
"""Loader that loads Google Docs from Google Drive."""
"""Loads Google Docs from Google Drive."""
service_account_key: Path = Path.home() / ".credentials" / "keys.json"
"""Path to the service account key file."""
credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
"""Path to the credentials file."""
token_path: Path = Path.home() / ".credentials" / "token.json"
"""Path to the token file."""
folder_id: Optional[str] = None
"""The folder id to load from."""
document_ids: Optional[List[str]] = None
"""The document ids to load from."""
file_ids: Optional[List[str]] = None
"""The file ids to load from."""
recursive: bool = False
"""Whether to load recursively. Only applies when folder_id is given."""
file_types: Optional[Sequence[str]] = None
"""The file types to load. Only applies when folder_id is given."""
load_trashed_files: bool = False
"""Whether to load trashed files. Only applies when folder_id is given."""
# NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
# results in pydantic validation errors
file_loader_cls: Any = None
"""The file loader class to use."""
file_loader_kwargs: Dict["str", Any] = {}
"""The file loader kwargs to use."""
@root_validator
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
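
A minimal sketch of `GoogleDriveLoader` using the attributes documented above; the folder id and file types are placeholders, and valid Google credentials at the default paths are assumed.

```python
from langchain.document_loaders import GoogleDriveLoader

# Exactly one of folder_id, document_ids, or file_ids should be provided.
loader = GoogleDriveLoader(
    folder_id="<drive-folder-id>",
    recursive=True,                     # only applies when folder_id is given
    file_types=["document", "sheet"],   # only applies when folder_id is given
)
docs = loader.load()
```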

@ -1,4 +1,4 @@
"""Loader that loads .txt web files."""
"""Loads .txt web files."""
from typing import List
from langchain.docstore.document import Document
@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader):
"""Loader that uses urllib to load .txt web files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
"""Initialize with a file path."""
if not file_path.startswith("https://www.gutenberg.org"):
raise ValueError("file path must start with 'https://www.gutenberg.org'")

@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast
class FileEncoding(NamedTuple):
"""A file encoding as the NamedTuple."""
encoding: Optional[str]
"""The encoding of the file."""
confidence: float
"""The confidence of the encoding."""
language: Optional[str]
"""The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
Returns a list of `FileEncoding` tuples with the detected encodings ordered
by confidence.
Args:
file_path: The path to the file to detect the encoding for.
timeout: The timeout in seconds for the encoding detection.
"""
import chardet
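
A minimal sketch of `detect_file_encodings`; the file name is a placeholder, the import path assumes the helper lives in `langchain.document_loaders.helpers`, and the `chardet` package is assumed to be installed.

```python
from langchain.document_loaders.helpers import detect_file_encodings

encodings = detect_file_encodings("legacy.txt", timeout=5)
for enc in encodings:   # ordered by confidence, as the docstring describes
    print(enc.encoding, enc.confidence, enc.language)
```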

@ -1,4 +1,4 @@
"""Loader that loads HN."""
"""Loader that loads Hacker News."""
from typing import Any, List
from langchain.docstore.document import Document
@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader):
def load(self) -> List[Document]:
"""Get important HN webpage information.
Components are:
HN webpage components are:
- title
- content
- source url,

@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader):
get_text_separator: str = "",
) -> None:
"""Initialise with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object."""
to pass to the BeautifulSoup object.
Args:
file_path: The path to the file to load.
open_encoding: The encoding to use when opening the file.
bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
get_text_separator: The separator to use when calling get_text on the soup.
"""
try:
import bs4 # noqa:F401
except ImportError:
raise ValueError(
raise ImportError(
"beautifulsoup4 package not found, please install it with "
"`pip install beautifulsoup4`"
)
@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader):
self.get_text_separator = get_text_separator
def load(self) -> List[Document]:
"""Load HTML document into document objects."""
from bs4 import BeautifulSoup
"""Load HTML document into document objects."""
with open(self.file_path, "r", encoding=self.open_encoding) as f:
soup = BeautifulSoup(f, **self.bs_kwargs)
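
A minimal sketch of `BSHTMLLoader` with the arguments documented above; the file name is a placeholder and `beautifulsoup4` is assumed to be installed.

```python
from langchain.document_loaders import BSHTMLLoader

loader = BSHTMLLoader(
    "page.html",
    open_encoding="utf-8",
    get_text_separator="\n",   # passed to soup.get_text()
)
docs = loader.load()           # the page title is stored in metadata
```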

@ -1,4 +1,4 @@
"""Loader that loads HuggingFace datasets."""
"""Loads HuggingFace datasets."""
from typing import Iterator, List, Mapping, Optional, Sequence, Union
from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class HuggingFaceDatasetLoader(BaseLoader):
"""Loading logic for loading documents from the Hugging Face Hub."""
"""Load Documents from the Hugging Face Hub."""
def __init__(
self,
@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader):
Args:
path: Path or name of the dataset.
page_content_column: Page content column name.
page_content_column: Page content column name. Default is "text".
name: Name of the dataset configuration.
data_dir: Data directory of the dataset configuration.
data_files: Path(s) to source data file(s).
cache_dir: Directory to read/write data.
keep_in_memory: Whether to copy the dataset in-memory.
save_infos: Save the dataset information (checksums/size/splits/...).
use_auth_token: Bearer token for remote files on the Datasets Hub.
Default is False.
use_auth_token: Bearer token for remote files on the Dataset Hub.
num_proc: Number of processes.
"""

@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
"""
def __init__(self, web_path: str):
"""Initialize with web path."""
"""Initialize with a web path."""
if not web_path.startswith("https://www.ifixit.com"):
raise ValueError("web path must start with 'https://www.ifixit.com'")
@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):
@staticmethod
def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
"""Load suggestions.
Args:
query: A query string
doc_type: The type of document to search for. Can be one of "all",
"device", "guide", "teardown", "answer", "wiki".
Returns: List[Document]
"""
res = requests.get(
IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
)
@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
def load_questions_and_answers(
self, url_override: Optional[str] = None
) -> List[Document]:
"""Load a list of questions and answers.
Args:
url_override: A URL to override the default URL.
Returns: List[Document]
"""
loader = WebBaseLoader(self.web_path if url_override is None else url_override)
soup = loader.scrape()
@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
def load_device(
self, url_override: Optional[str] = None, include_guides: bool = True
) -> List[Document]:
"""Loads a device
Args:
url_override: A URL to override the default URL.
include_guides: Whether to include guides linked to from the device.
Defaults to True.
Returns: List[Document]
"""
documents = []
if url_override is None:
url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
return documents
def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
"""Load a guide
Args:
url_override: A URL to override the default URL.
Returns: List[Document]
"""
if url_override is None:
url = IFIXIT_BASE_URL + "/guides/" + self.id
else:

@ -1,5 +1,5 @@
"""
Loader that loads image captions
"""Loads image captions.
By default, the loader utilizes the pre-trained BLIP image captioning model.
https://huggingface.co/Salesforce/blip-image-captioning-base
@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader
class ImageCaptionLoader(BaseLoader):
"""Loader that loads the captions of an image"""
"""Loads the captions of an image"""
def __init__(
self,
@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
):
"""
Initialize with a list of image paths
Args:
path_images: A list of image paths.
blip_processor: The name of the pre-trained BLIP processor.
blip_model: The name of the pre-trained BLIP model.
"""
if isinstance(path_images, str):
self.image_paths = [path_images]

@ -1,4 +1,4 @@
"""Loader that loads IMSDb."""
"""Loads IMSDb."""
from typing import List
from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class IMSDbLoader(WebBaseLoader):
"""Loader that loads IMSDb webpages."""
"""Loads IMSDb webpages."""
def load(self) -> List[Document]:
"""Load webpage."""

@ -20,6 +20,12 @@ class IuguLoader(BaseLoader):
"""Loader that fetches data from IUGU."""
def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
"""Initialize the IUGU resource.
Args:
resource: The name of the resource to fetch.
api_token: The IUGU API token to use.
"""
self.resource = resource
api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN")
self.headers = {"Authorization": f"Bearer {api_token}"}

@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader):
port: int = 41184,
host: str = "localhost",
) -> None:
"""
Args:
access_token: The access token to use.
port: The port where the Web Clipper service is running. Default is 41184.
host: The host where the Web Clipper service is running.
Default is localhost.
"""
access_token = access_token or get_from_env(
"access_token", "JOPLIN_ACCESS_TOKEN"
)

@ -1,4 +1,4 @@
"""Loader that loads data from JSON."""
"""Loads data from JSON."""
import json
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader
class JSONLoader(BaseLoader):
"""Loads a JSON file and references a jq schema provided to load the text into
documents.
"""Loads a JSON file using a jq schema.
Example:
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
@ -101,7 +100,7 @@ class JSONLoader(BaseLoader):
return str(content) if content is not None else ""
def _validate_content_key(self, data: Any) -> None:
"""Check if content key is valid"""
"""Check if a content key is valid"""
sample = data.first()
if not isinstance(sample, dict):
raise ValueError(
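
A minimal sketch of `JSONLoader` and its jq schema; the file name and expression are placeholders and the `jq` package is assumed to be installed.

```python
from langchain.document_loaders import JSONLoader

# For a file shaped like {"messages": [{"content": ...}, ...]}, the schema
# below pulls each message's content into its own Document.
loader = JSONLoader(file_path="chat.json", jq_schema=".messages[].content")
docs = loader.load()
```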

@ -1,4 +1,4 @@
"""Loader that loads LarkSuite (FeiShu) document json dump."""
"""Loads LarkSuite (FeiShu) document json dump."""
import json
import urllib.request
from typing import Any, Iterator, List
@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader
class LarkSuiteDocLoader(BaseLoader):
"""Loader that loads LarkSuite (FeiShu) document."""
"""Loads LarkSuite (FeiShu) document."""
def __init__(self, domain: str, access_token: str, document_id: str):
"""Initialize with domain, access_token (tenant / user), and document_id."""
"""Initialize with domain, access_token (tenant / user), and document_id.
Args:
domain: The domain to load the LarkSuite.
access_token: The access_token to use.
document_id: The document_id to load.
"""
self.domain = domain
self.access_token = access_token
self.document_id = document_id

@ -1,4 +1,4 @@
"""Loader that loads Markdown files."""
"""Loads Markdown files."""
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader

@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon:
try:
import mastodon
except ImportError:
raise ValueError(
raise ImportError(
"Mastodon.py package not found, "
"please install it with `pip install Mastodon.py`"
)
@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader):
Args:
mastodon_accounts: The list of Mastodon accounts to query.
number_toots: How many toots to pull for each account.
number_toots: How many toots to pull for each account. Default is 100.
exclude_replies: Whether to exclude reply toots from the load.
Default is False.
access_token: An access token if toots are loaded as a Mastodon app. Can
also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
api_base_url: A Mastodon API base URL to talk to, if not using the default.
Default is "https://mastodon.social".
"""
mastodon = _dependable_mastodon_import()
access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")

@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader):
"""
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
"""Initialize with file path."""
"""Initialize with a file path.
Args:
file_path: XML local file path
encoding: Charset encoding, defaults to "utf8"
"""
self.file_path = file_path
self.encoding = encoding
def load(self) -> List[Document]:
"""Load from file path."""
"""Load from a file path."""
import mwparserfromhell
import mwxml
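
A minimal sketch of `MWDumpLoader` with the arguments documented above; the dump file name is a placeholder, and `mwparserfromhell` and `mwxml` are assumed to be installed.

```python
from langchain.document_loaders import MWDumpLoader

loader = MWDumpLoader(file_path="dump.xml", encoding="utf8")
docs = loader.load()   # one Document per page in the MediaWiki dump
```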

@ -1,4 +1,4 @@
"""Loader to load MHTML files, enriching metadata with page title."""
"""Load MHTML files, enriching metadata with page title."""
import email
import logging
@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader):
get_text_separator: str = "",
) -> None:
"""Initialise with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object."""
to pass to the BeautifulSoup object.
Args:
file_path: The path to the file to load.
open_encoding: The encoding to use when opening the file.
bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
get_text_separator: The separator to use when getting text from the soup.
"""
try:
import bs4 # noqa:F401
except ImportError:
raise ValueError(
raise ImportError(
"beautifulsoup4 package not found, please install it with "
"`pip install beautifulsoup4`"
)
