mirror of https://github.com/hwchase17/langchain (synced 2024-11-08 07:10:35 +00:00)
docstrings document_loaders 3 (#6937)

- Updated docstrings for `document_loaders`
- Mass update `"""Loader that loads` to `"""Loads`

@baskaryan - please, review
This commit is contained in:
parent 9d13dcd17c
commit 5eec74d9a5
@@ -1,4 +1,4 @@
-"""Loader that loads acreom vault from a directory."""
+"""Loads acreom vault from a directory."""
 import re
 from pathlib import Path
 from typing import Iterator, List
@@ -1,4 +1,4 @@
-"""Loader that loads local airbyte json files."""
+"""Loads local airbyte json files."""
 import json
 from typing import List
@@ -8,7 +8,7 @@ from langchain.utils import stringify_dict


 class AirbyteJSONLoader(BaseLoader):
-    """Loader that loads local airbyte json files."""
+    """Loads local airbyte json files."""

     def __init__(self, file_path: str):
         """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
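A minimal usage sketch for the loader above (the dump file name is hypothetical; only the '/tmp/airbyte_local/' prefix comes from the docstring):

from langchain.document_loaders import AirbyteJSONLoader

# Hypothetical Airbyte local-JSON dump; the path prefix is the documented requirement.
loader = AirbyteJSONLoader("/tmp/airbyte_local/_airbyte_raw_users.jsonl")
docs = loader.load()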
@@ -1,4 +1,4 @@
-"""Loader that loads AZLyrics."""
+"""Loads AZLyrics."""
 from typing import List

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader


 class AZLyricsLoader(WebBaseLoader):
-    """Loader that loads AZLyrics webpages."""
+    """Loads AZLyrics webpages."""

     def load(self) -> List[Document]:
         """Load webpages into Documents."""
@@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader


 class BiliBiliLoader(BaseLoader):
-    """Loader that loads bilibili transcripts."""
+    """Loads bilibili transcripts."""

     def __init__(self, video_urls: List[str]):
         """Initialize with bilibili url.
@@ -1,4 +1,4 @@
-"""Loader that loads all documents from a blackboard course."""
+"""Loads all documents from a blackboard course."""
 import contextlib
 import re
 from pathlib import Path
@@ -1,3 +1,4 @@
 """Load conversations from ChatGPT data export"""
 import datetime
 import json
+from typing import List
@@ -31,7 +32,7 @@ class ChatGPTLoader(BaseLoader):
     """Load conversations from exported ChatGPT data."""

     def __init__(self, log_file: str, num_logs: int = -1):
-        """
+        """Initialize a class object.

         Args:
             log_file: Path to the log file
@@ -1,4 +1,4 @@
-"""Loader that loads College Confidential."""
+"""Loads College Confidential."""
 from typing import List

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader


 class CollegeConfidentialLoader(WebBaseLoader):
-    """Loader that loads College Confidential webpages."""
+    """Loads College Confidential webpages."""

     def load(self) -> List[Document]:
         """Load webpages as Documents."""
@@ -1,4 +1,4 @@
-"""Loader that loads EPub files."""
+"""Loads EPub files."""
 from typing import List

 from langchain.document_loaders.unstructured import (
@@ -1,4 +1,4 @@
-"""Loader that loads Microsoft Excel files."""
+"""Loads Microsoft Excel files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -1,4 +1,4 @@
-"""Loader that loads Facebook chat json dump."""
+"""Loads Facebook chat json dump."""
 import datetime
 import json
 from pathlib import Path
@@ -1,4 +1,4 @@
-"""Loader that loads Figma files json dump."""
+"""Loads Figma files json dump."""
 import json
 import urllib.request
 from typing import Any, List
@@ -1,4 +1,4 @@
-"""Loader that loads GitBook."""
+"""Loads GitBook."""
 from typing import Any, List, Optional
 from urllib.parse import urljoin, urlparse
@@ -1,4 +1,4 @@
-"""Loader that loads data from Google Drive."""
+"""Loads data from Google Drive."""

 # Prerequisites:
 # 1. Create a Google Cloud project
@@ -1,4 +1,4 @@
-"""Loader that loads Hacker News."""
+"""Loads HN."""
 from typing import Any, List

 from langchain.docstore.document import Document
@@ -1,4 +1,4 @@
-"""Loader that loads iFixit data."""
+"""Loads iFixit data."""
 from typing import List, Optional

 import requests
@@ -1,4 +1,4 @@
-"""Loader that loads image files."""
+"""Loads image files."""
 from typing import List

 from langchain.document_loaders.unstructured import UnstructuredFileLoader
@@ -37,13 +37,13 @@ class MastodonTootsLoader(BaseLoader):

         Args:
             mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account. Default is 100.
+            number_toots: How many toots to pull for each account. Defaults to 100.
             exclude_replies: Whether to exclude reply toots from the load.
-                Default is False.
+                Defaults to False.
             access_token: An access token if toots are loaded as a Mastodon app. Can
                 also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
             api_base_url: A Mastodon API base URL to talk to, if not using the default.
-                Default is "https://mastodon.social".
+                Defaults to "https://mastodon.social".
         """
         mastodon = _dependable_mastodon_import()
         access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
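A usage sketch matching the Args documented above (the account handle is a placeholder; without access_token the loader falls back to the MASTODON_ACCESS_TOKEN environment variable):

from langchain.document_loaders import MastodonTootsLoader

loader = MastodonTootsLoader(
    mastodon_accounts=["@Gargron@mastodon.social"],  # placeholder account
    number_toots=100,  # documented default
    exclude_replies=False,  # documented default
)
docs = loader.load()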
@@ -24,10 +24,11 @@ class MHTMLLoader(BaseLoader):
         to pass to the BeautifulSoup object.

         Args:
-            file_path: The path to the file to load.
+            file_path: Path to file to load.
             open_encoding: The encoding to use when opening the file.
-            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
-            get_text_separator: The separator to use when getting text from the soup.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when getting the text
+                from the soup.
         """
         try:
             import bs4  # noqa:F401
@@ -35,6 +35,16 @@ class ModernTreasuryLoader(BaseLoader):
         organization_id: Optional[str] = None,
         api_key: Optional[str] = None,
     ) -> None:
+        """
+
+        Args:
+            resource: The Modern Treasury resource to load.
+            organization_id: The Modern Treasury organization ID. It can also be
+                specified via the environment variable
+                "MODERN_TREASURY_ORGANIZATION_ID".
+            api_key: The Modern Treasury API key. It can also be specified via
+                the environment variable "MODERN_TREASURY_API_KEY".
+        """
         self.resource = resource
         organization_id = organization_id or get_from_env(
             "organization_id", "MODERN_TREASURY_ORGANIZATION_ID"
@@ -1,4 +1,4 @@
-"""Loader that loads .ipynb notebook files."""
+"""Loads .ipynb notebook files."""
 import json
 from pathlib import Path
 from typing import Any, List
@@ -10,7 +10,18 @@ from langchain.document_loaders.base import BaseLoader
 def concatenate_cells(
     cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
 ) -> str:
-    """Combine cells information in a readable format ready to be used."""
+    """Combine cells information in a readable format ready to be used.
+
+    Args:
+        cell: A dictionary
+        include_outputs: Whether to include the outputs of the cell.
+        max_output_length: Maximum length of the output to be displayed.
+        traceback: Whether to return a traceback of the error.
+
+    Returns:
+        A string with the cell information.
+
+    """
     cell_type = cell["cell_type"]
     source = cell["source"]
     output = cell["outputs"]
@@ -45,7 +56,7 @@ def concatenate_cells(


 def remove_newlines(x: Any) -> Any:
-    """Remove recursively newlines, no matter the data structure they are stored in."""
+    """Recursively removes newlines, no matter the data structure they are stored in."""
     import pandas as pd

     if isinstance(x, str):
@@ -59,7 +70,7 @@ def remove_newlines(x: Any) -> Any:


 class NotebookLoader(BaseLoader):
-    """Loader that loads .ipynb notebook files."""
+    """Loads .ipynb notebook files."""

     def __init__(
         self,
@@ -69,7 +80,19 @@ class NotebookLoader(BaseLoader):
         remove_newline: bool = False,
         traceback: bool = False,
     ):
-        """Initialize with path."""
+        """Initialize with path.
+
+        Args:
+            path: The path to load the notebook from.
+            include_outputs: Whether to include the outputs of the cell.
+                Defaults to False.
+            max_output_length: Maximum length of the output to be displayed.
+                Defaults to 10.
+            remove_newline: Whether to remove newlines from the notebook.
+                Defaults to False.
+            traceback: Whether to return a traceback of the error.
+                Defaults to False.
+        """
         self.file_path = path
         self.include_outputs = include_outputs
         self.max_output_length = max_output_length
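A sketch exercising the defaults spelled out in the new NotebookLoader docstring (the notebook path is hypothetical):

from langchain.document_loaders import NotebookLoader

loader = NotebookLoader(
    "analysis.ipynb",  # hypothetical path
    include_outputs=True,  # default is False
    max_output_length=10,  # documented default
)
docs = loader.load()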
@@ -1,4 +1,4 @@
-"""Loader that loads Notion directory dump."""
+"""Loads Notion directory dump."""
 from pathlib import Path
 from typing import List
@@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader


 class NotionDirectoryLoader(BaseLoader):
-    """Loader that loads Notion directory dump."""
+    """Loads Notion directory dump."""

     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a file path."""
         self.file_path = path

     def load(self) -> List[Document]:
@@ -15,11 +15,12 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"

 class NotionDBLoader(BaseLoader):
     """Notion DB Loader.
-    Reads content from pages within a Noton Database.
+    Reads content from pages within a Notion Database.
     Args:
         integration_token (str): Notion integration token.
         database_id (str): Notion database id.
         request_timeout_sec (int): Timeout for Notion requests in seconds.
+            Defaults to 10.
     """

     def __init__(
@@ -75,7 +76,11 @@ class NotionDBLoader(BaseLoader):
         return pages

     def load_page(self, page_summary: Dict[str, Any]) -> Document:
-        """Read a page."""
+        """Read a page.
+
+        Args:
+            page_summary: Page summary from Notion API.
+        """
         page_id = page_summary["id"]

         # load properties as metadata
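A sketch of the documented NotionDBLoader arguments (token and database id are placeholders):

from langchain.document_loaders import NotionDBLoader

loader = NotionDBLoader(
    integration_token="secret-token",  # placeholder
    database_id="0123456789abcdef0123456789abcdef",  # placeholder
    request_timeout_sec=10,  # documented default
)
docs = loader.load()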
@@ -1,4 +1,4 @@
-"""Loader that loads Obsidian directory dump."""
+"""Loads Obsidian directory dump."""
 import re
 from pathlib import Path
 from typing import List
@@ -8,14 +8,21 @@ from langchain.document_loaders.base import BaseLoader


 class ObsidianLoader(BaseLoader):
-    """Loader that loads Obsidian files from disk."""
+    """Loads Obsidian files from disk."""

     FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)

     def __init__(
         self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
     ):
-        """Initialize with path."""
+        """Initialize with a path.
+
+        Args:
+            path: Path to the directory containing the Obsidian files.
+            encoding: Charset encoding, defaults to "UTF-8"
+            collect_metadata: Whether to collect metadata from the front matter.
+                Defaults to True.
+        """
         self.file_path = path
         self.encoding = encoding
         self.collect_metadata = collect_metadata
@@ -1,4 +1,4 @@
-"""Loader that loads Open Office ODT files."""
+"""Loads OpenOffice ODT files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -8,11 +8,19 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredODTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load open office ODT files."""
+    """Loader that uses unstructured to load OpenOffice ODT files."""

     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the file to load.
+            mode: The mode to use when loading the file. Can be one of "single",
+                "multi", or "all". Default is "single".
+            **unstructured_kwargs: Any kwargs to pass to the unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.3")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
@@ -1,4 +1,4 @@
-"""Loader that loads data from OneDrive"""
+"""Loads data from OneDrive"""
 from __future__ import annotations

 import logging
@@ -60,11 +60,18 @@ class _SupportedFileTypes(BaseModel):


 class OneDriveLoader(BaseLoader, BaseModel):
+    """Loads data from OneDrive."""
+
     settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
+    """ The settings for the OneDrive API client."""
     drive_id: str = Field(...)
+    """ The ID of the OneDrive drive to load data from."""
     folder_path: Optional[str] = None
+    """ The path to the folder to load data from."""
     object_ids: Optional[List[str]] = None
+    """ The IDs of the objects to load data from."""
     auth_with_token: bool = False
+    """ Whether to authenticate with a token or not. Defaults to False."""

     def _auth(self) -> Type[Account]:
         """
@@ -16,10 +16,15 @@ CHUNK_SIZE = 1024 * 1024 * 5


 class OneDriveFileLoader(BaseLoader, BaseModel):
+    """Loads a file from OneDrive."""
+
     file: File = Field(...)
+    """The file to load."""

     class Config:
         arbitrary_types_allowed = True
+        """Allow arbitrary types. This is needed for the File type. Default is True.
+        See https://pydantic-docs.helpmanual.io/usage/types/#arbitrary-types-allowed"""

     def load(self) -> List[Document]:
         """Load Documents"""
@@ -5,13 +5,19 @@ from langchain.document_loaders.base import BaseLoader


 class OpenCityDataLoader(BaseLoader):
-    """Loader that loads Open city data."""
+    """Loads Open City data."""

     def __init__(self, city_id: str, dataset_id: str, limit: int):
-        """Initialize with dataset_id"""
-        """ Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6 """
-        """ e.g., city_id = data.sfgov.org """
-        """ e.g., dataset_id = vw6y-z8j6 """
+        """Initialize with dataset_id.
+        Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6
+        e.g., city_id = data.sfgov.org
+        e.g., dataset_id = vw6y-z8j6
+
+        Args:
+            city_id: The Open City city identifier.
+            dataset_id: The Open City dataset identifier.
+            limit: The maximum number of documents to load.
+        """
         self.city_id = city_id
         self.dataset_id = dataset_id
         self.limit = limit
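A sketch built directly from the Socrata example the docstring cites (the limit value is arbitrary):

from langchain.document_loaders import OpenCityDataLoader

loader = OpenCityDataLoader(city_id="data.sfgov.org", dataset_id="vw6y-z8j6", limit=100)
docs = loader.load()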
@@ -1,4 +1,4 @@
-"""Loader that loads Org-Mode files."""
+"""Loads Org-Mode files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -13,6 +13,14 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the file to load.
+            mode: The mode to load the file from. Default is "single".
+            **unstructured_kwargs: Any additional keyword arguments to pass
+                to the unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.7.9")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
@@ -1,4 +1,4 @@
-"""Loader that loads PDF files."""
+"""Loads PDF files."""
 import json
 import logging
 import os
@@ -41,11 +41,11 @@ class BasePDFLoader(BaseLoader, ABC):
     """Base loader class for PDF files.

     Defaults to check for local file, but if the file is a web path, it will download it
-    to a temporary file, and use that, then clean up the temporary file after completion
+    to a temporary file, use it, then clean up the temporary file after completion
     """

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         self.file_path = file_path
         self.web_path = None
         if "~" in self.file_path:
@@ -86,7 +86,7 @@ class BasePDFLoader(BaseLoader, ABC):


 class OnlinePDFLoader(BasePDFLoader):
-    """Loader that loads online PDFs."""
+    """Loads online PDFs."""

     def load(self) -> List[Document]:
         """Load documents."""
@@ -97,13 +97,13 @@
 class PyPDFLoader(BasePDFLoader):
     """Loads a PDF with pypdf and chunks at character level.

-    Loader also stores page numbers in metadatas.
+    Loader also stores page numbers in metadata.
     """

     def __init__(
         self, file_path: str, password: Optional[Union[str, bytes]] = None
     ) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
         try:
             import pypdf  # noqa:F401
         except ImportError:
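A usage sketch for PyPDFLoader as documented above (the file name is hypothetical; pypdf must be installed):

from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("example.pdf")  # hypothetical file
pages = loader.load()  # one Document per page; the page number lands in metadata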
@@ -129,7 +129,7 @@ class PyPDFium2Loader(BasePDFLoader):
     """Loads a PDF with pypdfium2 and chunks at character level."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         super().__init__(file_path)
         self.parser = PyPDFium2Parser()
@@ -148,7 +148,7 @@ class PyPDFium2Loader(BasePDFLoader):
 class PyPDFDirectoryLoader(BaseLoader):
     """Loads a directory with PDF files with pypdf and chunks at character level.

-    Loader also stores page numbers in metadatas.
+    Loader also stores page numbers in metadata.
     """

     def __init__(
@@ -222,7 +222,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
     """Loader that uses PDFMiner to load PDF files as HTML content."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         try:
             from pdfminer.high_level import extract_text_to_fp  # noqa:F401
         except ImportError:
@@ -256,7 +256,7 @@ class PyMuPDFLoader(BasePDFLoader):
     """Loader that uses PyMuPDF to load PDF files."""

     def __init__(self, file_path: str) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
         try:
             import fitz  # noqa:F401
         except ImportError:
@@ -278,6 +278,8 @@
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
 # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
 class MathpixPDFLoader(BasePDFLoader):
+    """This class uses Mathpix service to load PDF files."""
+
     def __init__(
         self,
         file_path: str,
@@ -286,6 +288,16 @@ class MathpixPDFLoader(BasePDFLoader):
         should_clean_pdf: bool = False,
         **kwargs: Any,
     ) -> None:
+        """Initialize with a file path.
+
+        Args:
+            file_path: a file for loading.
+            processed_file_format: a format of the processed file. Default is "mmd".
+            max_wait_time_seconds: a maximum time to wait for the response from
+                the server. Default is 500.
+            should_clean_pdf: a flag to clean the PDF file. Default is False.
+            **kwargs: additional keyword arguments.
+        """
         super().__init__(file_path)
         self.mathpix_api_key = get_from_dict_or_env(
             kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
@@ -324,6 +336,13 @@
         raise ValueError("Unable to send PDF to Mathpix.")

     def wait_for_processing(self, pdf_id: str) -> None:
+        """Wait for processing to complete.
+
+        Args:
+            pdf_id: a PDF id.
+
+        Returns: None
+        """
         url = self.url + "/" + pdf_id
         for _ in range(0, self.max_wait_time_seconds, 5):
             response = requests.get(url, headers=self.headers)
@@ -346,6 +365,14 @@
         return response.content.decode("utf-8")

     def clean_pdf(self, contents: str) -> str:
+        """Clean the PDF file.
+
+        Args:
+            contents: a PDF file contents.
+
+        Returns:
+
+        """
         contents = "\n".join(
             [line for line in contents.split("\n") if not line.startswith("![]")]
         )
@@ -375,7 +402,7 @@ class PDFPlumberLoader(BasePDFLoader):
     def __init__(
         self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
     ) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
         try:
             import pdfplumber  # noqa:F401
         except ImportError:
@@ -1,4 +1,4 @@
-"""Loader that loads powerpoint files."""
+"""Loads PowerPoint files."""
 import os
 from typing import List
@@ -6,7 +6,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class UnstructuredPowerPointLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load powerpoint files."""
+    """Loader that uses unstructured to load PowerPoint files."""

     def _get_elements(self) -> List:
         from unstructured.__version__ import __version__ as __unstructured_version__
@@ -1,4 +1,4 @@
-"""Loader that loads documents from Psychic.dev."""
+"""Loads documents from Psychic.dev."""
 from typing import List, Optional

 from langchain.docstore.document import Document
@@ -6,12 +6,18 @@ from langchain.document_loaders.base import BaseLoader


 class PsychicLoader(BaseLoader):
-    """Loader that loads documents from Psychic.dev."""
+    """Loads documents from Psychic.dev."""

     def __init__(
         self, api_key: str, account_id: str, connector_id: Optional[str] = None
     ):
-        """Initialize with API key, connector id, and account id."""
+        """Initialize with API key, connector id, and account id.
+
+        Args:
+            api_key: The Psychic API key.
+            account_id: The Psychic account id.
+            connector_id: The Psychic connector id.
+        """

         try:
             from psychicapi import ConnectorId, Psychic  # noqa: F401
@@ -23,7 +23,15 @@ class PySparkDataFrameLoader(BaseLoader):
         page_content_column: str = "text",
         fraction_of_memory: float = 0.1,
     ):
-        """Initialize with a Spark DataFrame object."""
+        """Initialize with a Spark DataFrame object.
+
+        Args:
+            spark_session: The SparkSession object.
+            df: The Spark DataFrame object.
+            page_content_column: The name of the column containing the page content.
+                Defaults to "text".
+            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
+        """
         try:
             from pyspark.sql import DataFrame, SparkSession
         except ImportError:
@@ -48,7 +56,7 @@ class PySparkDataFrameLoader(BaseLoader):
         self.column_names = self.df.columns

     def get_num_rows(self) -> Tuple[int, int]:
-        """Gets the amount of "feasible" rows for the DataFrame"""
+        """Gets the number of "feasible" rows for the DataFrame"""
         try:
             import psutil
         except ImportError as e:
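A sketch of the documented PySpark arguments, assuming a local Spark session and a toy DataFrame with a "text" column:

from pyspark.sql import SparkSession

from langchain.document_loaders import PySparkDataFrameLoader

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("hello",), ("world",)], ["text"])  # toy frame
loader = PySparkDataFrameLoader(spark, df, page_content_column="text")
docs = loader.load()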
@@ -9,6 +9,11 @@ class PythonLoader(TextLoader):
     """

     def __init__(self, file_path: str):
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the file to load.
+        """
         with open(file_path, "rb") as f:
             encoding, _ = tokenize.detect_encoding(f.readline)
         super().__init__(file_path=file_path, encoding=encoding)
@@ -1,4 +1,4 @@
-"""Loader that loads ReadTheDocs documentation directory dump."""
+"""Loads ReadTheDocs documentation directory dump."""
 from pathlib import Path
 from typing import Any, List, Optional, Tuple, Union
@@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader


 class ReadTheDocsLoader(BaseLoader):
-    """Loader that loads ReadTheDocs documentation directory dump."""
+    """Loads ReadTheDocs documentation directory dump."""

     def __init__(
         self,
@@ -20,7 +20,7 @@ class ReadTheDocsLoader(BaseLoader):
         """
         Initialize ReadTheDocsLoader

-        The loader loops over all files under `path` and extract the actual content of
+        The loader loops over all files under `path` and extracts the actual content of
         the files by retrieving main html tags. Default main html tags include
         `<main id="main-content>`, <`div role="main>`, and `<article role="main">`. You
         can also define your own html tags by passing custom_html_tag, e.g.
@@ -31,7 +31,7 @@ class ReadTheDocsLoader(BaseLoader):
         Args:
             path: The location of pulled readthedocs folder.
             encoding: The encoding with which to open the documents.
-            errors: Specifies how encoding and decoding errors are to be handled—this
+            errors: Specify how encoding and decoding errors are to be handled—this
                 cannot be used in binary mode.
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
|
@ -8,17 +8,27 @@ from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class RecursiveUrlLoader(BaseLoader):
|
||||
"""Loader that loads all child links from a given url."""
|
||||
"""Loads all child links from a given url."""
|
||||
|
||||
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
|
||||
"""Initialize with URL to crawl and any sub-directories to exclude."""
|
||||
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||
|
||||
Args:
|
||||
url: The URL to crawl.
|
||||
exclude_dirs: A list of subdirectories to exclude.
|
||||
"""
|
||||
self.url = url
|
||||
self.exclude_dirs = exclude_dirs
|
||||
|
||||
def get_child_links_recursive(
|
||||
self, url: str, visited: Optional[Set[str]] = None
|
||||
) -> Set[str]:
|
||||
"""Recursively get all child links starting with the path of the input URL."""
|
||||
"""Recursively get all child links starting with the path of the input URL.
|
||||
|
||||
Args:
|
||||
url: The URL to crawl.
|
||||
visited: A set of visited URLs.
|
||||
"""
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
@ -39,7 +49,7 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
if not parent_url.endswith("/"):
|
||||
parent_url += "/"
|
||||
|
||||
# Exclude the root and parent from list
|
||||
# Exclude the root and parent from a list
|
||||
visited = set() if visited is None else visited
|
||||
|
||||
# Exclude the links that start with any of the excluded directories
|
||||
|
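A minimal sketch of the crawler's entry point (the URL is illustrative):

from langchain.document_loaders import RecursiveUrlLoader

loader = RecursiveUrlLoader(url="https://docs.python.org/3.9/")
docs = loader.load()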
@@ -23,7 +23,7 @@ def _dependable_praw_import() -> praw:
 class RedditPostsLoader(BaseLoader):
     """Reddit posts loader.
     Read posts on a subreddit.
-    First you need to go to
+    First, you need to go to
     https://www.reddit.com/prefs/apps/
     and create your application
     """
@@ -38,6 +38,20 @@
         categories: Sequence[str] = ["new"],
         number_posts: Optional[int] = 10,
     ):
+        """
+        Initialize with client_id, client_secret, user_agent, search_queries, mode,
+        categories, number_posts.
+        Example: https://www.reddit.com/r/learnpython/
+
+        Args:
+            client_id: Reddit client id.
+            client_secret: Reddit client secret.
+            user_agent: Reddit user agent.
+            search_queries: The search queries.
+            mode: The mode.
+            categories: The categories. Default: ["new"]
+            number_posts: The number of posts. Default: 10
+        """
         self.client_id = client_id
         self.client_secret = client_secret
         self.user_agent = user_agent
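A sketch matching the Args block above; the credentials are placeholders, and the mode value "subreddit" is an assumption about the accepted values, which the docstring leaves unspecified:

from langchain.document_loaders import RedditPostsLoader

loader = RedditPostsLoader(
    client_id="client-id",  # placeholder
    client_secret="client-secret",  # placeholder
    user_agent="extractor by u/placeholder",
    search_queries=["learnpython"],
    mode="subreddit",  # assumption: not documented above
    categories=["new"],  # documented default
    number_posts=10,  # documented default
)
docs = loader.load()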
@@ -1,4 +1,4 @@
-"""Loader that loads Roam directory dump."""
+"""Loads Roam directory dump."""
 from pathlib import Path
 from typing import List
@@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader


 class RoamLoader(BaseLoader):
-    """Loader that loads Roam files from disk."""
+    """Loads Roam files from disk."""

     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
         self.file_path = path

     def load(self) -> List[Document]:
@@ -1,4 +1,4 @@
-"""Loader that loads RST files."""
+"""Loads RST files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -13,6 +13,16 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+        Initialize with a file path.
+
+        Args:
+            file_path: The path to the file to load.
+            mode: The mode to use for partitioning. See unstructured for details.
+                Defaults to "single".
+            **unstructured_kwargs: Additional keyword arguments to pass
+                to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.7.5")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
@@ -1,4 +1,4 @@
-"""Loader that loads rich text files."""
+"""Loads rich text files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -13,6 +13,16 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+        Initialize with a file path.
+
+        Args:
+            file_path: The path to the file to load.
+            mode: The mode to use for partitioning. See unstructured for details.
+                Defaults to "single".
+            **unstructured_kwargs: Additional keyword arguments to pass
+                to unstructured.
+        """
         min_unstructured_version = "0.5.12"
         if not satisfies_min_unstructured_version(min_unstructured_version):
             raise ValueError(
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from an s3 directory."""
+"""Loading logic for loading documents from an AWS S3 directory."""
 from typing import List

 from langchain.docstore.document import Document
@@ -7,10 +7,15 @@ from langchain.document_loaders.s3_file import S3FileLoader


 class S3DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from s3."""
+    """Loading logic for loading documents from an AWS S3."""

     def __init__(self, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            bucket: The name of the S3 bucket.
+            prefix: The prefix of the S3 key. Defaults to "".
+        """
         self.bucket = bucket
         self.prefix = prefix
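A sketch for the S3 directory loader (bucket and prefix are placeholders; AWS credentials are assumed to be configured for boto3):

from langchain.document_loaders import S3DirectoryLoader

loader = S3DirectoryLoader("my-bucket", prefix="reports/")  # placeholders
docs = loader.load()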
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from an s3 file."""
+"""Loading logic for loading documents from an AWS S3 file."""
 import os
 import tempfile
 from typing import List
@@ -9,10 +9,15 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class S3FileLoader(BaseLoader):
-    """Loading logic for loading documents from s3."""
+    """Loading logic for loading documents from an AWS S3 file."""

     def __init__(self, bucket: str, key: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            bucket: The name of the S3 bucket.
+            key: The key of the S3 object.
+        """
         self.bucket = bucket
         self.key = key
@@ -42,11 +42,12 @@ class SitemapLoader(WebBaseLoader):
                 urls that are parsed and loaded
             parsing_function: Function to parse bs4.Soup output
             blocksize: number of sitemap locations per block
-            blocknum: the number of the block that should be loaded - zero indexed
+            blocknum: the number of the block that should be loaded - zero indexed.
+                Default: 0
             meta_function: Function to parse bs4.Soup output for metadata
                 remember when setting this method to also copy metadata["loc"]
                 to metadata["source"] if you are using this field
-            is_local: whether the sitemap is a local file
+            is_local: whether the sitemap is a local file. Default: False
         """

         if blocksize is not None and blocksize < 1:
@@ -72,7 +73,14 @@ class SitemapLoader(WebBaseLoader):
         self.is_local = is_local

     def parse_sitemap(self, soup: Any) -> List[dict]:
-        """Parse sitemap xml and load into a list of dicts."""
+        """Parse sitemap xml and load into a list of dicts.
+
+        Args:
+            soup: BeautifulSoup object.
+
+        Returns:
+            List of dicts.
+        """
         els = []
         for url in soup.find_all("url"):
             loc = url.find("loc")
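A sketch of the block arguments documented above (the sitemap URL is illustrative; the first positional argument is assumed to be the sitemap location):

from langchain.document_loaders import SitemapLoader

loader = SitemapLoader("https://example.com/sitemap.xml", blocksize=10, blocknum=0)
docs = loader.load()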
@@ -9,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader


 class SlackDirectoryLoader(BaseLoader):
-    """Loader for loading documents from a Slack directory dump."""
+    """Loads documents from a Slack directory dump."""

     def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
         """Initialize the SlackDirectoryLoader.
@@ -41,6 +41,7 @@ class SnowflakeLoader(BaseLoader):
             role: Snowflake role.
             database: Snowflake database
             schema: Snowflake schema
+            parameters: Optional. Parameters to pass to the query.
             page_content_columns: Optional. Columns written to Document `page_content`.
             metadata_columns: Optional. Columns written to Document `metadata`.
         """
@@ -62,7 +63,7 @@ class SnowflakeLoader(BaseLoader):
         try:
             import snowflake.connector
         except ImportError as ex:
-            raise ValueError(
+            raise ImportError(
                 "Could not import snowflake-connector-python package. "
                 "Please install it with `pip install snowflake-connector-python`."
             ) from ex
@@ -23,6 +23,12 @@ class SpreedlyLoader(BaseLoader):
     """Loader that fetches data from Spreedly API."""

     def __init__(self, access_token: str, resource: str) -> None:
+        """Initialize with an access token and a resource.
+
+        Args:
+            access_token: The access token.
+            resource: The resource.
+        """
         self.access_token = access_token
         self.resource = resource
         self.headers = {
@@ -9,7 +9,7 @@ class SRTLoader(BaseLoader):
     """Loader for .srt (subtitle) files."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         try:
             import pysrt  # noqa:F401
         except ImportError:
@@ -21,6 +21,12 @@ class StripeLoader(BaseLoader):
     """Loader that fetches data from Stripe."""

     def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
+        """Initialize with a resource and an access token.
+
+        Args:
+            resource: The resource.
+            access_token: The access token.
+        """
         self.resource = resource
         access_token = access_token or get_from_env(
             "access_token", "STRIPE_ACCESS_TOKEN"
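A sketch of the documented Stripe initializer (the resource name is an example; per the code above, the token falls back to the STRIPE_ACCESS_TOKEN environment variable):

from langchain.document_loaders import StripeLoader

loader = StripeLoader("charges")  # token read from STRIPE_ACCESS_TOKEN
docs = loader.load()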
@@ -1,4 +1,4 @@
-"""Loader that loads Telegram chat json dump."""
+"""Loads Telegram chat json dump."""
 from __future__ import annotations

 import asyncio
@@ -24,10 +24,10 @@ def concatenate_rows(row: dict) -> str:


 class TelegramChatFileLoader(BaseLoader):
-    """Loader that loads Telegram chat json directory dump."""
+    """Loads Telegram chat json directory dump."""

     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
         self.file_path = path

     def load(self) -> List[Document]:
@@ -79,7 +79,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:


 class TelegramChatApiLoader(BaseLoader):
-    """Loader that loads Telegram chat json directory dump."""
+    """Loads Telegram chat json directory dump."""

     def __init__(
         self,
@@ -89,7 +89,16 @@ class TelegramChatApiLoader(BaseLoader):
         username: Optional[str] = None,
         file_path: str = "telegram_data.json",
     ):
-        """Initialize with API parameters."""
+        """Initialize with API parameters.
+
+        Args:
+            chat_entity: The chat entity to fetch data from.
+            api_id: The API ID.
+            api_hash: The API hash.
+            username: The username.
+            file_path: The file path to save the data to. Defaults to
+                "telegram_data.json".
+        """
         self.chat_entity = chat_entity
         self.api_id = api_id
         self.api_hash = api_hash
@@ -1,4 +1,4 @@
-"""Loader that loads HTML to markdown using 2markdown."""
+"""Loads HTML to markdown using 2markdown."""
 from __future__ import annotations

 from typing import Iterator, List
@@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader


 class ToMarkdownLoader(BaseLoader):
-    """Loader that loads HTML to markdown using 2markdown."""
+    """Loads HTML to markdown using 2markdown."""

     def __init__(self, url: str, api_key: str):
         """Initialize with url and api key."""
@@ -1,4 +1,4 @@
-"""Loader that loads cards from Trello"""
+"""Loads cards from Trello"""
 from __future__ import annotations

 from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
@@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:


 class WhatsAppChatLoader(BaseLoader):
-    """Loader that loads WhatsApp messages text file."""
+    """Loads WhatsApp messages text file."""

     def __init__(self, path: str):
         """Initialize with path."""
@@ -1,4 +1,4 @@
-"""Loader that loads word documents."""
+"""Loads word documents."""
 import os
 import tempfile
 from abc import ABC
@@ -1,4 +1,4 @@
-"""Loader that loads Microsoft Excel files."""
+"""Loads Microsoft Excel files."""
 from typing import Any, List

 from langchain.document_loaders.unstructured import (
@@ -1,4 +1,4 @@
-"""Loader that loads YouTube transcript."""
+"""Loads YouTube transcript."""
 from __future__ import annotations

 import logging
@@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:


 class YoutubeLoader(BaseLoader):
-    """Loader that loads Youtube transcripts."""
+    """Loads Youtube transcripts."""

     def __init__(
         self,
@@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):

 @dataclass
 class GoogleApiYoutubeLoader(BaseLoader):
-    """Loader that loads all Videos from a Channel
+    """Loads all Videos from a Channel

     To use, you should have the ``googleapiclient,youtube_transcript_api``
     python package installed.
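A usage sketch for YoutubeLoader (the video URL is illustrative; from_youtube_url is assumed available as the loader's convenience constructor in this version):

from langchain.document_loaders import YoutubeLoader

loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=9bZkp7q19f0")
docs = loader.load()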