mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
docstrings: document_loaders consistency 2 (#9148)
This is Part 2. See #9139 (Part 1).
This commit is contained in:
parent
1b58460fe3
commit
19f504790e
@ -1,4 +1,3 @@
|
|||||||
"""Loads HuggingFace datasets."""
|
|
||||||
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class HuggingFaceDatasetLoader(BaseLoader):
|
class HuggingFaceDatasetLoader(BaseLoader):
|
||||||
"""Load Documents from the Hugging Face Hub."""
|
"""Load from `Hugging Face Hub` datasets."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads iFixit data."""
|
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -11,7 +10,7 @@ IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
|
|||||||
|
|
||||||
|
|
||||||
class IFixitLoader(BaseLoader):
|
class IFixitLoader(BaseLoader):
|
||||||
"""Load iFixit repair guides, device wikis and answers.
|
"""Load `iFixit` repair guides, device wikis and answers.
|
||||||
|
|
||||||
iFixit is the largest, open repair community on the web. The site contains nearly
|
iFixit is the largest, open repair community on the web. The site contains nearly
|
||||||
100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
|
100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
"""Loads image files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredImageLoader(UnstructuredFileLoader):
|
class UnstructuredImageLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses Unstructured to load PNG and JPG files.
|
"""Load `PNG` and `JPG` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -1,9 +1,3 @@
|
|||||||
"""Loads image captions.
|
|
||||||
|
|
||||||
By default, the loader utilizes the pre-trained BLIP image captioning model.
|
|
||||||
https://huggingface.co/Salesforce/blip-image-captioning-base
|
|
||||||
|
|
||||||
"""
|
|
||||||
from typing import Any, List, Tuple, Union
|
from typing import Any, List, Tuple, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -13,7 +7,12 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ImageCaptionLoader(BaseLoader):
|
class ImageCaptionLoader(BaseLoader):
|
||||||
"""Loads the captions of an image"""
|
"""Load image captions.
|
||||||
|
|
||||||
|
By default, the loader utilizes the pre-trained
|
||||||
|
Salesforce BLIP image captioning model.
|
||||||
|
https://huggingface.co/Salesforce/blip-image-captioning-base
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads IMSDb."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class IMSDbLoader(WebBaseLoader):
|
class IMSDbLoader(WebBaseLoader):
|
||||||
"""Loads IMSDb webpages."""
|
"""Load `IMSDb` webpages."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load webpage."""
|
"""Load webpage."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that fetches data from IUGU"""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
@ -17,7 +16,7 @@ IUGU_ENDPOINTS = {
|
|||||||
|
|
||||||
|
|
||||||
class IuguLoader(BaseLoader):
|
class IuguLoader(BaseLoader):
|
||||||
"""Loader that fetches data from IUGU."""
|
"""Load from `IUGU`."""
|
||||||
|
|
||||||
def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
|
def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
|
||||||
"""Initialize the IUGU resource.
|
"""Initialize the IUGU resource.
|
||||||
|
@ -11,8 +11,7 @@ LINK_NOTE_TEMPLATE = "joplin://x-callback-url/openNote?id={id}"
|
|||||||
|
|
||||||
|
|
||||||
class JoplinLoader(BaseLoader):
|
class JoplinLoader(BaseLoader):
|
||||||
"""
|
"""Load notes from `Joplin`.
|
||||||
Loader that fetches notes from Joplin.
|
|
||||||
|
|
||||||
In order to use this loader, you need to have Joplin running with the
|
In order to use this loader, you need to have Joplin running with the
|
||||||
Web Clipper enabled (look for "Web Clipper" in the app settings).
|
Web Clipper enabled (look for "Web Clipper" in the app settings).
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads data from JSON."""
|
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, List, Optional, Union
|
from typing import Any, Callable, Dict, List, Optional, Union
|
||||||
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class JSONLoader(BaseLoader):
|
class JSONLoader(BaseLoader):
|
||||||
"""Loads a JSON file using a jq schema.
|
"""Load a `JSON` file using a `jq` schema.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
|
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads LarkSuite (FeiShu) document json dump."""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import Any, Iterator, List
|
from typing import Any, Iterator, List
|
||||||
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class LarkSuiteDocLoader(BaseLoader):
|
class LarkSuiteDocLoader(BaseLoader):
|
||||||
"""Loads LarkSuite (FeiShu) document."""
|
"""Load from `LarkSuite` (`FeiShu`)."""
|
||||||
|
|
||||||
def __init__(self, domain: str, access_token: str, document_id: str):
|
def __init__(self, domain: str, access_token: str, document_id: str):
|
||||||
"""Initialize with domain, access_token (tenant / user), and document_id.
|
"""Initialize with domain, access_token (tenant / user), and document_id.
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
"""Loads Markdown files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses Unstructured to load markdown files.
|
"""Load `Markdown` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Mastodon document loader."""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@ -23,7 +22,7 @@ def _dependable_mastodon_import() -> mastodon:
|
|||||||
|
|
||||||
|
|
||||||
class MastodonTootsLoader(BaseLoader):
|
class MastodonTootsLoader(BaseLoader):
|
||||||
"""Mastodon toots loader."""
|
"""Load the `Mastodon` 'toots'."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -8,7 +8,7 @@ from langchain.utilities.max_compute import MaxComputeAPIWrapper
|
|||||||
|
|
||||||
|
|
||||||
class MaxComputeLoader(BaseLoader):
|
class MaxComputeLoader(BaseLoader):
|
||||||
"""Loads a query result from Alibaba Cloud MaxCompute table into documents."""
|
"""Load from `Alibaba Cloud MaxCompute` table."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load Data from a MediaWiki dump xml."""
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Sequence, Union
|
from typing import List, Optional, Sequence, Union
|
||||||
@ -10,8 +9,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MWDumpLoader(BaseLoader):
|
class MWDumpLoader(BaseLoader):
|
||||||
"""
|
"""Load `MediaWiki` dump from an `XML` file.
|
||||||
Load MediaWiki dump from XML file
|
|
||||||
Example:
|
Example:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Load MHTML files, enriching metadata with page title."""
|
|
||||||
|
|
||||||
import email
|
import email
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, List, Union
|
from typing import Dict, List, Union
|
||||||
@ -11,7 +9,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MHTMLLoader(BaseLoader):
|
class MHTMLLoader(BaseLoader):
|
||||||
"""Loader that uses beautiful soup to parse HTML files."""
|
"""Parse `MHTML` files with `BeautifulSoup`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that fetches data from Modern Treasury"""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from base64 import b64encode
|
from base64 import b64encode
|
||||||
@ -27,7 +26,7 @@ incoming_payment_details",
|
|||||||
|
|
||||||
|
|
||||||
class ModernTreasuryLoader(BaseLoader):
|
class ModernTreasuryLoader(BaseLoader):
|
||||||
"""Loader that fetches data from Modern Treasury."""
|
"""Load from `Modern Treasury`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class NewsURLLoader(BaseLoader):
|
class NewsURLLoader(BaseLoader):
|
||||||
"""Loader that uses newspaper to load news articles from URLs.
|
"""Load news articles from URLs using `newspaper`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: URLs to load. Each is loaded into its own document.
|
urls: URLs to load. Each is loaded into its own document.
|
||||||
|
@ -70,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
|
|||||||
|
|
||||||
|
|
||||||
class NotebookLoader(BaseLoader):
|
class NotebookLoader(BaseLoader):
|
||||||
"""Loads .ipynb notebook files."""
|
"""Load `Jupyter notebook` (.ipynb) files."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -80,7 +80,7 @@ class NotebookLoader(BaseLoader):
|
|||||||
remove_newline: bool = False,
|
remove_newline: bool = False,
|
||||||
traceback: bool = False,
|
traceback: bool = False,
|
||||||
):
|
):
|
||||||
"""Initialize with path.
|
"""Initialize with a path.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: The path to load the notebook from.
|
path: The path to load the notebook from.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Notion directory dump."""
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class NotionDirectoryLoader(BaseLoader):
|
class NotionDirectoryLoader(BaseLoader):
|
||||||
"""Loads Notion directory dump."""
|
"""Load `Notion directory` dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Notion DB loader for langchain"""
|
|
||||||
|
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -14,7 +12,7 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
|
|||||||
|
|
||||||
|
|
||||||
class NotionDBLoader(BaseLoader):
|
class NotionDBLoader(BaseLoader):
|
||||||
"""Notion DB Loader.
|
"""Load from `Notion DB`.
|
||||||
|
|
||||||
Reads content from pages within a Notion Database.
|
Reads content from pages within a Notion Database.
|
||||||
Args:
|
Args:
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Extract text from any file type."""
|
|
||||||
import json
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -9,7 +8,7 @@ from langchain.tools.nuclia.tool import NucliaUnderstandingAPI
|
|||||||
|
|
||||||
|
|
||||||
class NucliaLoader(BaseLoader):
|
class NucliaLoader(BaseLoader):
|
||||||
"""Extract text from any file type."""
|
"""Load from any file type using `Nuclia Understanding API`."""
|
||||||
|
|
||||||
def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI):
|
def __init__(self, path: str, nuclia_tool: NucliaUnderstandingAPI):
|
||||||
self.nua = nuclia_tool
|
self.nua = nuclia_tool
|
||||||
|
@ -7,7 +7,7 @@ from langchain.document_loaders.obs_file import OBSFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class OBSDirectoryLoader(BaseLoader):
|
class OBSDirectoryLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from Huawei OBS."""
|
"""Load from `Huawei OBS directory`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class OBSFileLoader(BaseLoader):
|
class OBSFileLoader(BaseLoader):
|
||||||
"""Loader for Huawei OBS file."""
|
"""Load from the `Huawei OBS file`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Obsidian directory dump."""
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ObsidianLoader(BaseLoader):
|
class ObsidianLoader(BaseLoader):
|
||||||
"""Loads Obsidian files from disk."""
|
"""Load `Obsidian` files from directory."""
|
||||||
|
|
||||||
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads OpenOffice ODT files."""
|
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredODTLoader(UnstructuredFileLoader):
|
class UnstructuredODTLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load OpenOffice ODT files.
|
"""Load `OpenOffice ODT` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
|
@ -60,7 +60,7 @@ class _SupportedFileTypes(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class OneDriveLoader(BaseLoader, BaseModel):
|
class OneDriveLoader(BaseLoader, BaseModel):
|
||||||
"""Loads data from OneDrive."""
|
"""Load from `Microsoft OneDrive`."""
|
||||||
|
|
||||||
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
|
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
|
||||||
""" The settings for the OneDrive API client."""
|
""" The settings for the OneDrive API client."""
|
||||||
|
@ -16,7 +16,7 @@ CHUNK_SIZE = 1024 * 1024 * 5
|
|||||||
|
|
||||||
|
|
||||||
class OneDriveFileLoader(BaseLoader, BaseModel):
|
class OneDriveFileLoader(BaseLoader, BaseModel):
|
||||||
"""Loads a file from OneDrive."""
|
"""Load a file from `Microsoft OneDrive`."""
|
||||||
|
|
||||||
file: File = Field(...)
|
file: File = Field(...)
|
||||||
"""The file to load."""
|
"""The file to load."""
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class OpenCityDataLoader(BaseLoader):
|
class OpenCityDataLoader(BaseLoader):
|
||||||
"""Loads Open City data."""
|
"""Load from `Open City`."""
|
||||||
|
|
||||||
def __init__(self, city_id: str, dataset_id: str, limit: int):
|
def __init__(self, city_id: str, dataset_id: str, limit: int):
|
||||||
"""Initialize with dataset_id.
|
"""Initialize with dataset_id.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Org-Mode files."""
|
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -8,7 +7,8 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredOrgModeLoader(UnstructuredFileLoader):
|
class UnstructuredOrgModeLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load Org-Mode files.
|
"""Load `Org-Mode` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads PDF files."""
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -30,7 +29,8 @@ logger = logging.getLogger(__file__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPDFLoader(UnstructuredFileLoader):
|
class UnstructuredPDFLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load PDF files.
|
"""Load `PDF` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
@ -59,7 +59,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
|
|||||||
|
|
||||||
|
|
||||||
class BasePDFLoader(BaseLoader, ABC):
|
class BasePDFLoader(BaseLoader, ABC):
|
||||||
"""Base loader class for PDF files.
|
"""Base Loader class for `PDF` files.
|
||||||
|
|
||||||
Defaults to check for local file, but if the file is a web path, it will download it
|
Defaults to check for local file, but if the file is a web path, it will download it
|
||||||
to a temporary file, use it, then clean up the temporary file after completion
|
to a temporary file, use it, then clean up the temporary file after completion
|
||||||
@ -122,7 +122,7 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
|
|
||||||
|
|
||||||
class OnlinePDFLoader(BasePDFLoader):
|
class OnlinePDFLoader(BasePDFLoader):
|
||||||
"""Loads online PDFs."""
|
"""Load online `PDF`."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
@ -131,7 +131,7 @@ class OnlinePDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFLoader(BasePDFLoader):
|
class PyPDFLoader(BasePDFLoader):
|
||||||
"""Loads a PDF with pypdf and chunks at character level.
|
"""Load `PDF` using `pypdf` and chunks at character level.
|
||||||
|
|
||||||
Loader also stores page numbers in metadata.
|
Loader also stores page numbers in metadata.
|
||||||
"""
|
"""
|
||||||
@ -162,7 +162,7 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFium2Loader(BasePDFLoader):
|
class PyPDFium2Loader(BasePDFLoader):
|
||||||
"""Loads a PDF with pypdfium2 and chunks at character level."""
|
"""Load `PDF` using `pypdfium2` and chunks at character level."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
@ -182,7 +182,7 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFDirectoryLoader(BaseLoader):
|
class PyPDFDirectoryLoader(BaseLoader):
|
||||||
"""Loads a directory with PDF files with pypdf and chunks at character level.
|
"""Load a directory with `PDF` files using `pypdf` and chunks at character level.
|
||||||
|
|
||||||
Loader also stores page numbers in metadata.
|
Loader also stores page numbers in metadata.
|
||||||
"""
|
"""
|
||||||
@ -227,7 +227,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PDFMinerLoader(BasePDFLoader):
|
class PDFMinerLoader(BasePDFLoader):
|
||||||
"""Loader that uses PDFMiner to load PDF files."""
|
"""Load `PDF` files using `PDFMiner`."""
|
||||||
|
|
||||||
def __init__(self, file_path: str) -> None:
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
@ -255,7 +255,7 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||||
"""Loader that uses PDFMiner to load PDF files as HTML content."""
|
"""Load `PDF` files as HTML content using `PDFMiner`."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
@ -289,7 +289,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyMuPDFLoader(BasePDFLoader):
|
class PyMuPDFLoader(BasePDFLoader):
|
||||||
"""Loader that uses PyMuPDF to load PDF files."""
|
"""Load `PDF` files using `PyMuPDF`."""
|
||||||
|
|
||||||
def __init__(self, file_path: str) -> None:
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
@ -314,7 +314,7 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
||||||
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
|
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
|
||||||
class MathpixPDFLoader(BasePDFLoader):
|
class MathpixPDFLoader(BasePDFLoader):
|
||||||
"""This class uses Mathpix service to load PDF files."""
|
"""Load `PDF` files using `Mathpix` service."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -433,7 +433,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PDFPlumberLoader(BasePDFLoader):
|
class PDFPlumberLoader(BasePDFLoader):
|
||||||
"""Loader that uses pdfplumber to load PDF files."""
|
"""Load `PDF` files using `pdfplumber`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
|
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
|
||||||
@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AmazonTextractPDFLoader(BasePDFLoader):
|
class AmazonTextractPDFLoader(BasePDFLoader):
|
||||||
"""Loads a PDF document from local file system, HTTP or S3.
|
"""Load `PDF` files from a local file system, HTTP or S3.
|
||||||
|
|
||||||
To authenticate, the AWS client uses the following methods to
|
To authenticate, the AWS client uses the following methods to
|
||||||
automatically load credentials:
|
automatically load credentials:
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads PowerPoint files."""
|
|
||||||
import os
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -6,7 +5,8 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load PowerPoint files.
|
"""Load `Microsoft PowerPoint` files using `Unstructured`.
|
||||||
|
|
||||||
Works with both .ppt and .pptx files.
|
Works with both .ppt and .pptx files.
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads documents from Psychic.dev."""
|
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class PsychicLoader(BaseLoader):
|
class PsychicLoader(BaseLoader):
|
||||||
"""Loads documents from Psychic.dev."""
|
"""Load from `Psychic.dev`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, api_key: str, account_id: str, connector_id: Optional[str] = None
|
self, api_key: str, account_id: str, connector_id: Optional[str] = None
|
||||||
|
@ -6,7 +6,7 @@ from langchain.utilities.pubmed import PubMedAPIWrapper
|
|||||||
|
|
||||||
|
|
||||||
class PubMedLoader(BaseLoader):
|
class PubMedLoader(BaseLoader):
|
||||||
"""Loads a query result from PubMed biomedical library into a list of Documents.
|
"""Load from the `PubMed` biomedical library.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
query: The query to be passed to the PubMed API.
|
query: The query to be passed to the PubMed API.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load from a Spark Dataframe object"""
|
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
@ -14,7 +13,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class PySparkDataFrameLoader(BaseLoader):
|
class PySparkDataFrameLoader(BaseLoader):
|
||||||
"""Load PySpark DataFrames"""
|
"""Load `PySpark` DataFrames."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -4,9 +4,7 @@ from langchain.document_loaders.text import TextLoader
|
|||||||
|
|
||||||
|
|
||||||
class PythonLoader(TextLoader):
|
class PythonLoader(TextLoader):
|
||||||
"""
|
"""Load `Python` files, respecting any non-default encoding if specified."""
|
||||||
Load Python files, respecting any non-default encoding if specified.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path.
|
"""Initialize with a file path.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads ReadTheDocs documentation directory dump."""
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional, Tuple, Union
|
from typing import Any, List, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ReadTheDocsLoader(BaseLoader):
|
class ReadTheDocsLoader(BaseLoader):
|
||||||
"""Loads ReadTheDocs documentation directory dump."""
|
"""Load `ReadTheDocs` documentation directory."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class RecursiveUrlLoader(BaseLoader):
|
class RecursiveUrlLoader(BaseLoader):
|
||||||
"""Loads all child links from a given url."""
|
"""Load all child links from a URL page."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Reddit document loader."""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
|
from typing import TYPE_CHECKING, Iterable, List, Optional, Sequence
|
||||||
@ -21,7 +20,8 @@ def _dependable_praw_import() -> praw:
|
|||||||
|
|
||||||
|
|
||||||
class RedditPostsLoader(BaseLoader):
|
class RedditPostsLoader(BaseLoader):
|
||||||
"""Reddit posts loader.
|
"""Load `Reddit` posts.
|
||||||
|
|
||||||
Read posts on a subreddit.
|
Read posts on a subreddit.
|
||||||
First, you need to go to
|
First, you need to go to
|
||||||
https://www.reddit.com/prefs/apps/
|
https://www.reddit.com/prefs/apps/
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Roam directory dump."""
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class RoamLoader(BaseLoader):
|
class RoamLoader(BaseLoader):
|
||||||
"""Loads Roam files from disk."""
|
"""Load `Roam` files from a directory."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with a path."""
|
"""Initialize with a path."""
|
||||||
|
@ -17,7 +17,7 @@ class ColumnNotFoundError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class RocksetLoader(BaseLoader):
|
class RocksetLoader(BaseLoader):
|
||||||
"""Wrapper around Rockset db
|
"""Load from a `Rockset` database.
|
||||||
|
|
||||||
To use, you should have the `rockset` python package installed.
|
To use, you should have the `rockset` python package installed.
|
||||||
|
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that uses unstructured to load HTML files."""
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, Iterator, List, Optional, Sequence
|
from typing import Any, Iterator, List, Optional, Sequence
|
||||||
|
|
||||||
@ -10,7 +9,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class RSSFeedLoader(BaseLoader):
|
class RSSFeedLoader(BaseLoader):
|
||||||
"""Loader that uses newspaper to load news articles from RSS feeds.
|
"""Load news articles from `RSS` feeds using `Unstructured`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: URLs for RSS feeds to load. Each article in the feed is loaded into its own document.
|
urls: URLs for RSS feeds to load. Each article in the feed is loaded into its own document.
|
||||||
|
@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredRSTLoader(UnstructuredFileLoader):
|
class UnstructuredRSTLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load RST files.
|
"""Load `RST` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
|
@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredRTFLoader(UnstructuredFileLoader):
|
class UnstructuredRTFLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load RTF files.
|
"""Load `RTF` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from an AWS S3 directory."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.s3_file import S3FileLoader
|
|||||||
|
|
||||||
|
|
||||||
class S3DirectoryLoader(BaseLoader):
|
class S3DirectoryLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from an AWS S3."""
|
"""Load from `Amazon AWS S3` directory."""
|
||||||
|
|
||||||
def __init__(self, bucket: str, prefix: str = ""):
|
def __init__(self, bucket: str, prefix: str = ""):
|
||||||
"""Initialize with bucket and key name.
|
"""Initialize with bucket and key name.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from an AWS S3 file."""
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class S3FileLoader(BaseLoader):
|
class S3FileLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from an AWS S3 file."""
|
"""Load from `Amazon AWS S3` file."""
|
||||||
|
|
||||||
def __init__(self, bucket: str, key: str):
|
def __init__(self, bucket: str, key: str):
|
||||||
"""Initialize with bucket and key name.
|
"""Initialize with bucket and key name.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that fetches a sitemap and loads those URLs."""
|
|
||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
from typing import Any, Callable, Generator, Iterable, List, Optional
|
from typing import Any, Callable, Generator, Iterable, List, Optional
|
||||||
@ -22,7 +21,7 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N
|
|||||||
|
|
||||||
|
|
||||||
class SitemapLoader(WebBaseLoader):
|
class SitemapLoader(WebBaseLoader):
|
||||||
"""Loader that fetches a sitemap and loads those URLs."""
|
"""Load a sitemap and its URLs."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader for documents from a Slack export."""
|
|
||||||
import json
|
import json
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class SlackDirectoryLoader(BaseLoader):
|
class SlackDirectoryLoader(BaseLoader):
|
||||||
"""Loads documents from a Slack directory dump."""
|
"""Load from a `Slack` directory dump."""
|
||||||
|
|
||||||
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
|
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
|
||||||
"""Initialize the SlackDirectoryLoader.
|
"""Initialize the SlackDirectoryLoader.
|
||||||
|
@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class SnowflakeLoader(BaseLoader):
|
class SnowflakeLoader(BaseLoader):
|
||||||
"""Loads a query result from Snowflake into a list of documents.
|
"""Load from `Snowflake` API.
|
||||||
|
|
||||||
Each document represents one row of the result. The `page_content_columns`
|
Each document represents one row of the result. The `page_content_columns`
|
||||||
are written into the `page_content` of the document. The `metadata_columns`
|
are written into the `page_content` of the document. The `metadata_columns`
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that fetches data from Spreedly API."""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -20,7 +19,7 @@ SPREEDLY_ENDPOINTS = {
|
|||||||
|
|
||||||
|
|
||||||
class SpreedlyLoader(BaseLoader):
|
class SpreedlyLoader(BaseLoader):
|
||||||
"""Loader that fetches data from Spreedly API."""
|
"""Load from `Spreedly` API."""
|
||||||
|
|
||||||
def __init__(self, access_token: str, resource: str) -> None:
|
def __init__(self, access_token: str, resource: str) -> None:
|
||||||
"""Initialize with an access token and a resource.
|
"""Initialize with an access token and a resource.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader for .srt (subtitle) files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class SRTLoader(BaseLoader):
|
class SRTLoader(BaseLoader):
|
||||||
"""Loader for .srt (subtitle) files."""
|
"""Load `.srt` (subtitle) files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that fetches data from Stripe"""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
@ -18,7 +17,7 @@ STRIPE_ENDPOINTS = {
|
|||||||
|
|
||||||
|
|
||||||
class StripeLoader(BaseLoader):
|
class StripeLoader(BaseLoader):
|
||||||
"""Loader that fetches data from Stripe."""
|
"""Load from `Stripe` API."""
|
||||||
|
|
||||||
def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
|
def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
|
||||||
"""Initialize with a resource and an access token.
|
"""Initialize with a resource and an access token.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Telegram chat json dump."""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
@ -24,7 +23,7 @@ def concatenate_rows(row: dict) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class TelegramChatFileLoader(BaseLoader):
|
class TelegramChatFileLoader(BaseLoader):
|
||||||
"""Loads Telegram chat json directory dump."""
|
"""Load from `Telegram chat` dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with a path."""
|
"""Initialize with a path."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from Tencent Cloud COS directory."""
|
|
||||||
from typing import Any, Iterator, List
|
from typing import Any, Iterator, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.tencent_cos_file import TencentCOSFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class TencentCOSDirectoryLoader(BaseLoader):
|
class TencentCOSDirectoryLoader(BaseLoader):
|
||||||
"""Loader for Tencent Cloud COS directory."""
|
"""Load from `Tencent Cloud COS` directory."""
|
||||||
|
|
||||||
def __init__(self, conf: Any, bucket: str, prefix: str = ""):
|
def __init__(self, conf: Any, bucket: str, prefix: str = ""):
|
||||||
"""Initialize with COS config, bucket and prefix.
|
"""Initialize with COS config, bucket and prefix.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from Tencent Cloud COS file."""
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, Iterator, List
|
from typing import Any, Iterator, List
|
||||||
@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class TencentCOSFileLoader(BaseLoader):
|
class TencentCOSFileLoader(BaseLoader):
|
||||||
"""Loader for Tencent Cloud COS file."""
|
"""Load from `Tencent Cloud COS` file."""
|
||||||
|
|
||||||
def __init__(self, conf: Any, bucket: str, key: str):
|
def __init__(self, conf: Any, bucket: str, key: str):
|
||||||
"""Initialize with COS config, bucket and key name.
|
"""Initialize with COS config, bucket and key name.
|
||||||
|
@ -6,7 +6,7 @@ from langchain.utilities.tensorflow_datasets import TensorflowDatasets
|
|||||||
|
|
||||||
|
|
||||||
class TensorflowDatasetLoader(BaseLoader):
|
class TensorflowDatasetLoader(BaseLoader):
|
||||||
"""Loads from TensorFlow Datasets into a list of Documents.
|
"""Load from `TensorFlow Dataset`.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
dataset_name: the name of the dataset to load
|
dataset_name: the name of the dataset to load
|
||||||
|
@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class TextLoader(BaseLoader):
|
class TextLoader(BaseLoader):
|
||||||
"""Load text files.
|
"""Load text file.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads HTML to markdown using 2markdown."""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Iterator, List
|
from typing import Iterator, List
|
||||||
@ -10,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ToMarkdownLoader(BaseLoader):
|
class ToMarkdownLoader(BaseLoader):
|
||||||
"""Loads HTML to markdown using 2markdown."""
|
"""Load `HTML` using `2markdown API`."""
|
||||||
|
|
||||||
def __init__(self, url: str, api_key: str):
|
def __init__(self, url: str, api_key: str):
|
||||||
"""Initialize with url and api key."""
|
"""Initialize with url and api key."""
|
||||||
|
@ -7,11 +7,10 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class TomlLoader(BaseLoader):
|
class TomlLoader(BaseLoader):
|
||||||
"""
|
"""Load `TOML` files.
|
||||||
A TOML document loader that inherits from the BaseLoader class.
|
|
||||||
|
|
||||||
This class can be initialized with either a single source file or a source
|
It can load a single source file or several files in a single
|
||||||
directory containing TOML files.
|
directory.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, source: Union[str, Path]):
|
def __init__(self, source: Union[str, Path]):
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads cards from Trello"""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
|
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
|
||||||
@ -12,7 +11,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class TrelloLoader(BaseLoader):
|
class TrelloLoader(BaseLoader):
|
||||||
"""Trello loader. Reads all cards from a Trello board."""
|
"""Load cards from a `Trello` board."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -7,7 +7,9 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredTSVLoader(UnstructuredFileLoader):
|
class UnstructuredTSVLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load TSV files. Like other
|
"""Load `TSV` files using `Unstructured`.
|
||||||
|
|
||||||
|
Like other
|
||||||
Unstructured loaders, UnstructuredTSVLoader can be used in both
|
Unstructured loaders, UnstructuredTSVLoader can be used in both
|
||||||
"single" and "elements" mode. If you use the loader in "elements"
|
"single" and "elements" mode. If you use the loader in "elements"
|
||||||
mode, the TSV file will be a single Unstructured Table element.
|
mode, the TSV file will be a single Unstructured Table element.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Twitter document loader."""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||||
@ -22,8 +21,9 @@ def _dependable_tweepy_import() -> tweepy:
|
|||||||
|
|
||||||
|
|
||||||
class TwitterTweetLoader(BaseLoader):
|
class TwitterTweetLoader(BaseLoader):
|
||||||
"""Twitter tweets loader.
|
"""Load `Twitter` tweets.
|
||||||
Read tweets of user twitter handle.
|
|
||||||
|
Read tweets of the user's Twitter handle.
|
||||||
|
|
||||||
First you need to go to
|
First you need to go to
|
||||||
`https://developer.twitter.com/en/docs/twitter-api
|
`https://developer.twitter.com/en/docs/twitter-api
|
||||||
|
@ -130,7 +130,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||||
"""Loader that uses Unstructured to load files.
|
"""Load files using `Unstructured`.
|
||||||
|
|
||||||
The file loader uses the
|
The file loader uses the
|
||||||
unstructured partition function and will automatically detect the file
|
unstructured partition function and will automatically detect the file
|
||||||
@ -211,7 +211,7 @@ def get_elements_from_api(
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses the Unstructured API to load files.
|
"""Load files using `Unstructured` API.
|
||||||
|
|
||||||
By default, the loader makes a call to the hosted Unstructured API.
|
By default, the loader makes a call to the hosted Unstructured API.
|
||||||
If you are running the unstructured API locally, you can change the
|
If you are running the unstructured API locally, you can change the
|
||||||
@ -275,7 +275,7 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||||
"""Loader that uses Unstructured to load files.
|
"""Load files using `Unstructured`.
|
||||||
|
|
||||||
The file loader
|
The file loader
|
||||||
uses the unstructured partition function and will automatically detect the file
|
uses the unstructured partition function and will automatically detect the file
|
||||||
@ -322,7 +322,7 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
||||||
"""Loader that uses the Unstructured API to load files.
|
"""Load files using `Unstructured` API.
|
||||||
|
|
||||||
By default, the loader makes a call to the hosted Unstructured API.
|
By default, the loader makes a call to the hosted Unstructured API.
|
||||||
If you are running the unstructured API locally, you can change the
|
If you are running the unstructured API locally, you can change the
|
||||||
|
@ -9,7 +9,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredURLLoader(BaseLoader):
|
class UnstructuredURLLoader(BaseLoader):
|
||||||
"""Loader that use Unstructured to load files from remote URLs.
|
"""Load files from remote URLs using `Unstructured`.
|
||||||
|
|
||||||
Use the unstructured partition function to detect the MIME type
|
Use the unstructured partition function to detect the MIME type
|
||||||
and route the file to the appropriate partitioner.
|
and route the file to the appropriate partitioner.
|
||||||
|
|
||||||
|
@ -10,7 +10,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class PlaywrightURLLoader(BaseLoader):
|
class PlaywrightURLLoader(BaseLoader):
|
||||||
"""Loader that uses Playwright and to load a page and unstructured to load the html.
|
"""Load `HTML` pages with `Playwright` and parse with `Unstructured`.
|
||||||
|
|
||||||
This is useful for loading pages that require javascript to render.
|
This is useful for loading pages that require javascript to render.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
|
@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class SeleniumURLLoader(BaseLoader):
|
class SeleniumURLLoader(BaseLoader):
|
||||||
"""Loader that uses Selenium and to load a page and unstructured to load the html.
|
"""Load `HTML` pages with `Selenium` and parse with `Unstructured`.
|
||||||
|
|
||||||
This is useful for loading pages that require javascript to render.
|
This is useful for loading pages that require javascript to render.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
|
@ -10,7 +10,7 @@ from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper
|
|||||||
|
|
||||||
|
|
||||||
class WeatherDataLoader(BaseLoader):
|
class WeatherDataLoader(BaseLoader):
|
||||||
"""Weather Reader.
|
"""Load weather data with `Open Weather Map` API.
|
||||||
|
|
||||||
Reads the forecast & current weather of any location using OpenWeatherMap's free
|
Reads the forecast & current weather of any location using OpenWeatherMap's free
|
||||||
API. Checkout 'https://openweathermap.org/appid' for more on how to generate a free
|
API. Checkout 'https://openweathermap.org/appid' for more on how to generate a free
|
||||||
|
@ -37,7 +37,7 @@ def _build_metadata(soup: Any, url: str) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
class WebBaseLoader(BaseLoader):
|
class WebBaseLoader(BaseLoader):
|
||||||
"""Loader that uses urllib and beautiful soup to load webpages."""
|
"""Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
|
||||||
|
|
||||||
web_paths: List[str]
|
web_paths: List[str]
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class WhatsAppChatLoader(BaseLoader):
|
class WhatsAppChatLoader(BaseLoader):
|
||||||
"""Loads WhatsApp messages text file."""
|
"""Load `WhatsApp` messages text file."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with path."""
|
"""Initialize with path."""
|
||||||
|
@ -6,7 +6,8 @@ from langchain.utilities.wikipedia import WikipediaAPIWrapper
|
|||||||
|
|
||||||
|
|
||||||
class WikipediaLoader(BaseLoader):
|
class WikipediaLoader(BaseLoader):
|
||||||
"""Loads a query result from www.wikipedia.org into a list of Documents.
|
"""Load from `Wikipedia`.
|
||||||
|
|
||||||
The hard limit on the number of downloaded Documents is 300 for now.
|
The hard limit on the number of downloaded Documents is 300 for now.
|
||||||
|
|
||||||
Each wiki page represents one Document.
|
Each wiki page represents one Document.
|
||||||
|
@ -13,7 +13,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class Docx2txtLoader(BaseLoader, ABC):
|
class Docx2txtLoader(BaseLoader, ABC):
|
||||||
"""Loads a DOCX with docx2txt and chunks at character level.
|
"""Load `DOCX` file using `docx2txt` and chunks at character level.
|
||||||
|
|
||||||
Defaults to check for local file, but if the file is a web path, it will download it
|
Defaults to check for local file, but if the file is a web path, it will download it
|
||||||
to a temporary file, and use that, then clean up the temporary file after completion
|
to a temporary file, and use that, then clean up the temporary file after completion
|
||||||
@ -65,7 +65,8 @@ class Docx2txtLoader(BaseLoader, ABC):
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
|
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load word documents.
|
"""Load `Microsof Word` file using `Unstructured`.
|
||||||
|
|
||||||
Works with both .docx and .doc files.
|
Works with both .docx and .doc files.
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -8,7 +8,8 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredXMLLoader(UnstructuredFileLoader):
|
class UnstructuredXMLLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load XML files.
|
"""Load `XML` file using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
langchain Document object. If you use "elements" mode, the unstructured
|
langchain Document object. If you use "elements" mode, the unstructured
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class XorbitsLoader(BaseLoader):
|
class XorbitsLoader(BaseLoader):
|
||||||
"""Load Xorbits DataFrame."""
|
"""Load `Xorbits` DataFrame."""
|
||||||
|
|
||||||
def __init__(self, data_frame: Any, page_content_column: str = "text"):
|
def __init__(self, data_frame: Any, page_content_column: str = "text"):
|
||||||
"""Initialize with dataframe object.
|
"""Initialize with dataframe object.
|
||||||
|
@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
|
|||||||
|
|
||||||
|
|
||||||
class YoutubeLoader(BaseLoader):
|
class YoutubeLoader(BaseLoader):
|
||||||
"""Loads Youtube transcripts."""
|
"""Load `YouTube` transcripts."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GoogleApiYoutubeLoader(BaseLoader):
|
class GoogleApiYoutubeLoader(BaseLoader):
|
||||||
"""Loads all Videos from a Channel
|
"""Load all Videos from a `YouTube` Channel.
|
||||||
|
|
||||||
To use, you should have the ``googleapiclient,youtube_transcript_api``
|
To use, you should have the ``googleapiclient,youtube_transcript_api``
|
||||||
python package installed.
|
python package installed.
|
||||||
|
Loading…
Reference in New Issue
Block a user