langchain/libs/community/langchain_community/document_loaders/__init__.py
Sridhar Ramaswamy 9f1cbbc6ed
community[minor]: Add pebblo safe document loader (#16862)
- **Description:** The Pebblo open-source project enables developers to
safely load data into their Gen AI apps. It identifies semantic topics and
entities found in the loaded data and summarizes them in a
developer-friendly report.
  - **Dependencies:** none
  - **Twitter handle:** srics

@hwchase17
2024-02-12 21:56:12 -08:00

417 lines
16 KiB
Python

"""**Document Loaders** are classes to load Documents.

**Document Loaders** are usually used to load a lot of Documents in a single run.

**Class hierarchy:**

.. code-block::

    BaseLoader --> <name>Loader  # Examples: TextLoader, UnstructuredFileLoader

**Main helpers:**

.. code-block::

    Document, <name>TextSplitter
"""
from langchain_community.document_loaders.acreom import AcreomLoader
from langchain_community.document_loaders.airbyte import (
AirbyteCDKLoader,
AirbyteGongLoader,
AirbyteHubspotLoader,
AirbyteSalesforceLoader,
AirbyteShopifyLoader,
AirbyteStripeLoader,
AirbyteTypeformLoader,
AirbyteZendeskSupportLoader,
)
from langchain_community.document_loaders.airbyte_json import AirbyteJSONLoader
from langchain_community.document_loaders.airtable import AirtableLoader
from langchain_community.document_loaders.apify_dataset import ApifyDatasetLoader
from langchain_community.document_loaders.arcgis_loader import ArcGISLoader
from langchain_community.document_loaders.arxiv import ArxivLoader
from langchain_community.document_loaders.assemblyai import (
AssemblyAIAudioTranscriptLoader,
)
from langchain_community.document_loaders.astradb import AstraDBLoader
from langchain_community.document_loaders.async_html import AsyncHtmlLoader
from langchain_community.document_loaders.athena import AthenaLoader
from langchain_community.document_loaders.azlyrics import AZLyricsLoader
from langchain_community.document_loaders.azure_ai_data import (
AzureAIDataLoader,
)
from langchain_community.document_loaders.azure_blob_storage_container import (
AzureBlobStorageContainerLoader,
)
from langchain_community.document_loaders.azure_blob_storage_file import (
AzureBlobStorageFileLoader,
)
from langchain_community.document_loaders.bibtex import BibtexLoader
from langchain_community.document_loaders.bigquery import BigQueryLoader
from langchain_community.document_loaders.bilibili import BiliBiliLoader
from langchain_community.document_loaders.blackboard import BlackboardLoader
from langchain_community.document_loaders.blob_loaders import (
Blob,
BlobLoader,
FileSystemBlobLoader,
YoutubeAudioLoader,
)
from langchain_community.document_loaders.blockchain import BlockchainDocumentLoader
from langchain_community.document_loaders.brave_search import BraveSearchLoader
from langchain_community.document_loaders.browserless import BrowserlessLoader
from langchain_community.document_loaders.cassandra import CassandraLoader
from langchain_community.document_loaders.chatgpt import ChatGPTLoader
from langchain_community.document_loaders.chromium import AsyncChromiumLoader
from langchain_community.document_loaders.college_confidential import (
CollegeConfidentialLoader,
)
from langchain_community.document_loaders.concurrent import ConcurrentLoader
from langchain_community.document_loaders.confluence import ConfluenceLoader
from langchain_community.document_loaders.conllu import CoNLLULoader
from langchain_community.document_loaders.couchbase import CouchbaseLoader
from langchain_community.document_loaders.csv_loader import (
CSVLoader,
UnstructuredCSVLoader,
)
from langchain_community.document_loaders.cube_semantic import CubeSemanticLoader
from langchain_community.document_loaders.datadog_logs import DatadogLogsLoader
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.document_loaders.diffbot import DiffbotLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.discord import DiscordChatLoader
from langchain_community.document_loaders.doc_intelligence import (
AzureAIDocumentIntelligenceLoader,
)
from langchain_community.document_loaders.docugami import DocugamiLoader
from langchain_community.document_loaders.docusaurus import DocusaurusLoader
from langchain_community.document_loaders.dropbox import DropboxLoader
from langchain_community.document_loaders.duckdb_loader import DuckDBLoader
from langchain_community.document_loaders.email import (
OutlookMessageLoader,
UnstructuredEmailLoader,
)
from langchain_community.document_loaders.epub import UnstructuredEPubLoader
from langchain_community.document_loaders.etherscan import EtherscanLoader
from langchain_community.document_loaders.evernote import EverNoteLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
from langchain_community.document_loaders.facebook_chat import FacebookChatLoader
from langchain_community.document_loaders.fauna import FaunaLoader
from langchain_community.document_loaders.figma import FigmaFileLoader
from langchain_community.document_loaders.gcs_directory import GCSDirectoryLoader
from langchain_community.document_loaders.gcs_file import GCSFileLoader
from langchain_community.document_loaders.geodataframe import GeoDataFrameLoader
from langchain_community.document_loaders.git import GitLoader
from langchain_community.document_loaders.gitbook import GitbookLoader
from langchain_community.document_loaders.github import (
GithubFileLoader,
GitHubIssuesLoader,
)
from langchain_community.document_loaders.google_speech_to_text import (
GoogleSpeechToTextLoader,
)
from langchain_community.document_loaders.googledrive import GoogleDriveLoader
from langchain_community.document_loaders.gutenberg import GutenbergLoader
from langchain_community.document_loaders.hn import HNLoader
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain_community.document_loaders.html_bs import BSHTMLLoader
from langchain_community.document_loaders.hugging_face_dataset import (
HuggingFaceDatasetLoader,
)
from langchain_community.document_loaders.ifixit import IFixitLoader
from langchain_community.document_loaders.image import UnstructuredImageLoader
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
from langchain_community.document_loaders.imsdb import IMSDbLoader
from langchain_community.document_loaders.iugu import IuguLoader
from langchain_community.document_loaders.joplin import JoplinLoader
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_community.document_loaders.lakefs import LakeFSLoader
from langchain_community.document_loaders.larksuite import LarkSuiteDocLoader
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_community.document_loaders.mastodon import MastodonTootsLoader
from langchain_community.document_loaders.max_compute import MaxComputeLoader
from langchain_community.document_loaders.mediawikidump import MWDumpLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_community.document_loaders.mhtml import MHTMLLoader
from langchain_community.document_loaders.modern_treasury import ModernTreasuryLoader
from langchain_community.document_loaders.mongodb import MongodbLoader
from langchain_community.document_loaders.news import NewsURLLoader
from langchain_community.document_loaders.notebook import NotebookLoader
from langchain_community.document_loaders.notion import NotionDirectoryLoader
from langchain_community.document_loaders.notiondb import NotionDBLoader
from langchain_community.document_loaders.obs_directory import OBSDirectoryLoader
from langchain_community.document_loaders.obs_file import OBSFileLoader
from langchain_community.document_loaders.obsidian import ObsidianLoader
from langchain_community.document_loaders.odt import UnstructuredODTLoader
from langchain_community.document_loaders.onedrive import OneDriveLoader
from langchain_community.document_loaders.onedrive_file import OneDriveFileLoader
from langchain_community.document_loaders.open_city_data import OpenCityDataLoader
from langchain_community.document_loaders.org_mode import UnstructuredOrgModeLoader
from langchain_community.document_loaders.pdf import (
AmazonTextractPDFLoader,
MathpixPDFLoader,
OnlinePDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PDFPlumberLoader,
PyMuPDFLoader,
PyPDFDirectoryLoader,
PyPDFium2Loader,
PyPDFLoader,
UnstructuredPDFLoader,
)
from langchain_community.document_loaders.pebblo import PebbloSafeLoader
from langchain_community.document_loaders.polars_dataframe import PolarsDataFrameLoader
from langchain_community.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain_community.document_loaders.psychic import PsychicLoader
from langchain_community.document_loaders.pubmed import PubMedLoader
from langchain_community.document_loaders.pyspark_dataframe import (
PySparkDataFrameLoader,
)
from langchain_community.document_loaders.python import PythonLoader
from langchain_community.document_loaders.readthedocs import ReadTheDocsLoader
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders.reddit import RedditPostsLoader
from langchain_community.document_loaders.roam import RoamLoader
from langchain_community.document_loaders.rocksetdb import RocksetLoader
from langchain_community.document_loaders.rss import RSSFeedLoader
from langchain_community.document_loaders.rst import UnstructuredRSTLoader
from langchain_community.document_loaders.rtf import UnstructuredRTFLoader
from langchain_community.document_loaders.s3_directory import S3DirectoryLoader
from langchain_community.document_loaders.s3_file import S3FileLoader
from langchain_community.document_loaders.sharepoint import SharePointLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.document_loaders.slack_directory import SlackDirectoryLoader
from langchain_community.document_loaders.snowflake_loader import SnowflakeLoader
from langchain_community.document_loaders.spreedly import SpreedlyLoader
from langchain_community.document_loaders.srt import SRTLoader
from langchain_community.document_loaders.stripe import StripeLoader
from langchain_community.document_loaders.surrealdb import SurrealDBLoader
from langchain_community.document_loaders.telegram import (
TelegramChatApiLoader,
TelegramChatFileLoader,
)
from langchain_community.document_loaders.tencent_cos_directory import (
TencentCOSDirectoryLoader,
)
from langchain_community.document_loaders.tencent_cos_file import TencentCOSFileLoader
from langchain_community.document_loaders.tensorflow_datasets import (
TensorflowDatasetLoader,
)
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.tomarkdown import ToMarkdownLoader
from langchain_community.document_loaders.toml import TomlLoader
from langchain_community.document_loaders.trello import TrelloLoader
from langchain_community.document_loaders.tsv import UnstructuredTSVLoader
from langchain_community.document_loaders.twitter import TwitterTweetLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredAPIFileIOLoader,
UnstructuredAPIFileLoader,
UnstructuredFileIOLoader,
UnstructuredFileLoader,
)
from langchain_community.document_loaders.url import UnstructuredURLLoader
from langchain_community.document_loaders.url_playwright import PlaywrightURLLoader
from langchain_community.document_loaders.url_selenium import SeleniumURLLoader
from langchain_community.document_loaders.vsdx import VsdxLoader
from langchain_community.document_loaders.weather import WeatherDataLoader
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.document_loaders.whatsapp_chat import WhatsAppChatLoader
from langchain_community.document_loaders.wikipedia import WikipediaLoader
from langchain_community.document_loaders.word_document import (
Docx2txtLoader,
UnstructuredWordDocumentLoader,
)
from langchain_community.document_loaders.xml import UnstructuredXMLLoader
from langchain_community.document_loaders.xorbits import XorbitsLoader
from langchain_community.document_loaders.youtube import (
GoogleApiClient,
GoogleApiYoutubeLoader,
YoutubeLoader,
)
# Legacy alias kept only for backwards compatibility: `PagedPDFSplitter` is
# bound to the very same object as `PyPDFLoader`. Use `PyPDFLoader` instead.
PagedPDFSplitter = PyPDFLoader
# Backwards-compatibility alias: `TelegramChatLoader` is bound to the very same
# object as `TelegramChatFileLoader`.
TelegramChatLoader = TelegramChatFileLoader
# Public API of this package: the names bound by
# `from langchain_community.document_loaders import *`.
# Each name must appear exactly once; the backwards-compatibility aliases
# `PagedPDFSplitter` and `TelegramChatLoader` are exported as well.
__all__ = [
    "AcreomLoader",
    "AsyncHtmlLoader",
    "AsyncChromiumLoader",
    "AZLyricsLoader",
    "AirbyteCDKLoader",
    "AirbyteGongLoader",
    "AirbyteJSONLoader",
    "AirbyteHubspotLoader",
    "AirbyteSalesforceLoader",
    "AirbyteShopifyLoader",
    "AirbyteStripeLoader",
    "AirbyteTypeformLoader",
    "AirbyteZendeskSupportLoader",
    "AirtableLoader",
    "AmazonTextractPDFLoader",
    "ApifyDatasetLoader",
    "ArcGISLoader",
    "ArxivLoader",
    "AssemblyAIAudioTranscriptLoader",
    "AstraDBLoader",
    "AthenaLoader",
    "AzureAIDataLoader",
    "AzureAIDocumentIntelligenceLoader",
    "AzureBlobStorageContainerLoader",
    "AzureBlobStorageFileLoader",
    "BSHTMLLoader",
    "BibtexLoader",
    "BigQueryLoader",
    "BiliBiliLoader",
    "BlackboardLoader",
    "Blob",
    "BlobLoader",
    "BlockchainDocumentLoader",
    "BraveSearchLoader",
    "BrowserlessLoader",
    "CassandraLoader",
    "CSVLoader",
    "ChatGPTLoader",
    "CoNLLULoader",
    "CollegeConfidentialLoader",
    "ConcurrentLoader",
    "ConfluenceLoader",
    "CouchbaseLoader",
    "CubeSemanticLoader",
    "DataFrameLoader",
    "DatadogLogsLoader",
    "PebbloSafeLoader",
    "DiffbotLoader",
    "DirectoryLoader",
    "DiscordChatLoader",
    "DocugamiLoader",
    "DocusaurusLoader",
    "Docx2txtLoader",
    "DropboxLoader",
    "DuckDBLoader",
    "EtherscanLoader",
    "EverNoteLoader",
    "FacebookChatLoader",
    "FaunaLoader",
    "FigmaFileLoader",
    "FileSystemBlobLoader",
    "GCSDirectoryLoader",
    "GCSFileLoader",
    "GeoDataFrameLoader",
    "GithubFileLoader",
    "GitHubIssuesLoader",
    "GitLoader",
    "GitbookLoader",
    "GoogleApiClient",
    "GoogleApiYoutubeLoader",
    "GoogleSpeechToTextLoader",
    "GoogleDriveLoader",
    "GutenbergLoader",
    "HNLoader",
    "HuggingFaceDatasetLoader",
    "IFixitLoader",
    "IMSDbLoader",
    "ImageCaptionLoader",
    "IuguLoader",
    "JSONLoader",
    "JoplinLoader",
    "LarkSuiteDocLoader",
    "LakeFSLoader",
    "MHTMLLoader",
    "MWDumpLoader",
    "MastodonTootsLoader",
    "MathpixPDFLoader",
    "MaxComputeLoader",
    "MergedDataLoader",
    "ModernTreasuryLoader",
    "MongodbLoader",
    "NewsURLLoader",
    "NotebookLoader",
    "NotionDBLoader",
    "NotionDirectoryLoader",
    "OBSDirectoryLoader",
    "OBSFileLoader",
    "ObsidianLoader",
    "OneDriveFileLoader",
    "OneDriveLoader",
    "OnlinePDFLoader",
    "OpenCityDataLoader",
    "OutlookMessageLoader",
    "PDFMinerLoader",
    "PDFMinerPDFasHTMLLoader",
    "PDFPlumberLoader",
    "PagedPDFSplitter",
    "PlaywrightURLLoader",
    "PolarsDataFrameLoader",
    "PsychicLoader",
    "PubMedLoader",
    "PyMuPDFLoader",
    "PyPDFDirectoryLoader",
    "PyPDFLoader",
    "PyPDFium2Loader",
    "PySparkDataFrameLoader",
    "PythonLoader",
    "RSSFeedLoader",
    "ReadTheDocsLoader",
    "RecursiveUrlLoader",
    "RedditPostsLoader",
    "RoamLoader",
    "RocksetLoader",
    "S3DirectoryLoader",
    "S3FileLoader",
    "SRTLoader",
    "SeleniumURLLoader",
    "SharePointLoader",
    "SitemapLoader",
    "SlackDirectoryLoader",
    "SnowflakeLoader",
    "SpreedlyLoader",
    "StripeLoader",
    "SurrealDBLoader",
    "TelegramChatApiLoader",
    "TelegramChatFileLoader",
    "TelegramChatLoader",
    "TensorflowDatasetLoader",
    "TencentCOSDirectoryLoader",
    "TencentCOSFileLoader",
    "TextLoader",
    "ToMarkdownLoader",
    "TomlLoader",
    "TrelloLoader",
    "TwitterTweetLoader",
    "UnstructuredAPIFileIOLoader",
    "UnstructuredAPIFileLoader",
    "UnstructuredCSVLoader",
    "UnstructuredEPubLoader",
    "UnstructuredEmailLoader",
    "UnstructuredExcelLoader",
    "UnstructuredFileIOLoader",
    "UnstructuredFileLoader",
    "UnstructuredHTMLLoader",
    "UnstructuredImageLoader",
    "UnstructuredMarkdownLoader",
    "UnstructuredODTLoader",
    "UnstructuredOrgModeLoader",
    "UnstructuredPDFLoader",
    "UnstructuredPowerPointLoader",
    "UnstructuredRSTLoader",
    "UnstructuredRTFLoader",
    "UnstructuredTSVLoader",
    "UnstructuredURLLoader",
    "UnstructuredWordDocumentLoader",
    "UnstructuredXMLLoader",
    "VsdxLoader",
    "WeatherDataLoader",
    "WebBaseLoader",
    "WhatsAppChatLoader",
    "WikipediaLoader",
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
]