From 0019d8a9485b4557a63ef9dd54c6c0a3df77f383 Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Wed, 27 Mar 2024 23:09:52 +0530 Subject: [PATCH] community[minor]: Add support for non-file-based Document Loaders in PebbloSafeLoader (#19574) **Description:** PebbloSafeLoader: Add support for non-file-based Document Loaders This pull request enhances PebbloSafeLoader by introducing support for several non-file-based Document Loaders. With this update, PebbloSafeLoader now seamlessly integrates with the following loaders: - GoogleDriveLoader - SlackDirectoryLoader - Unstructured EmailLoader **Issue:** NA **Dependencies:** - None **Twitter handle:** @Raj__725 --------- Co-authored-by: Rahul Tripathi --- .../document_loaders/pebblo.py | 4 ++- .../langchain_community/utilities/pebblo.py | 36 ++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 0875335ac1..8b67898cf2 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -125,7 +125,9 @@ class PebbloSafeLoader(BaseLoader): doc_content = [doc.dict() for doc in self.docs] docs = [] for doc in doc_content: - doc_source_path = get_full_path(doc.get("metadata", {}).get("source")) + doc_source_path = get_full_path( + doc.get("metadata", {}).get("source", self.source_path) + ) doc_source_owner = PebbloSafeLoader.get_file_owner_from_path( doc_source_path ) diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py index a52d3c45b4..a9c5e3bdc2 100644 --- a/libs/community/langchain_community/utilities/pebblo.py +++ b/libs/community/langchain_community/utilities/pebblo.py @@ -29,11 +29,28 @@ file_loader = [ "AmazonTextractPDFLoader", "CSVLoader", "UnstructuredExcelLoader", + "UnstructuredEmailLoader", ] -dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"] +dir_loader = [ + "DirectoryLoader", + "S3DirLoader", + "SlackDirectoryLoader", + "PyPDFDirectoryLoader", + "NotionDirectoryLoader", +] + in_memory = ["DataFrameLoader"] +remote_db = [ + "NotionDBLoader", + "GoogleDriveLoader", +] -LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory} +LOADER_TYPE_MAPPING = { + "file": file_loader, + "dir": dir_loader, + "in-memory": in_memory, + "remote_db": remote_db, +} SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory) @@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str: for loader_type, loaders in LOADER_TYPE_MAPPING.items(): if loader in loaders: return loader_type - return "unknown" + return "unsupported" def get_loader_full_path(loader: BaseLoader) -> str: @@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str: from langchain_community.document_loaders import ( DataFrameLoader, GCSFileLoader, + NotionDBLoader, S3FileLoader, ) @@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str: location = f"gc://{loader.bucket}/{loader.blob}" elif isinstance(loader, S3FileLoader): location = f"s3://{loader.bucket}/{loader.key}" + elif "source" in loader_dict: + location = loader_dict["source"] + if location and "channel" in loader_dict: + channel = loader_dict["channel"] + if channel: + location = f"{location}/{channel}" elif "path" in loader_dict: location = loader_dict["path"] elif "file_path" in loader_dict: location = loader_dict["file_path"] elif "web_paths" in loader_dict: - location = loader_dict["web_paths"][0] + web_paths = loader_dict["web_paths"] + if web_paths and isinstance(web_paths, list) and len(web_paths) > 0: + location = web_paths[0] # For in-memory types: elif isinstance(loader, DataFrameLoader): location = "in-memory" + elif isinstance(loader, NotionDBLoader): + location = f"notiondb://{loader.database_id}" except Exception: pass return get_full_path(str(location))