From 0019d8a9485b4557a63ef9dd54c6c0a3df77f383 Mon Sep 17 00:00:00 2001
From: Rajendra Kadam <raj.725@outlook.com>
Date: Wed, 27 Mar 2024 23:09:52 +0530
Subject: [PATCH] community[minor]: Add support for non-file-based Document
 Loaders in PebbloSafeLoader (#19574)

**Description:**
PebbloSafeLoader: Add support for non-file-based Document Loaders

This pull request enhances PebbloSafeLoader by introducing support for
several non-file-based Document Loaders. With this update,
PebbloSafeLoader now seamlessly integrates with the following loaders:
- GoogleDriveLoader
- SlackDirectoryLoader
- UnstructuredEmailLoader

**Issue:** NA
**Dependencies:** None
**Twitter handle:** @Raj__725

---------

Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
---
 .../document_loaders/pebblo.py                |  4 ++-
 .../langchain_community/utilities/pebblo.py   | 36 ++++++++++++++++---
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index 0875335ac1..8b67898cf2 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -125,7 +125,9 @@ class PebbloSafeLoader(BaseLoader):
         doc_content = [doc.dict() for doc in self.docs]
         docs = []
         for doc in doc_content:
-            doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
+            doc_source_path = get_full_path(
+                doc.get("metadata", {}).get("source", self.source_path)
+            )
             doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
                 doc_source_path
             )
diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py
index a52d3c45b4..a9c5e3bdc2 100644
--- a/libs/community/langchain_community/utilities/pebblo.py
+++ b/libs/community/langchain_community/utilities/pebblo.py
@@ -29,11 +29,28 @@ file_loader = [
     "AmazonTextractPDFLoader",
     "CSVLoader",
     "UnstructuredExcelLoader",
+    "UnstructuredEmailLoader",
 ]
-dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"]
+dir_loader = [
+    "DirectoryLoader",
+    "S3DirLoader",
+    "SlackDirectoryLoader",
+    "PyPDFDirectoryLoader",
+    "NotionDirectoryLoader",
+]
+
 in_memory = ["DataFrameLoader"]
+remote_db = [
+    "NotionDBLoader",
+    "GoogleDriveLoader",
+]
 
-LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory}
+LOADER_TYPE_MAPPING = {
+    "file": file_loader,
+    "dir": dir_loader,
+    "in-memory": in_memory,
+    "remote_db": remote_db,
+}
 
 SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
 
@@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str:
     for loader_type, loaders in LOADER_TYPE_MAPPING.items():
         if loader in loaders:
             return loader_type
-    return "unknown"
+    return "unsupported"
 
 
 def get_loader_full_path(loader: BaseLoader) -> str:
@@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str:
     from langchain_community.document_loaders import (
         DataFrameLoader,
         GCSFileLoader,
+        NotionDBLoader,
         S3FileLoader,
     )
 
@@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str:
                 location = f"gc://{loader.bucket}/{loader.blob}"
             elif isinstance(loader, S3FileLoader):
                 location = f"s3://{loader.bucket}/{loader.key}"
+        elif "source" in loader_dict:
+            location = loader_dict["source"]
+            if location and "channel" in loader_dict:
+                channel = loader_dict["channel"]
+                if channel:
+                    location = f"{location}/{channel}"
         elif "path" in loader_dict:
             location = loader_dict["path"]
         elif "file_path" in loader_dict:
             location = loader_dict["file_path"]
         elif "web_paths" in loader_dict:
-            location = loader_dict["web_paths"][0]
+            web_paths = loader_dict["web_paths"]
+            if web_paths and isinstance(web_paths, list) and len(web_paths) > 0:
+                location = web_paths[0]
         # For in-memory types:
         elif isinstance(loader, DataFrameLoader):
             location = "in-memory"
+        elif isinstance(loader, NotionDBLoader):
+            location = f"notiondb://{loader.database_id}"
     except Exception:
         pass
     return get_full_path(str(location))