community[minor]: Add support for non-file-based Document Loaders in PebbloSafeLoader (#19574)

**Description:**
PebbloSafeLoader: Add support for non-file-based Document Loaders

This pull request enhances PebbloSafeLoader by introducing support for
several non-file-based Document Loaders. With this update,
PebbloSafeLoader now seamlessly integrates with the following loaders:
- GoogleDriveLoader
- SlackDirectoryLoader
- UnstructuredEmailLoader

**Issue:** NA
**Dependencies:** None
**Twitter handle:** @Raj__725

---------

Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
pull/19666/head
Rajendra Kadam 3 months ago committed by GitHub
parent 9954c6a38e
commit 0019d8a948
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -125,7 +125,9 @@ class PebbloSafeLoader(BaseLoader):
doc_content = [doc.dict() for doc in self.docs]
docs = []
for doc in doc_content:
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
doc_source_path = get_full_path(
doc.get("metadata", {}).get("source", self.source_path)
)
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
doc_source_path
)

@ -29,11 +29,28 @@ file_loader = [
"AmazonTextractPDFLoader",
"CSVLoader",
"UnstructuredExcelLoader",
"UnstructuredEmailLoader",
]
dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"]
dir_loader = [
"DirectoryLoader",
"S3DirLoader",
"SlackDirectoryLoader",
"PyPDFDirectoryLoader",
"NotionDirectoryLoader",
]
in_memory = ["DataFrameLoader"]
remote_db = [
"NotionDBLoader",
"GoogleDriveLoader",
]
LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory}
LOADER_TYPE_MAPPING = {
"file": file_loader,
"dir": dir_loader,
"in-memory": in_memory,
"remote_db": remote_db,
}
SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str:
for loader_type, loaders in LOADER_TYPE_MAPPING.items():
if loader in loaders:
return loader_type
return "unknown"
return "unsupported"
def get_loader_full_path(loader: BaseLoader) -> str:
@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str:
from langchain_community.document_loaders import (
DataFrameLoader,
GCSFileLoader,
NotionDBLoader,
S3FileLoader,
)
@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str:
location = f"gc://{loader.bucket}/{loader.blob}"
elif isinstance(loader, S3FileLoader):
location = f"s3://{loader.bucket}/{loader.key}"
elif "source" in loader_dict:
location = loader_dict["source"]
if location and "channel" in loader_dict:
channel = loader_dict["channel"]
if channel:
location = f"{location}/{channel}"
elif "path" in loader_dict:
location = loader_dict["path"]
elif "file_path" in loader_dict:
location = loader_dict["file_path"]
elif "web_paths" in loader_dict:
location = loader_dict["web_paths"][0]
web_paths = loader_dict["web_paths"]
if web_paths and isinstance(web_paths, list) and len(web_paths) > 0:
location = web_paths[0]
# For in-memory types:
elif isinstance(loader, DataFrameLoader):
location = "in-memory"
elif isinstance(loader, NotionDBLoader):
location = f"notiondb://{loader.database_id}"
except Exception:
pass
return get_full_path(str(location))

Loading…
Cancel
Save