diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py
index ddf95bdc76..33a7c5a818 100644
--- a/libs/community/langchain_community/document_loaders/base_o365.py
+++ b/libs/community/langchain_community/document_loaders/base_o365.py
@@ -1,4 +1,5 @@
 """Base class for all loaders that uses O365 Package"""
+
 from __future__ import annotations
 
 import logging
@@ -6,8 +7,8 @@ import os
 import tempfile
 from abc import abstractmethod
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union
 
 from langchain_core.pydantic_v1 import (
     BaseModel,
@@ -108,14 +109,32 @@ class O365BaseLoader(BaseLoader, BaseModel):
         """
         file_mime_types = self._fetch_mime_types
         items = folder.get_items()
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
         with tempfile.TemporaryDirectory() as temp_dir:
             os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                        metadata_dict[file.name] = {
+                            "source": file.web_url,
+                            "mime_type": file.mime_type,
+                            "created": file.created,
+                            "modified": file.modified,
+                            "created_by": str(file.created_by),
+                            "modified_by": str(file.modified_by),
+                            "description": file.description,
+                        }
+
             loader = FileSystemBlobLoader(path=temp_dir)
-            yield from loader.yield_blobs()
+            for blob in loader.yield_blobs():
+                if not isinstance(blob.path, PurePath):
+                    raise NotImplementedError("Expected blob path to be a PurePath")
+                if blob.path:
+                    # Keys are bare file names, so look up by name, not temp path.
+                    file_metadata_ = metadata_dict.get(blob.path.name, {})
+                    blob.metadata.update(file_metadata_)
+                yield blob
         if self.recursive:
             for subfolder in folder.get_child_folders():
                 yield from self._load_from_folder(subfolder)
diff --git a/libs/community/langchain_community/document_loaders/sharepoint.py b/libs/community/langchain_community/document_loaders/sharepoint.py
index bfcc47fba1..5eb02df867 100644
--- a/libs/community/langchain_community/document_loaders/sharepoint.py
+++ b/libs/community/langchain_community/document_loaders/sharepoint.py
@@ -1,4 +1,5 @@
 """Loader that loads data from Sharepoint Document Library"""
+
 from __future__ import annotations
 
 import json
@@ -82,7 +83,9 @@ class SharePointLoader(O365BaseLoader, BaseLoader):
             if not isinstance(target_folder, Folder):
                 raise ValueError("Unable to fetch root folder")
             for blob in self._load_from_folder(target_folder):
-                yield from blob_parser.lazy_parse(blob)
+                for blob_part in blob_parser.lazy_parse(blob):
+                    blob_part.metadata.update(blob.metadata)
+                    yield blob_part
 
     def authorized_identities(self) -> List:
         data = self._fetch_access_token()