diff --git a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb index a525008e38..905a1e06d7 100644 --- a/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb +++ b/docs/docs/integrations/document_loaders/microsoft_sharepoint.ipynb @@ -21,7 +21,7 @@ "7. To find your `Tenant Name` follow the instructions at this [document](https://learn.microsoft.com/en-us/azure/active-directory-b2c/tenant-management-read-tenant-name). Once you got this, just remove `.onmicrosoft.com` from the value and hold the rest as your `Tenant Name`.\n", "8. To obtain your `Collection ID` and `Subsite ID`, you will need your **SharePoint** `site-name`. Your `SharePoint` site URL has the following format `https://.sharepoint.com/sites/`. The last part of this URL is the `site-name`.\n", "9. To Get the Site `Collection ID`, hit this URL in the browser: `https://.sharepoint.com/sites//_api/site/id` and copy the value of the `Edm.Guid` property.\n", - "10. To get the `Subsite ID` (or web ID) use: `https://.sharepoint.com//_api/web/id` and copy the value of the `Edm.Guid` property.\n", + "10. To get the `Subsite ID` (or web ID) use: `https://.sharepoint.com/sites//_api/web/id` and copy the value of the `Edm.Guid` property.\n", "11. The `SharePoint site ID` has the following format: `.sharepoint.com,,`. You can hold that value to use in the next step.\n", "12. Visit the [Graph Explorer Playground](https://developer.microsoft.com/en-us/graph/graph-explorer) to obtain your `Document Library ID`. The first step is to ensure you are logged in with the account associated with your **SharePoint** site. 
Then you need to make a request to `https://graph.microsoft.com/v1.0/sites//drive` and the response will return a payload with a field `id` that holds the ID of your `Document Library ID`.\n", "\n", @@ -65,6 +65,30 @@ "documents = loader.load()\n", "```\n", "\n", + "If you are receiving the error `Resource not found for the segment`, try using the `folder_id` instead of the folder path, which can be obtained from the [Microsoft Graph API](https://developer.microsoft.com/en-us/graph/graph-explorer).\n", + "\n", + "```python\n", + "loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", auth_with_token=True,\n", + " folder_id=\"YOUR FOLDER ID\")\n", + "documents = loader.load()\n", + "```\n", + "\n", + "If you wish to load documents from the root directory, you can omit `folder_id`, `folder_path` and `object_ids` and the loader will load the root directory.\n", + "```python\n", + "# loads documents from root directory\n", + "loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\", auth_with_token=True)\n", + "documents = loader.load()\n", + "```\n", + "\n", + "Combined with `recursive=True` you can simply load all documents from the whole SharePoint document library:\n", + "```python\n", + "# recursively loads documents from root directory and all its subfolders\n", + "loader = SharePointLoader(document_library_id=\"YOUR DOCUMENT LIBRARY ID\",\n", + " recursive=True,\n", + " auth_with_token=True)\n", + "documents = loader.load()\n", + "```\n", + "\n", "#### 📑 Loading documents from a list of Documents IDs\n", "\n", "Another possibility is to provide a list of `object_id` for each document you want to load. For that, you will need to query the [Microsoft Graph API](https://developer.microsoft.com/en-us/graph/graph-explorer) to find all the documents ID that you are interested in. 
This [link](https://learn.microsoft.com/en-us/graph/api/resources/onedrive?view=graph-rest-1.0#commonly-accessed-resources) provides a list of endpoints that will be helpful to retrieve the documents ID.\n", diff --git a/libs/community/langchain_community/document_loaders/base_o365.py b/libs/community/langchain_community/document_loaders/base_o365.py index 1400f36d8d..90dba6d29e 100644 --- a/libs/community/langchain_community/document_loaders/base_o365.py +++ b/libs/community/langchain_community/document_loaders/base_o365.py @@ -76,6 +76,8 @@ class O365BaseLoader(BaseLoader, BaseModel): """Whether to authenticate with a token or not. Defaults to False.""" chunk_size: Union[int, str] = CHUNK_SIZE """Number of bytes to retrieve from each api call to the server. int or 'auto'.""" + recursive: bool = False + """Should the loader recursively load subfolders?""" @property @abstractmethod @@ -114,6 +116,9 @@ class O365BaseLoader(BaseLoader, BaseModel): file.download(to_path=temp_dir, chunk_size=self.chunk_size) loader = FileSystemBlobLoader(path=temp_dir) yield from loader.yield_blobs() + if self.recursive: + for subfolder in folder.get_child_folders(): + yield from self._load_from_folder(subfolder) def _load_from_object_ids( self, drive: Drive, object_ids: List[str] diff --git a/libs/community/langchain_community/document_loaders/sharepoint.py b/libs/community/langchain_community/document_loaders/sharepoint.py index ff84c64305..f4d57d66d4 100644 --- a/libs/community/langchain_community/document_loaders/sharepoint.py +++ b/libs/community/langchain_community/document_loaders/sharepoint.py @@ -22,6 +22,8 @@ class SharePointLoader(O365BaseLoader): """ The path to the folder to load data from.""" object_ids: Optional[List[str]] = None """ The IDs of the objects to load data from.""" + folder_id: Optional[str] = None + """ The ID of the folder to load data from.""" @property def _file_types(self) -> Sequence[_FileType]: @@ -51,6 +53,18 @@ class 
SharePointLoader(O365BaseLoader): raise ValueError(f"There isn't a folder with path {self.folder_path}.") for blob in self._load_from_folder(target_folder): yield from blob_parser.lazy_parse(blob) + if self.folder_id: + target_folder = drive.get_item(self.folder_id) + if not isinstance(target_folder, Folder): + raise ValueError(f"There isn't a folder with id {self.folder_id}.") + for blob in self._load_from_folder(target_folder): + yield from blob_parser.lazy_parse(blob) if self.object_ids: for blob in self._load_from_object_ids(drive, self.object_ids): yield from blob_parser.lazy_parse(blob) + if not (self.folder_path or self.folder_id or self.object_ids): + target_folder = drive.get_root_folder() + if not isinstance(target_folder, Folder): + raise ValueError("Unable to fetch root folder") + for blob in self._load_from_folder(target_folder): + yield from blob_parser.lazy_parse(blob)