diff --git a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
new file mode 100644
index 00000000..bf2a5395
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
@@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Slack\n",
+    "\n",
+    "This notebook covers how to load documents from a Slack export dumped locally.\n",
+    "\n",
+    "To get this Slack export, follow these instructions:\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your own dataset\n",
+    "\n",
+    "Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
+    "\n",
+    "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
+    "\n",
+    "Run the following command to unzip the file (replace `xxx.zip` with your own file name), or unzip it using built-in tools.\n",
+    "\n",
+    "```shell\n",
+    "unzip xxx.zip -d Slack_Exports\n",
+    "```\n",
+    "\n",
+    "Once unzipped, move the resulting directory to the directory you are running this notebook from."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SlackDirectoryLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optionally set your Slack URL. This will give you proper URLs in the document sources, which is very convenient.\n",
+    "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
+    "\n",
+    "loader = SlackDirectoryLoader(\"Slack_Exports\", SLACK_WORKSPACE_URL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/slack_directory.py b/langchain/document_loaders/slack_directory.py
new file mode 100644
index 00000000..df0a1f11
--- /dev/null
+++ b/langchain/document_loaders/slack_directory.py
@@ -0,0 +1,57 @@
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class SlackDirectoryLoader(BaseLoader):
+    """Loader that loads documents from a Slack directory dump."""
+
+    def __init__(self, path: str, workspace_url: Optional[str] = None):
+        """Initialize with a path and an optional workspace URL.
+        Including the URL will turn sources into links."""
+        self.file_path = path
+        self.workspace_url = workspace_url
+        self.channel_id_map = self._get_channel_id_map()
+
+    def _get_channel_id_map(self) -> Dict[str, str]:
+        """Get a dictionary mapping channel names to their respective IDs."""
+        channels_json_path = Path(self.file_path) / "channels.json"
+        if channels_json_path.exists():
+            with open(channels_json_path, encoding="utf-8") as f:
+                channels = json.load(f)
+            return {channel["name"]: channel["id"] for channel in channels}
+        return {}
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        channel_paths = list(Path(self.file_path).glob("*"))
+        docs = []
+        for channel_path in channel_paths:
+            # Each subdirectory of the export corresponds to one channel.
+            if channel_path.is_dir():
+                channel_name = channel_path.name
+                json_files = list(channel_path.glob("*.json"))
+                for json_file in json_files:
+                    with open(json_file, encoding="utf-8") as f:
+                        messages = json.load(f)
+                    for message in messages:
+                        text = message.get("text", "")
+                        timestamp = message.get("ts")
+                        user = message.get("user")
+                        if self.workspace_url:
+                            # Build a permalink-style source for the message.
+                            channel_id = self.channel_id_map.get(channel_name, "")
+                            message_link = (
+                                f"{self.workspace_url}/archives/{channel_id}"
+                                f"/p{timestamp.replace('.', '')}"
+                            )
+                            source = message_link
+                        else:
+                            source = f"{channel_name} - {user} - {timestamp}"
+                        metadata = {
+                            "source": source,
+                            "channel": channel_name,
+                            "timestamp": timestamp,
+                            "user": user,
+                        }
+                        docs.append(Document(page_content=text, metadata=metadata))
+        return docs
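For reference, a minimal end-to-end sketch of how the pieces added above fit together, mirroring the notebook cells. The `Slack_Exports` directory name and the workspace URL are placeholders, and the snippet assumes the loader is exposed from `langchain.document_loaders` as in the notebook import.

```python
# Minimal usage sketch: assumes the Slack export was unzipped into "Slack_Exports"
# and that SlackDirectoryLoader is importable from langchain.document_loaders.
from langchain.document_loaders import SlackDirectoryLoader

# Optional; when provided, document sources become message permalinks.
SLACK_WORKSPACE_URL = "https://xxx.slack.com"

loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL)
docs = loader.load()

# Each Document holds one message's text plus channel/user/timestamp metadata.
for doc in docs[:3]:
    print(doc.metadata["channel"], doc.metadata["source"])
    print(doc.page_content)
```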