Add Slack Directory Loader (#2835)

Adds a loader for Slack Exports which can be a very valuable source of knowledge to use for internal QA bots and other use cases.

```py
# Export data from your Slack Workspace first.
from langchain.document_loaders import SlackDirectoryLoader

SLACK_WORKSPACE_URL = "https://awesome.slack.com"

loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL)
docs = loader.load()
```

---------

Co-authored-by: Mikhail Dubov <mikhail@chattermill.io>
1 year ago · a6f767ae7a
parent 4f231b46ee
commit a6f767ae7a
2 changed files with 142 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Slack\n",
+    "\n",
+    "This notebook covers how to load documents from a Slack export dumped locally.\n",
+    "\n",
+    "In order to get this Slack dump, follow these instructions:\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your own dataset\n",
+    "\n",
+    "Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
+    "\n",
+    "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
+    "\n",
+    "Run the following command to unzip the zip file (replace `xxx.zip` with your own file name as needed) or unzip using built-in tools.\n",
+    "\n",
+    "```shell\n",
+    "unzip xxx.zip -d Slack_Exports\n",
+    "```\n",
+    "\n",
+    "Once ready, move the `Slack_Exports` directory into the directory you are running this notebook from."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SlackDirectoryLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optionally set your Slack URL. This will give you proper URLs in sources, which is very convenient.\n",
+    "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
+    "\n",
+    "loader = SlackDirectoryLoader(\"Slack_Exports\", SLACK_WORKSPACE_URL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/langchain/document_loaders/slack_directory.py
+++ b/langchain/document_loaders/slack_directory.py
@ -0,0 +1,57 @@
+import json
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class SlackDirectoryLoader(BaseLoader):
+    """Loader that loads documents from Slack directory dump."""
+
+    def __init__(self, path: str, workspace_url: Optional[str] = None):
+        """Initialize with path and optional workspace URL. Including the URL will turn sources into links."""
+        self.file_path = path
+        self.workspace_url = workspace_url
+        self.channel_id_map = self._get_channel_id_map()
+
+    def _get_channel_id_map(self) -> Dict[str, str]:
+        """Get a dictionary mapping channel names to their respective IDs."""
+        channels_json_path = Path(self.file_path) / "channels.json"
+        if channels_json_path.exists():
+            with open(channels_json_path, encoding="utf-8") as f:
+                channels = json.load(f)
+            return {channel["name"]: channel["id"] for channel in channels}
+        return {}
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        channel_paths = list(Path(self.file_path).glob("*"))
+        docs = []
+        for channel_path in channel_paths:
+            if channel_path.is_dir():
+                channel_name = channel_path.name
+                json_files = list(channel_path.glob("*.json"))
+                for json_file in json_files:
+                    with open(json_file, encoding='utf-8') as f:
+                        messages = json.load(f)
+                    for message in messages:
+                        text = message.get("text", "")
+                        timestamp = message.get("ts")
+                        user = message.get("user")
+                        if self.workspace_url:
+                            channel_id = self.channel_id_map.get(
+                                channel_name, "")
+                            message_link = f"{self.workspace_url}/archives/{channel_id}/p{timestamp.replace('.', '')}"
+                            source = message_link
+                        else:
+                            source = f"{channel_name} - {user} - {timestamp}"
+                        metadata = {
+                            "source": source,
+                            "channel": channel_name,
+                            "timestamp": timestamp,
+                            "user": user,
+                        }
+                        docs.append(
+                            Document(page_content=text, metadata=metadata))
+        return docs