Add Slack Directory Loader (#2835)

Adds a loader for Slack Exports which can be a very valuable source of
    knowledge to use for internal QA bots and other use cases.

    ```py
    # Export data from your Slack Workspace first.
    from langchain.document_loaders import SlackDirectoryLoader

    SLACK_WORKSPACE_URL = "https://awesome.slack.com"

    loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL)
    docs = loader.load()
```

---------

Co-authored-by: Mikhail Dubov <mikhail@chattermill.io>
fix_agent_callbacks
vowelparrot 1 year ago committed by GitHub
parent 4f231b46ee
commit a6f767ae7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,85 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "1dc7df1d",
"metadata": {},
"source": [
"# Slack\n",
"\n",
"This notebook covers how to load documents from a Slack export dumped locally.\n",
"\n",
"In order to get this Slack dump, follow these instructions:\n",
"\n",
"## 🧑 Instructions for ingesting your own dataset\n",
"\n",
"Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
"\n",
"The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
"\n",
"Run the following command to unzip the zip file (replace `xxx.zip` with your own file name as needed) or unzip using built-in tools.\n",
"\n",
"```shell\n",
"unzip xxx.zip -d Slack_Exports\n",
"```\n",
"\n",
"Once ready, move the directory to the directory you are running this notebook from."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "007c5cbf",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import SlackDirectoryLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1caec59",
"metadata": {},
"outputs": [],
"source": [
"# Optionally set your Slack URL. This will give you proper URLs in sources, which is very convenient.\n",
"SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
"\n",
"loader = SlackDirectoryLoader(\"Slack_Exports\", SLACK_WORKSPACE_URL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1c30ff7",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,57 @@
import json
from pathlib import Path
from typing import Dict, List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class SlackDirectoryLoader(BaseLoader):
    """Loader that loads documents from a Slack directory dump.

    Every sub-directory of the dump is treated as a channel, and every
    ``*.json`` file inside it is read as a list of message objects. Each
    message becomes one ``Document`` whose ``page_content`` is the message
    text and whose metadata holds ``source``, ``channel``, ``timestamp``
    and ``user``.
    """

    def __init__(self, path: str, workspace_url: Optional[str] = None):
        """Initialize with the dump path and an optional workspace URL.

        Args:
            path: Directory containing the unzipped Slack export.
            workspace_url: Base URL of the workspace. When given, message
                sources are rendered as links instead of plain text.
        """
        self.file_path = path
        self.workspace_url = workspace_url
        self.channel_id_map = self._get_channel_id_map()

    def _get_channel_id_map(self) -> Dict[str, str]:
        """Return a mapping of channel names to channel IDs.

        The mapping is read from ``channels.json`` at the root of the dump.
        An empty dict is returned when that file does not exist.
        """
        channels_json_path = Path(self.file_path) / "channels.json"
        if not channels_json_path.exists():
            return {}
        with open(channels_json_path, encoding="utf-8") as f:
            channels = json.load(f)
        return {channel["name"]: channel["id"] for channel in channels}

    def _get_message_source(
        self,
        channel_name: str,
        user: Optional[str],
        timestamp: Optional[str],
    ) -> str:
        """Return the ``source`` metadata value for a single message.

        A link of the form ``{workspace_url}/archives/{channel_id}/p{ts}``
        (``ts`` with its dot removed) is produced only when the workspace
        URL, the channel ID and the timestamp are all available. Otherwise
        the plain ``"channel - user - timestamp"`` description is used, so
        a message without ``ts`` or a channel missing from ``channels.json``
        no longer crashes or yields a malformed link.
        """
        channel_id = self.channel_id_map.get(channel_name, "")
        if self.workspace_url and channel_id and timestamp:
            # Drop a trailing slash so the link never contains "//archives".
            base_url = self.workspace_url.rstrip("/")
            message_id = str(timestamp).replace(".", "")
            return f"{base_url}/archives/{channel_id}/p{message_id}"
        return f"{channel_name} - {user} - {timestamp}"

    def load(self) -> List[Document]:
        """Load one document per message found in the dump.

        Returns:
            Documents for every message of every channel directory.
            Channels and their files are visited in sorted path order so
            the result does not depend on filesystem enumeration order.
        """
        docs = []
        for channel_path in sorted(Path(self.file_path).glob("*")):
            # Only directories are channels; root-level files such as
            # channels.json are skipped here.
            if not channel_path.is_dir():
                continue
            channel_name = channel_path.name
            for json_file in sorted(channel_path.glob("*.json")):
                with open(json_file, encoding="utf-8") as f:
                    messages = json.load(f)
                for message in messages:
                    timestamp = message.get("ts")
                    user = message.get("user")
                    metadata = {
                        "source": self._get_message_source(
                            channel_name, user, timestamp
                        ),
                        "channel": channel_name,
                        "timestamp": timestamp,
                        "user": user,
                    }
                    docs.append(
                        Document(
                            page_content=message.get("text", ""),
                            metadata=metadata,
                        )
                    )
        return docs
Loading…
Cancel
Save