forked from Archives/langchain
Add Slack Directory Loader (#2835)
Adds a loader for Slack Exports which can be a very valuable source of knowledge to use for internal QA bots and other use cases. ```py # Export data from your Slack Workspace first. from langchain.document_loaders import SlackDirectoryLoader SLACK_WORKSPACE_URL = "https://awesome.slack.com" loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL) docs = loader.load() ``` --------- Co-authored-by: Mikhail Dubov <mikhail@chattermill.io>
This commit is contained in:
parent
4f231b46ee
commit
a6f767ae7a
@ -0,0 +1,85 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1dc7df1d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Slack\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook covers how to load documents from a Slack export dumped locally.\n",
|
||||||
|
"\n",
|
||||||
|
"In order to get this Slack dump, follow these instructions:\n",
|
||||||
|
"\n",
|
||||||
|
"## 🧑 Instructions for ingesting your own dataset\n",
|
||||||
|
"\n",
|
||||||
|
"Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
|
||||||
|
"\n",
|
||||||
|
"The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
|
||||||
|
"\n",
|
||||||
|
"Run the following command to unzip the zip file (replace the `Export...` with your own file name as needed) or unzip using built-in tools.\n",
|
||||||
|
"\n",
|
||||||
|
"```shell\n",
|
||||||
|
"unzip xxx.zip -d Slack_Exports\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Once ready, move the directory to the directory you are running this notebook from."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "007c5cbf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import SLackDirectoryLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a1caec59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Optionally set your Slack URL. This will give you proper URLs in sources which is very convernient.\n",
|
||||||
|
"SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
|
||||||
|
"\n",
|
||||||
|
"loader = (\"Slack_Exports\", SLACK_WORKSPACE_URL)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1c30ff7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
57
langchain/document_loaders/slack_directory.py
Normal file
57
langchain/document_loaders/slack_directory.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class SlackDirectoryLoader(BaseLoader):
    """Loader that loads documents from a Slack directory dump.

    The dump is the unzipped export produced by Slack's Workspace
    Management "Import/Export" page: a directory containing one
    sub-directory per channel (each holding ``*.json`` message files)
    and, optionally, a top-level ``channels.json`` with channel metadata.
    """

    def __init__(self, path: str, workspace_url: Optional[str] = None):
        """Initialize with the export path and an optional workspace URL.

        Args:
            path: Path to the unzipped Slack export directory.
            workspace_url: Optional workspace URL (e.g.
                ``https://xxx.slack.com``). When given, each document's
                ``source`` metadata becomes a permalink into the
                workspace instead of a plain
                ``"channel - user - timestamp"`` string.
        """
        self.file_path = path
        self.workspace_url = workspace_url
        self.channel_id_map = self._get_channel_id_map()

    def _get_channel_id_map(self) -> Dict[str, str]:
        """Return a mapping of channel name -> channel ID.

        Reads ``channels.json`` from the export root; returns an empty
        mapping when that file is absent (permalinks then fall back to
        an empty channel ID).
        """
        channels_json_path = Path(self.file_path) / "channels.json"
        if not channels_json_path.exists():
            return {}
        with open(channels_json_path, encoding="utf-8") as f:
            channels = json.load(f)
        return {channel["name"]: channel["id"] for channel in channels}

    def _get_message_source(self, channel_name: str, message: dict) -> str:
        """Build the ``source`` metadata value for a single message.

        Falls back to the plain ``"channel - user - ts"`` form when no
        workspace URL is configured or the message has no ``"ts"`` field
        (the original code crashed with AttributeError in that case).
        """
        timestamp = message.get("ts")
        if self.workspace_url and timestamp is not None:
            # Slack permalinks use the "ts" value with the dot removed,
            # prefixed with "p": <url>/archives/<channel_id>/p<ts-no-dot>
            channel_id = self.channel_id_map.get(channel_name, "")
            return (
                f"{self.workspace_url}/archives/{channel_id}"
                f"/p{timestamp.replace('.', '')}"
            )
        user = message.get("user")
        return f"{channel_name} - {user} - {timestamp}"

    def load(self) -> List[Document]:
        """Load every message from every channel directory as a Document."""
        docs = []
        for channel_path in Path(self.file_path).glob("*"):
            # Channels are directories; skip stray files such as
            # channels.json at the export root.
            if not channel_path.is_dir():
                continue
            channel_name = channel_path.name
            for json_file in channel_path.glob("*.json"):
                with open(json_file, encoding="utf-8") as f:
                    messages = json.load(f)
                for message in messages:
                    metadata = {
                        "source": self._get_message_source(
                            channel_name, message
                        ),
                        "channel": channel_name,
                        "timestamp": message.get("ts"),
                        "user": message.get("user"),
                    }
                    docs.append(
                        Document(
                            page_content=message.get("text", ""),
                            metadata=metadata,
                        )
                    )
        return docs
|
Loading…
Reference in New Issue
Block a user