mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
ed58eeb9c5
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
113 lines
4.0 KiB
Python
113 lines
4.0 KiB
Python
import json
|
|
import zipfile
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
|
|
|
|
class SlackDirectoryLoader(BaseLoader):
    """Load from a `Slack` directory dump.

    Each message found in a ``*.json`` file that lives inside a directory of
    the zip archive becomes one ``Document``. The name of the directory that
    directly contains the file is used as the channel name.
    """

    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.

        Args:
            zip_path (str): The path to the Slack directory dump zip file.
            workspace_url (Optional[str]): The Slack workspace URL.
                Including the URL will turn sources into links.
                Defaults to None.
        """
        self.zip_path = Path(zip_path)
        self.workspace_url = workspace_url
        # Read once up front; only consulted when building message links.
        self.channel_id_map = self._get_channel_id_map(self.zip_path)

    @staticmethod
    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
        """Get a dictionary mapping channel names to their respective IDs.

        Args:
            zip_path (Path): The path to the Slack directory dump zip file.

        Returns:
            Dict[str, str]: Channel name to channel ID, as listed in the
                archive's top-level ``channels.json``. Empty when the archive
                has no such member. Entries lacking a ``name`` or ``id`` key
                are skipped.
        """
        with zipfile.ZipFile(zip_path, "r") as zip_file:
            try:
                # ZipFile.open raises KeyError when the member is absent.
                with zip_file.open("channels.json", "r") as f:
                    channels = json.load(f)
            except KeyError:
                return {}
        # Skip incomplete entries instead of letting one of them discard the
        # whole mapping.
        return {
            channel["name"]: channel["id"]
            for channel in channels
            if "name" in channel and "id" in channel
        }

    def load(self) -> List[Document]:
        """Load and return documents from the Slack directory dump.

        Returns:
            List[Document]: One document per message, in the order the files
                are listed in the archive.
        """
        docs: List[Document] = []
        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
            for channel_path in zip_file.namelist():
                channel_name = Path(channel_path).parent.name
                # Members without a parent directory name (e.g. the top-level
                # channels.json) and non-JSON members are not message files.
                if not channel_name or not channel_path.endswith(".json"):
                    continue
                docs.extend(
                    self._convert_message_to_document(message, channel_name)
                    for message in self._read_json(zip_file, channel_path)
                )
        return docs

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile.

        Args:
            zip_file (zipfile.ZipFile): The open archive to read from.
            file_path (str): The member name inside the archive.

        Returns:
            List[dict]: The parsed JSON content of the member.
        """
        with zip_file.open(file_path, "r") as f:
            return json.load(f)

    def _convert_message_to_document(
        self, message: dict, channel_name: str
    ) -> Document:
        """
        Convert a message to a Document object.

        Args:
            message (dict): A message in the form of a dictionary.
            channel_name (str): The name of the channel the message belongs to.

        Returns:
            Document: A Document object representing the message.
        """
        return Document(
            page_content=message.get("text", ""),
            metadata=self._get_message_metadata(message, channel_name),
        )

    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
        """Create and return metadata for a given message and channel.

        Args:
            message (dict): A message in the form of a dictionary.
            channel_name (str): The name of the channel the message belongs to.

        Returns:
            dict: The ``source``, ``channel``, ``timestamp`` and ``user`` of
                the message. A missing ``ts`` or ``user`` becomes ``""``.
        """
        timestamp = message.get("ts", "")
        user = message.get("user", "")
        return {
            "source": self._get_message_source(channel_name, user, timestamp),
            "channel": channel_name,
            "timestamp": timestamp,
            "user": user,
        }

    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
        """
        Get the message source as a string.

        Args:
            channel_name (str): The name of the channel the message belongs to.
            user (str): The user ID who sent the message.
            timestamp (str): The timestamp of the message.

        Returns:
            str: The message source. A link built from the workspace URL when
                one was given, otherwise ``"<channel> - <user> - <timestamp>"``.
        """
        if not self.workspace_url:
            return f"{channel_name} - {user} - {timestamp}"

        # Strip trailing slashes so a URL given as "https://x.slack.com/" does
        # not produce a "//archives" path segment.
        base_url = self.workspace_url.rstrip("/")
        # A channel missing from the ID map yields an empty ID segment.
        channel_id = self.channel_id_map.get(channel_name, "")
        # The link uses the timestamp with its dot removed, prefixed by "p".
        return f"{base_url}/archives/{channel_id}/p{timestamp.replace('.', '')}"
|