mirror of https://github.com/hwchase17/langchain
Add Mastodon toots loader (#5036)
# Add Mastodon toots loader. The loader works either with public toots or with Mastodon app credentials. Toot text and user info are loaded. I've also added an integration test for this new loader, as it works with public data, and a notebook with example output from a fresh run. --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> harrison/serialize-chat
parent
e173e032bc
commit
69de33e024
@ -0,0 +1,88 @@
|
||||
"""Mastodon document loader."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import mastodon
|
||||
|
||||
|
||||
def _dependable_mastodon_import() -> "mastodon":
    """Import and return the ``mastodon`` module (the Mastodon.py package).

    The import happens at call time rather than at module import time, so
    this module can be imported even when Mastodon.py is not installed.

    Returns:
        The imported ``mastodon`` module.

    Raises:
        ValueError: If Mastodon.py cannot be imported. The underlying
            ``ImportError`` is attached as ``__cause__``.
    """
    try:
        import mastodon
    except ImportError as err:
        # Keep ValueError for backward compatibility with existing callers,
        # but chain the real import failure so it is not lost in tracebacks.
        raise ValueError(
            "Mastodon.py package not found, "
            "please install it with `pip install Mastodon.py`"
        ) from err
    return mastodon
|
||||
|
||||
|
||||
class MastodonTootsLoader(BaseLoader):
    """Load toots from one or more Mastodon accounts as documents."""

    def __init__(
        self,
        mastodon_accounts: Sequence[str],
        number_toots: Optional[int] = 100,
        exclude_replies: bool = False,
        access_token: Optional[str] = None,
        api_base_url: str = "https://mastodon.social",
    ):
        """Instantiate Mastodon toots loader.

        Args:
            mastodon_accounts: The list of Mastodon accounts to query.
            number_toots: How many toots to pull for each account. If this
                exceeds what the server returns in one page, further pages
                are fetched until the count is reached or no toots remain.
                ``None`` leaves the page size to the server and fetches a
                single page.
            exclude_replies: Whether to exclude reply toots from the load.
            access_token: An access token if toots are loaded as a Mastodon
                app. Can also be specified via the environment variable
                "MASTODON_ACCESS_TOKEN".
            api_base_url: A Mastodon API base URL to talk to, if not using
                the default.
        """
        mastodon = _dependable_mastodon_import()
        # An explicit argument wins; otherwise fall back to the environment.
        access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
        self.api = mastodon.Mastodon(
            access_token=access_token, api_base_url=api_base_url
        )
        self.mastodon_accounts = mastodon_accounts
        self.number_toots = number_toots
        self.exclude_replies = exclude_replies

    def load(self) -> List[Document]:
        """Load toots into documents.

        Returns:
            One document per toot, for every configured account, in the
            order the accounts were given.
        """
        results: List[Document] = []
        for account in self.mastodon_accounts:
            user = self.api.account_lookup(account)
            toots = self._fetch_toots(user.id)
            results.extend(self._format_toots(toots, user))
        return results

    def _fetch_toots(self, user_id: Any) -> List[Dict[str, Any]]:
        """Fetch up to ``self.number_toots`` toots for one account.

        A single statuses request is capped by the server (40 per page
        according to the Mastodon API documentation), so asking for more in
        one call silently returns fewer toots. Pagination is followed until
        enough toots are collected or the account has no more.

        Args:
            user_id: The account id as returned by ``account_lookup``.

        Returns:
            The collected toots, at most ``self.number_toots`` of them when
            that limit is set.
        """
        page = self.api.account_statuses(
            user_id,
            only_media=False,
            pinned=False,
            exclude_replies=self.exclude_replies,
            exclude_reblogs=True,
            limit=self.number_toots,
        )
        wanted = self.number_toots
        if wanted is None:
            # No explicit count requested: keep the single-page behaviour.
            return list(page)

        collected: List[Dict[str, Any]] = list(page)
        # `fetch_next` returns None (or an empty page) once pages run out.
        while page and len(collected) < wanted:
            page = self.api.fetch_next(page)
            if page:
                collected.extend(page)
        return collected[:wanted]

    def _format_toots(
        self, toots: List[Dict[str, Any]], user_info: dict
    ) -> Iterable[Document]:
        """Format toots into documents.

        Adding user info, and selected toot fields into the metadata.

        Args:
            toots: Toot records; each must provide "created_at",
                "in_reply_to_id" and "content".
            user_info: The account record to attach to every document.

        Yields:
            One document per toot, with the toot content as page content.
        """
        for toot in toots:
            metadata = {
                "created_at": toot["created_at"],
                "user_info": user_info,
                # A toot is a reply exactly when it references a parent toot.
                "is_reply": toot["in_reply_to_id"] is not None,
            }
            yield Document(
                page_content=toot["content"],
                metadata=metadata,
            )
|
@ -0,0 +1,14 @@
|
||||
"""Tests for the Mastodon toots loader"""
|
||||
from langchain.document_loaders import MastodonTootsLoader
|
||||
|
||||
|
||||
def test_mastodon_toots_loader() -> None:
    """Check the Mastodon toots loader against a live, public account."""
    # Query the Mastodon CEO's account
    toots_loader = MastodonTootsLoader(
        mastodon_accounts=["@Gargron@mastodon.social"],
        number_toots=1,
    )
    documents = toots_loader.load()

    # Exactly one toot was requested, so exactly one document is expected.
    assert len(documents) == 1
    first_document = documents[0]
    assert first_document.metadata["user_info"]["id"] == 1
|
Loading…
Reference in New Issue