mirror of https://github.com/LemmyNet/lemmy
Persistent, performant, reliable federation queue (#3605)
* persistent activity queue * fixes * fixes * make federation workers function callable from outside * log federation instances * dead instance detection not needed here * taplo fmt * split federate bin/lib * minor fix * better logging * log * create struct to hold cancellable task for readability * use boxfuture for readability * reset submodule * fix * fix lint * swap * remove json column, use separate array columns instead * some review comments * make worker a struct for readability * minor readability * add local filter to community follower view * remove separate lemmy_federate entry point * fix remaining duration * address review comments mostly * fix lint * upgrade actitypub-fed to simpler interface * fix sql format * increase delays a bit * fixes after merge * remove selectable * fix instance selectable * add comment * start federation based on latest id at the time * rename federate process args * dead instances in one query * filter follow+report activities by local * remove synchronous federation remove activity sender queue * lint * fix federation tests by waiting for results to change * fix fed test * fix comment report * wait some more * Apply suggestions from code review Co-authored-by: SorteKanin <sortekanin@gmail.com> * fix most remaining tests * wait until private messages * fix community tests * fix community tests * move arg parse * use instance_id instead of domain in federation_queue_state table --------- Co-authored-by: Dessalines <dessalines@users.noreply.github.com> Co-authored-by: SorteKanin <sortekanin@gmail.com>pull/3949/head
parent
3b67642ec2
commit
375d9a2a3c
@ -0,0 +1,108 @@
|
||||
use crate::{
|
||||
fetcher::user_or_community::{PersonOrGroup, UserOrCommunity},
|
||||
objects::instance::ApubSite,
|
||||
protocol::objects::instance::Instance,
|
||||
};
|
||||
use activitypub_federation::{
|
||||
config::Data,
|
||||
traits::{Actor, Object},
|
||||
};
|
||||
use chrono::{DateTime, Utc};
|
||||
use lemmy_api_common::context::LemmyContext;
|
||||
use lemmy_utils::error::LemmyError;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
// todo: maybe this enum should be somewhere else?
|
||||
#[derive(Debug)]
|
||||
pub enum SiteOrCommunityOrUser {
|
||||
Site(ApubSite),
|
||||
UserOrCommunity(UserOrCommunity),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(untagged)]
|
||||
pub enum SiteOrPersonOrGroup {
|
||||
Instance(Instance),
|
||||
PersonOrGroup(PersonOrGroup),
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Object for SiteOrCommunityOrUser {
|
||||
type DataType = LemmyContext;
|
||||
type Kind = SiteOrPersonOrGroup;
|
||||
type Error = LemmyError;
|
||||
|
||||
fn last_refreshed_at(&self) -> Option<DateTime<Utc>> {
|
||||
Some(match self {
|
||||
SiteOrCommunityOrUser::Site(p) => p.last_refreshed_at,
|
||||
SiteOrCommunityOrUser::UserOrCommunity(p) => p.last_refreshed_at()?,
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn read_from_id(
|
||||
_object_id: Url,
|
||||
_data: &Data<Self::DataType>,
|
||||
) -> Result<Option<Self>, LemmyError> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn delete(self, data: &Data<Self::DataType>) -> Result<(), LemmyError> {
|
||||
match self {
|
||||
SiteOrCommunityOrUser::Site(p) => p.delete(data).await,
|
||||
SiteOrCommunityOrUser::UserOrCommunity(p) => p.delete(data).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn into_json(self, _data: &Data<Self::DataType>) -> Result<Self::Kind, LemmyError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn verify(
|
||||
apub: &Self::Kind,
|
||||
expected_domain: &Url,
|
||||
data: &Data<Self::DataType>,
|
||||
) -> Result<(), LemmyError> {
|
||||
match apub {
|
||||
SiteOrPersonOrGroup::Instance(a) => ApubSite::verify(a, expected_domain, data).await,
|
||||
SiteOrPersonOrGroup::PersonOrGroup(a) => {
|
||||
UserOrCommunity::verify(a, expected_domain, data).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn from_json(_apub: Self::Kind, _data: &Data<Self::DataType>) -> Result<Self, LemmyError> {
|
||||
unimplemented!();
|
||||
}
|
||||
}
|
||||
|
||||
impl Actor for SiteOrCommunityOrUser {
|
||||
fn id(&self) -> Url {
|
||||
match self {
|
||||
SiteOrCommunityOrUser::Site(u) => u.id(),
|
||||
SiteOrCommunityOrUser::UserOrCommunity(c) => c.id(),
|
||||
}
|
||||
}
|
||||
|
||||
fn public_key_pem(&self) -> &str {
|
||||
match self {
|
||||
SiteOrCommunityOrUser::Site(p) => p.public_key_pem(),
|
||||
SiteOrCommunityOrUser::UserOrCommunity(p) => p.public_key_pem(),
|
||||
}
|
||||
}
|
||||
|
||||
fn private_key_pem(&self) -> Option<String> {
|
||||
match self {
|
||||
SiteOrCommunityOrUser::Site(p) => p.private_key_pem(),
|
||||
SiteOrCommunityOrUser::UserOrCommunity(p) => p.private_key_pem(),
|
||||
}
|
||||
}
|
||||
|
||||
fn inbox(&self) -> Url {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
@ -0,0 +1,41 @@
|
||||
[package]
|
||||
name = "lemmy_federate"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
description.workspace = true
|
||||
license.workspace = true
|
||||
homepage.workspace = true
|
||||
documentation.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
lemmy_api_common.workspace = true
|
||||
lemmy_apub.workspace = true
|
||||
lemmy_db_schema = { workspace = true, features = ["full"] }
|
||||
lemmy_db_views_actor.workspace = true
|
||||
lemmy_utils.workspace = true
|
||||
|
||||
activitypub_federation.workspace = true
|
||||
anyhow.workspace = true
|
||||
futures.workspace = true
|
||||
chrono.workspace = true
|
||||
diesel = { workspace = true, features = ["postgres", "chrono", "serde_json"] }
|
||||
diesel-async = { workspace = true, features = ["deadpool", "postgres"] }
|
||||
once_cell.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde.workspace = true
|
||||
tokio = { workspace = true, features = ["full"] }
|
||||
tracing.workspace = true
|
||||
|
||||
async-trait = "0.1.71"
|
||||
bytes = "1.4.0"
|
||||
enum_delegate = "0.2.0"
|
||||
moka = { version = "0.11.2", features = ["future"] }
|
||||
openssl = "0.10.55"
|
||||
reqwest-middleware = "0.2.2"
|
||||
reqwest-tracing = "0.4.5"
|
||||
tokio-util = "0.7.8"
|
||||
tracing-subscriber = "0.3.17"
|
@ -0,0 +1,63 @@
|
||||
use crate::util::ActivityId;
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, TimeZone, Utc};
|
||||
use diesel::prelude::*;
|
||||
use diesel_async::RunQueryDsl;
|
||||
use lemmy_db_schema::{
|
||||
newtypes::InstanceId,
|
||||
utils::{get_conn, DbPool},
|
||||
};
|
||||
|
||||
#[derive(Queryable, Selectable, Insertable, AsChangeset, Clone)]
|
||||
#[diesel(table_name = lemmy_db_schema::schema::federation_queue_state)]
|
||||
#[diesel(check_for_backend(diesel::pg::Pg))]
|
||||
pub struct FederationQueueState {
|
||||
pub instance_id: InstanceId,
|
||||
pub last_successful_id: ActivityId, // todo: i64
|
||||
pub fail_count: i32,
|
||||
pub last_retry: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl FederationQueueState {
|
||||
/// load state or return a default empty value
|
||||
pub async fn load(
|
||||
pool: &mut DbPool<'_>,
|
||||
instance_id_: InstanceId,
|
||||
) -> Result<FederationQueueState> {
|
||||
use lemmy_db_schema::schema::federation_queue_state::dsl::{
|
||||
federation_queue_state,
|
||||
instance_id,
|
||||
};
|
||||
let conn = &mut get_conn(pool).await?;
|
||||
Ok(
|
||||
federation_queue_state
|
||||
.filter(instance_id.eq(&instance_id_))
|
||||
.select(FederationQueueState::as_select())
|
||||
.get_result(conn)
|
||||
.await
|
||||
.optional()?
|
||||
.unwrap_or(FederationQueueState {
|
||||
instance_id: instance_id_,
|
||||
fail_count: 0,
|
||||
last_retry: Utc.timestamp_nanos(0),
|
||||
last_successful_id: -1, // this value is set to the most current id for new instances
|
||||
}),
|
||||
)
|
||||
}
|
||||
pub async fn upsert(pool: &mut DbPool<'_>, state: &FederationQueueState) -> Result<()> {
|
||||
use lemmy_db_schema::schema::federation_queue_state::dsl::{
|
||||
federation_queue_state,
|
||||
instance_id,
|
||||
};
|
||||
let conn = &mut get_conn(pool).await?;
|
||||
|
||||
state
|
||||
.insert_into(federation_queue_state)
|
||||
.on_conflict(instance_id)
|
||||
.do_update()
|
||||
.set(state)
|
||||
.execute(conn)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -0,0 +1,207 @@
|
||||
use crate::{
|
||||
util::{retry_sleep_duration, CancellableTask},
|
||||
worker::InstanceWorker,
|
||||
};
|
||||
use activitypub_federation::config::FederationConfig;
|
||||
use chrono::{Local, Timelike};
|
||||
use federation_queue_state::FederationQueueState;
|
||||
use lemmy_api_common::context::LemmyContext;
|
||||
use lemmy_db_schema::{
|
||||
newtypes::InstanceId,
|
||||
source::instance::Instance,
|
||||
utils::{ActualDbPool, DbPool},
|
||||
};
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tokio::{
|
||||
sync::mpsc::{unbounded_channel, UnboundedReceiver},
|
||||
time::sleep,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
mod federation_queue_state;
|
||||
mod util;
|
||||
mod worker;
|
||||
|
||||
static WORKER_EXIT_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
#[cfg(debug_assertions)]
|
||||
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(5);
|
||||
#[cfg(not(debug_assertions))]
|
||||
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct Opts {
|
||||
/// how many processes you are starting in total
|
||||
pub process_count: i32,
|
||||
/// the index of this process (1-based: 1 - process_count)
|
||||
pub process_index: i32,
|
||||
}
|
||||
|
||||
async fn start_stop_federation_workers(
|
||||
opts: Opts,
|
||||
pool: ActualDbPool,
|
||||
federation_config: FederationConfig<LemmyContext>,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut workers = HashMap::<InstanceId, CancellableTask<_>>::new();
|
||||
|
||||
let (stats_sender, stats_receiver) = unbounded_channel();
|
||||
let exit_print = tokio::spawn(receive_print_stats(pool.clone(), stats_receiver));
|
||||
let pool2 = &mut DbPool::Pool(&pool);
|
||||
let process_index = opts.process_index - 1;
|
||||
let local_domain = federation_config.settings().get_hostname_without_port()?;
|
||||
loop {
|
||||
let mut total_count = 0;
|
||||
let mut dead_count = 0;
|
||||
let mut disallowed_count = 0;
|
||||
for (instance, allowed, is_dead) in Instance::read_all_with_blocked_and_dead(pool2).await? {
|
||||
if instance.domain == local_domain {
|
||||
continue;
|
||||
}
|
||||
if instance.id.inner() % opts.process_count != process_index {
|
||||
continue;
|
||||
}
|
||||
total_count += 1;
|
||||
if !allowed {
|
||||
disallowed_count += 1;
|
||||
}
|
||||
if is_dead {
|
||||
dead_count += 1;
|
||||
}
|
||||
let should_federate = allowed && !is_dead;
|
||||
if should_federate {
|
||||
if workers.contains_key(&instance.id) {
|
||||
if workers
|
||||
.get(&instance.id)
|
||||
.map(util::CancellableTask::has_ended)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
// task must have errored out, remove and recreated it
|
||||
let worker = workers
|
||||
.remove(&instance.id)
|
||||
.expect("just checked contains_key");
|
||||
tracing::error!(
|
||||
"worker for {} has stopped, recreating: {:?}",
|
||||
instance.domain,
|
||||
worker.cancel().await
|
||||
);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// create new worker
|
||||
let stats_sender = stats_sender.clone();
|
||||
let context = federation_config.to_request_data();
|
||||
let pool = pool.clone();
|
||||
workers.insert(
|
||||
instance.id,
|
||||
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, |stop| async move {
|
||||
InstanceWorker::init_and_loop(
|
||||
instance,
|
||||
context,
|
||||
&mut DbPool::Pool(&pool),
|
||||
stop,
|
||||
stats_sender,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}),
|
||||
);
|
||||
} else if !should_federate {
|
||||
if let Some(worker) = workers.remove(&instance.id) {
|
||||
if let Err(e) = worker.cancel().await {
|
||||
tracing::error!("error stopping worker: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let worker_count = workers.len();
|
||||
tracing::info!("Federating to {worker_count}/{total_count} instances ({dead_count} dead, {disallowed_count} disallowed)");
|
||||
tokio::select! {
|
||||
() = sleep(INSTANCES_RECHECK_DELAY) => {},
|
||||
_ = cancel.cancelled() => { break; }
|
||||
}
|
||||
}
|
||||
drop(stats_sender);
|
||||
tracing::warn!(
|
||||
"Waiting for {} workers ({:.2?} max)",
|
||||
workers.len(),
|
||||
WORKER_EXIT_TIMEOUT
|
||||
);
|
||||
// the cancel futures need to be awaited concurrently for the shutdown processes to be triggered concurrently
|
||||
futures::future::join_all(workers.into_values().map(util::CancellableTask::cancel)).await;
|
||||
exit_print.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// starts and stops federation workers depending on which instances are on db
|
||||
/// await the returned future to stop/cancel all workers gracefully
|
||||
pub fn start_stop_federation_workers_cancellable(
|
||||
opts: Opts,
|
||||
pool: ActualDbPool,
|
||||
config: FederationConfig<LemmyContext>,
|
||||
) -> CancellableTask<()> {
|
||||
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |c| {
|
||||
start_stop_federation_workers(opts, pool, config, c)
|
||||
})
|
||||
}
|
||||
|
||||
/// every 60s, print the state for every instance. exits if the receiver is done (all senders dropped)
|
||||
async fn receive_print_stats(
|
||||
pool: ActualDbPool,
|
||||
mut receiver: UnboundedReceiver<(String, FederationQueueState)>,
|
||||
) {
|
||||
let pool = &mut DbPool::Pool(&pool);
|
||||
let mut printerval = tokio::time::interval(Duration::from_secs(60));
|
||||
printerval.tick().await; // skip first
|
||||
let mut stats = HashMap::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
ele = receiver.recv() => {
|
||||
let Some((domain, ele)) = ele else {
|
||||
tracing::info!("done. quitting");
|
||||
print_stats(pool, &stats).await;
|
||||
return;
|
||||
};
|
||||
stats.insert(domain, ele);
|
||||
},
|
||||
_ = printerval.tick() => {
|
||||
print_stats(pool, &stats).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn print_stats(pool: &mut DbPool<'_>, stats: &HashMap<String, FederationQueueState>) {
|
||||
let last_id = crate::util::get_latest_activity_id(pool).await;
|
||||
let Ok(last_id) = last_id else {
|
||||
tracing::error!("could not get last id");
|
||||
return;
|
||||
};
|
||||
// it's expected that the values are a bit out of date, everything < SAVE_STATE_EVERY should be considered up to date
|
||||
tracing::info!(
|
||||
"Federation state as of {}:",
|
||||
Local::now()
|
||||
.with_nanosecond(0)
|
||||
.expect("0 is valid nanos")
|
||||
.to_rfc3339()
|
||||
);
|
||||
// todo: less noisy output (only output failing instances and summary for successful)
|
||||
// todo: more stats (act/sec, avg http req duration)
|
||||
let mut ok_count = 0;
|
||||
for (domain, stat) in stats {
|
||||
let behind = last_id - stat.last_successful_id;
|
||||
if stat.fail_count > 0 {
|
||||
tracing::info!(
|
||||
"{}: Warning. {} behind, {} consecutive fails, current retry delay {:.2?}",
|
||||
domain,
|
||||
behind,
|
||||
stat.fail_count,
|
||||
retry_sleep_duration(stat.fail_count)
|
||||
);
|
||||
} else if behind > 0 {
|
||||
tracing::info!("{}: Ok. {} behind", domain, behind);
|
||||
} else {
|
||||
ok_count += 1;
|
||||
}
|
||||
}
|
||||
tracing::info!("{ok_count} others up to date");
|
||||
}
|
@ -0,0 +1,198 @@
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use diesel::{
|
||||
prelude::*,
|
||||
sql_types::{Bool, Int8},
|
||||
};
|
||||
use diesel_async::RunQueryDsl;
|
||||
use lemmy_apub::{
|
||||
activity_lists::SharedInboxActivities,
|
||||
fetcher::{site_or_community_or_user::SiteOrCommunityOrUser, user_or_community::UserOrCommunity},
|
||||
};
|
||||
use lemmy_db_schema::{
|
||||
source::{
|
||||
activity::{ActorType, SentActivity},
|
||||
community::Community,
|
||||
person::Person,
|
||||
site::Site,
|
||||
},
|
||||
traits::ApubActor,
|
||||
utils::{get_conn, DbPool},
|
||||
};
|
||||
use moka::future::Cache;
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::Url;
|
||||
use serde_json::Value;
|
||||
use std::{
|
||||
future::Future,
|
||||
pin::Pin,
|
||||
sync::{Arc, RwLock},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::{task::JoinHandle, time::sleep};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
pub struct CancellableTask<R: Send + 'static> {
|
||||
f: Pin<Box<dyn Future<Output = Result<R, anyhow::Error>> + Send + 'static>>,
|
||||
ended: Arc<RwLock<bool>>,
|
||||
}
|
||||
|
||||
impl<R: Send + 'static> CancellableTask<R> {
|
||||
/// spawn a task but with graceful shutdown
|
||||
pub fn spawn<F>(
|
||||
timeout: Duration,
|
||||
task: impl FnOnce(CancellationToken) -> F,
|
||||
) -> CancellableTask<R>
|
||||
where
|
||||
F: Future<Output = Result<R>> + Send + 'static,
|
||||
{
|
||||
let stop = CancellationToken::new();
|
||||
let task = task(stop.clone());
|
||||
let ended = Arc::new(RwLock::new(false));
|
||||
let ended_write = ended.clone();
|
||||
let task: JoinHandle<Result<R>> = tokio::spawn(async move {
|
||||
match task.await {
|
||||
Ok(o) => Ok(o),
|
||||
Err(e) => {
|
||||
*ended_write.write().expect("poisoned") = true;
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
});
|
||||
let abort = task.abort_handle();
|
||||
CancellableTask {
|
||||
ended,
|
||||
f: Box::pin(async move {
|
||||
stop.cancel();
|
||||
tokio::select! {
|
||||
r = task => {
|
||||
Ok(r.context("could not join")??)
|
||||
},
|
||||
_ = sleep(timeout) => {
|
||||
abort.abort();
|
||||
tracing::warn!("Graceful shutdown timed out, aborting task");
|
||||
Err(anyhow!("task aborted due to timeout"))
|
||||
}
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// cancel the cancel signal, wait for timeout for the task to stop gracefully, otherwise abort it
|
||||
pub async fn cancel(self) -> Result<R, anyhow::Error> {
|
||||
self.f.await
|
||||
}
|
||||
pub fn has_ended(&self) -> bool {
|
||||
*self.ended.read().expect("poisoned")
|
||||
}
|
||||
}
|
||||
|
||||
/// assuming apub priv key and ids are immutable, then we don't need to have TTL
|
||||
/// TODO: capacity should be configurable maybe based on memory use
|
||||
pub(crate) async fn get_actor_cached(
|
||||
pool: &mut DbPool<'_>,
|
||||
actor_type: ActorType,
|
||||
actor_apub_id: &Url,
|
||||
) -> Result<Arc<SiteOrCommunityOrUser>> {
|
||||
static CACHE: Lazy<Cache<Url, Arc<SiteOrCommunityOrUser>>> =
|
||||
Lazy::new(|| Cache::builder().max_capacity(10000).build());
|
||||
CACHE
|
||||
.try_get_with(actor_apub_id.clone(), async {
|
||||
let url = actor_apub_id.clone().into();
|
||||
let person = match actor_type {
|
||||
ActorType::Site => SiteOrCommunityOrUser::Site(
|
||||
Site::read_from_apub_id(pool, &url)
|
||||
.await?
|
||||
.context("apub site not found")?
|
||||
.into(),
|
||||
),
|
||||
ActorType::Community => SiteOrCommunityOrUser::UserOrCommunity(UserOrCommunity::Community(
|
||||
Community::read_from_apub_id(pool, &url)
|
||||
.await?
|
||||
.context("apub community not found")?
|
||||
.into(),
|
||||
)),
|
||||
ActorType::Person => SiteOrCommunityOrUser::UserOrCommunity(UserOrCommunity::User(
|
||||
Person::read_from_apub_id(pool, &url)
|
||||
.await?
|
||||
.context("apub person not found")?
|
||||
.into(),
|
||||
)),
|
||||
};
|
||||
Result::<_, anyhow::Error>::Ok(Arc::new(person))
|
||||
})
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("err getting actor {actor_type:?} {actor_apub_id}: {e:?}"))
|
||||
}
|
||||
|
||||
/// this should maybe be a newtype like all the other PersonId CommunityId etc.
|
||||
pub(crate) type ActivityId = i64;
|
||||
|
||||
type CachedActivityInfo = Option<Arc<(SentActivity, SharedInboxActivities)>>;
|
||||
/// activities are immutable so cache does not need to have TTL
|
||||
/// May return None if the corresponding id does not exist or is a received activity.
|
||||
/// Holes in serials are expected behaviour in postgresql
|
||||
/// todo: cache size should probably be configurable / dependent on desired memory usage
|
||||
pub(crate) async fn get_activity_cached(
|
||||
pool: &mut DbPool<'_>,
|
||||
activity_id: ActivityId,
|
||||
) -> Result<CachedActivityInfo> {
|
||||
static ACTIVITIES: Lazy<Cache<ActivityId, CachedActivityInfo>> =
|
||||
Lazy::new(|| Cache::builder().max_capacity(10000).build());
|
||||
ACTIVITIES
|
||||
.try_get_with(activity_id, async {
|
||||
let row = SentActivity::read(pool, activity_id)
|
||||
.await
|
||||
.optional()
|
||||
.context("could not read activity")?;
|
||||
let Some(mut row) = row else {
|
||||
return anyhow::Result::<_, anyhow::Error>::Ok(None);
|
||||
};
|
||||
// swap to avoid cloning
|
||||
let mut data = Value::Null;
|
||||
std::mem::swap(&mut row.data, &mut data);
|
||||
let activity_actual: SharedInboxActivities = serde_json::from_value(data)?;
|
||||
|
||||
Ok(Some(Arc::new((row, activity_actual))))
|
||||
})
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("err getting activity: {e:?}"))
|
||||
}
|
||||
|
||||
/// return the most current activity id (with 1 second cache)
|
||||
pub(crate) async fn get_latest_activity_id(pool: &mut DbPool<'_>) -> Result<ActivityId> {
|
||||
static CACHE: Lazy<Cache<(), ActivityId>> = Lazy::new(|| {
|
||||
Cache::builder()
|
||||
.time_to_live(Duration::from_secs(1))
|
||||
.build()
|
||||
});
|
||||
CACHE
|
||||
.try_get_with((), async {
|
||||
let conn = &mut get_conn(pool).await?;
|
||||
let seq: Sequence =
|
||||
diesel::sql_query("select last_value, is_called from sent_activity_id_seq")
|
||||
.get_result(conn)
|
||||
.await?;
|
||||
let latest_id = if seq.is_called {
|
||||
seq.last_value as ActivityId
|
||||
} else {
|
||||
// if a PG sequence has never been used, last_value will actually be next_value
|
||||
(seq.last_value - 1) as ActivityId
|
||||
};
|
||||
anyhow::Result::<_, anyhow::Error>::Ok(latest_id as ActivityId)
|
||||
})
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("err getting id: {e:?}"))
|
||||
}
|
||||
|
||||
/// how long to sleep based on how many retries have already happened
|
||||
pub(crate) fn retry_sleep_duration(retry_count: i32) -> Duration {
|
||||
Duration::from_secs_f64(10.0 * 2.0_f64.powf(f64::from(retry_count)))
|
||||
}
|
||||
|
||||
#[derive(QueryableByName)]
|
||||
struct Sequence {
|
||||
#[diesel(sql_type = Int8)]
|
||||
last_value: i64, // this value is bigint for some reason even if sequence is int4
|
||||
#[diesel(sql_type = Bool)]
|
||||
is_called: bool,
|
||||
}
|
@ -0,0 +1,312 @@
|
||||
use crate::{
|
||||
federation_queue_state::FederationQueueState,
|
||||
util::{get_activity_cached, get_actor_cached, get_latest_activity_id, retry_sleep_duration},
|
||||
};
|
||||
use activitypub_federation::{activity_sending::SendActivityTask, config::Data};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, TimeZone, Utc};
|
||||
use lemmy_api_common::context::LemmyContext;
|
||||
use lemmy_apub::activity_lists::SharedInboxActivities;
|
||||
use lemmy_db_schema::{
|
||||
newtypes::{CommunityId, InstanceId},
|
||||
source::{activity::SentActivity, instance::Instance, site::Site},
|
||||
utils::DbPool,
|
||||
};
|
||||
use lemmy_db_views_actor::structs::CommunityFollowerView;
|
||||
use lemmy_utils::error::LemmyErrorExt2;
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::Url;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::{sync::mpsc::UnboundedSender, time::sleep};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
/// save state to db every n sends if there's no failures (otherwise state is saved after every attempt)
|
||||
static CHECK_SAVE_STATE_EVERY_IT: i64 = 100;
|
||||
static SAVE_STATE_EVERY_TIME: Duration = Duration::from_secs(60);
|
||||
/// recheck for new federation work every n seconds
|
||||
#[cfg(debug_assertions)]
|
||||
static WORK_FINISHED_RECHECK_DELAY: Duration = Duration::from_secs(1);
|
||||
#[cfg(not(debug_assertions))]
|
||||
static WORK_FINISHED_RECHECK_DELAY: Duration = Duration::from_secs(30);
|
||||
#[cfg(debug_assertions)]
|
||||
static FOLLOW_ADDITIONS_RECHECK_DELAY: Lazy<chrono::Duration> =
|
||||
Lazy::new(|| chrono::Duration::seconds(1));
|
||||
#[cfg(not(debug_assertions))]
|
||||
static FOLLOW_ADDITIONS_RECHECK_DELAY: Lazy<chrono::Duration> =
|
||||
Lazy::new(|| chrono::Duration::minutes(1));
|
||||
static FOLLOW_REMOVALS_RECHECK_DELAY: Lazy<chrono::Duration> =
|
||||
Lazy::new(|| chrono::Duration::hours(1));
|
||||
pub(crate) struct InstanceWorker {
|
||||
instance: Instance,
|
||||
// load site lazily because if an instance is first seen due to being on allowlist,
|
||||
// the corresponding row in `site` may not exist yet since that is only added once
|
||||
// `fetch_instance_actor_for_object` is called.
|
||||
// (this should be unlikely to be relevant outside of the federation tests)
|
||||
site_loaded: bool,
|
||||
site: Option<Site>,
|
||||
followed_communities: HashMap<CommunityId, HashSet<Url>>,
|
||||
stop: CancellationToken,
|
||||
context: Data<LemmyContext>,
|
||||
stats_sender: UnboundedSender<(String, FederationQueueState)>,
|
||||
last_full_communities_fetch: DateTime<Utc>,
|
||||
last_incremental_communities_fetch: DateTime<Utc>,
|
||||
state: FederationQueueState,
|
||||
last_state_insert: DateTime<Utc>,
|
||||
}
|
||||
|
||||
impl InstanceWorker {
|
||||
pub(crate) async fn init_and_loop(
|
||||
instance: Instance,
|
||||
context: Data<LemmyContext>,
|
||||
pool: &mut DbPool<'_>, // in theory there's a ref to the pool in context, but i couldn't get that to work wrt lifetimes
|
||||
stop: CancellationToken,
|
||||
stats_sender: UnboundedSender<(String, FederationQueueState)>,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let state = FederationQueueState::load(pool, instance.id).await?;
|
||||
let mut worker = InstanceWorker {
|
||||
instance,
|
||||
site_loaded: false,
|
||||
site: None,
|
||||
followed_communities: HashMap::new(),
|
||||
stop,
|
||||
context,
|
||||
stats_sender,
|
||||
last_full_communities_fetch: Utc.timestamp_nanos(0),
|
||||
last_incremental_communities_fetch: Utc.timestamp_nanos(0),
|
||||
state,
|
||||
last_state_insert: Utc.timestamp_nanos(0),
|
||||
};
|
||||
worker.loop_until_stopped(pool).await
|
||||
}
|
||||
/// loop fetch new activities from db and send them to the inboxes of the given instances
|
||||
/// this worker only returns if (a) there is an internal error or (b) the cancellation token is cancelled (graceful exit)
|
||||
pub(crate) async fn loop_until_stopped(
|
||||
&mut self,
|
||||
pool: &mut DbPool<'_>,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let save_state_every = chrono::Duration::from_std(SAVE_STATE_EVERY_TIME).expect("not negative");
|
||||
|
||||
self.update_communities(pool).await?;
|
||||
self.initial_fail_sleep().await?;
|
||||
while !self.stop.is_cancelled() {
|
||||
self.loop_batch(pool).await?;
|
||||
if self.stop.is_cancelled() {
|
||||
break;
|
||||
}
|
||||
if (Utc::now() - self.last_state_insert) > save_state_every {
|
||||
self.save_and_send_state(pool).await?;
|
||||
}
|
||||
self.update_communities(pool).await?;
|
||||
}
|
||||
// final update of state in db
|
||||
self.save_and_send_state(pool).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn initial_fail_sleep(&mut self) -> Result<()> {
|
||||
// before starting queue, sleep remaining duration if last request failed
|
||||
if self.state.fail_count > 0 {
|
||||
let elapsed = (Utc::now() - self.state.last_retry).to_std()?;
|
||||
let required = retry_sleep_duration(self.state.fail_count);
|
||||
if elapsed >= required {
|
||||
return Ok(());
|
||||
}
|
||||
let remaining = required - elapsed;
|
||||
tokio::select! {
|
||||
() = sleep(remaining) => {},
|
||||
() = self.stop.cancelled() => {}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
async fn loop_batch(&mut self, pool: &mut DbPool<'_>) -> Result<()> {
|
||||
let latest_id = get_latest_activity_id(pool).await?;
|
||||
if self.state.last_successful_id == -1 {
|
||||
// this is the initial creation (instance first seen) of the federation queue for this instance
|
||||
// skip all past activities:
|
||||
self.state.last_successful_id = latest_id;
|
||||
// save here to ensure it's not read as 0 again later if no activities have happened
|
||||
self.save_and_send_state(pool).await?;
|
||||
}
|
||||
let mut id = self.state.last_successful_id;
|
||||
if id == latest_id {
|
||||
// no more work to be done, wait before rechecking
|
||||
tokio::select! {
|
||||
() = sleep(WORK_FINISHED_RECHECK_DELAY) => {},
|
||||
() = self.stop.cancelled() => {}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
let mut processed_activities = 0;
|
||||
while id < latest_id
|
||||
&& processed_activities < CHECK_SAVE_STATE_EVERY_IT
|
||||
&& !self.stop.is_cancelled()
|
||||
{
|
||||
id += 1;
|
||||
processed_activities += 1;
|
||||
let Some(ele) = get_activity_cached(pool, id)
|
||||
.await
|
||||
.context("failed reading activity from db")?
|
||||
else {
|
||||
self.state.last_successful_id = id;
|
||||
continue;
|
||||
};
|
||||
if let Err(e) = self.send_retry_loop(pool, &ele.0, &ele.1).await {
|
||||
tracing::warn!(
|
||||
"sending {} errored internally, skipping activity: {:?}",
|
||||
ele.0.ap_id,
|
||||
e
|
||||
);
|
||||
}
|
||||
if self.stop.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
// send success!
|
||||
self.state.last_successful_id = id;
|
||||
self.state.fail_count = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// this function will return successfully when (a) send succeeded or (b) worker cancelled
|
||||
// and will return an error if an internal error occurred (send errors cause an infinite loop)
|
||||
async fn send_retry_loop(
|
||||
&mut self,
|
||||
pool: &mut DbPool<'_>,
|
||||
activity: &SentActivity,
|
||||
object: &SharedInboxActivities,
|
||||
) -> Result<()> {
|
||||
let inbox_urls = self
|
||||
.get_inbox_urls(pool, activity)
|
||||
.await
|
||||
.context("failed figuring out inbox urls")?;
|
||||
if inbox_urls.is_empty() {
|
||||
self.state.last_successful_id = activity.id;
|
||||
return Ok(());
|
||||
}
|
||||
let Some(actor_apub_id) = &activity.actor_apub_id else {
|
||||
return Ok(()); // activity was inserted before persistent queue was activated
|
||||
};
|
||||
let actor = get_actor_cached(pool, activity.actor_type, actor_apub_id)
|
||||
.await
|
||||
.context("failed getting actor instance (was it marked deleted / removed?)")?;
|
||||
|
||||
let inbox_urls = inbox_urls.into_iter().collect();
|
||||
let requests = SendActivityTask::prepare(object, actor.as_ref(), inbox_urls, &self.context)
|
||||
.await
|
||||
.into_anyhow()?;
|
||||
for task in requests {
|
||||
// usually only one due to shared inbox
|
||||
tracing::info!("sending out {}", task);
|
||||
while let Err(e) = task.sign_and_send(&self.context).await {
|
||||
self.state.fail_count += 1;
|
||||
self.state.last_retry = Utc::now();
|
||||
let retry_delay: Duration = retry_sleep_duration(self.state.fail_count);
|
||||
tracing::info!(
|
||||
"{}: retrying {} attempt {} with delay {retry_delay:.2?}. ({e})",
|
||||
self.instance.domain,
|
||||
activity.id,
|
||||
self.state.fail_count
|
||||
);
|
||||
self.save_and_send_state(pool).await?;
|
||||
tokio::select! {
|
||||
() = sleep(retry_delay) => {},
|
||||
() = self.stop.cancelled() => {
|
||||
// save state to db and exit
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// get inbox urls of sending the given activity to the given instance
|
||||
/// most often this will return 0 values (if instance doesn't care about the activity)
|
||||
/// or 1 value (the shared inbox)
|
||||
/// > 1 values only happens for non-lemmy software
|
||||
async fn get_inbox_urls(
|
||||
&mut self,
|
||||
pool: &mut DbPool<'_>,
|
||||
activity: &SentActivity,
|
||||
) -> Result<HashSet<Url>> {
|
||||
let mut inbox_urls: HashSet<Url> = HashSet::new();
|
||||
|
||||
if activity.send_all_instances {
|
||||
if !self.site_loaded {
|
||||
self.site = Site::read_from_instance_id(pool, self.instance.id).await?;
|
||||
self.site_loaded = true;
|
||||
}
|
||||
if let Some(site) = &self.site {
|
||||
// Nutomic: Most non-lemmy software wont have a site row. That means it cant handle these activities. So handling it like this is fine.
|
||||
inbox_urls.insert(site.inbox_url.inner().clone());
|
||||
}
|
||||
}
|
||||
if let Some(t) = &activity.send_community_followers_of {
|
||||
if let Some(urls) = self.followed_communities.get(t) {
|
||||
inbox_urls.extend(urls.iter().map(std::clone::Clone::clone));
|
||||
}
|
||||
}
|
||||
inbox_urls.extend(
|
||||
activity
|
||||
.send_inboxes
|
||||
.iter()
|
||||
.filter_map(std::option::Option::as_ref)
|
||||
.filter_map(|u| (u.domain() == Some(&self.instance.domain)).then(|| u.inner().clone())),
|
||||
);
|
||||
Ok(inbox_urls)
|
||||
}
|
||||
|
||||
async fn update_communities(&mut self, pool: &mut DbPool<'_>) -> Result<()> {
|
||||
if (Utc::now() - self.last_full_communities_fetch) > *FOLLOW_REMOVALS_RECHECK_DELAY {
|
||||
// process removals every hour
|
||||
(self.followed_communities, self.last_full_communities_fetch) = self
|
||||
.get_communities(pool, self.instance.id, self.last_full_communities_fetch)
|
||||
.await?;
|
||||
self.last_incremental_communities_fetch = self.last_full_communities_fetch;
|
||||
}
|
||||
if (Utc::now() - self.last_incremental_communities_fetch) > *FOLLOW_ADDITIONS_RECHECK_DELAY {
|
||||
// process additions every minute
|
||||
let (news, time) = self
|
||||
.get_communities(
|
||||
pool,
|
||||
self.instance.id,
|
||||
self.last_incremental_communities_fetch,
|
||||
)
|
||||
.await?;
|
||||
self.followed_communities.extend(news);
|
||||
self.last_incremental_communities_fetch = time;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// get a list of local communities with the remote inboxes on the given instance that cares about them
|
||||
async fn get_communities(
|
||||
&mut self,
|
||||
pool: &mut DbPool<'_>,
|
||||
instance_id: InstanceId,
|
||||
last_fetch: DateTime<Utc>,
|
||||
) -> Result<(HashMap<CommunityId, HashSet<Url>>, DateTime<Utc>)> {
|
||||
let new_last_fetch = Utc::now(); // update to time before fetch to ensure overlap
|
||||
Ok((
|
||||
CommunityFollowerView::get_instance_followed_community_inboxes(pool, instance_id, last_fetch)
|
||||
.await?
|
||||
.into_iter()
|
||||
.fold(HashMap::new(), |mut map, (c, u)| {
|
||||
map.entry(c).or_insert_with(HashSet::new).insert(u.into());
|
||||
map
|
||||
}),
|
||||
new_last_fetch,
|
||||
))
|
||||
}
|
||||
async fn save_and_send_state(&mut self, pool: &mut DbPool<'_>) -> Result<()> {
|
||||
self.last_state_insert = Utc::now();
|
||||
FederationQueueState::upsert(pool, &self.state).await?;
|
||||
self
|
||||
.stats_sender
|
||||
.send((self.instance.domain.clone(), self.state.clone()))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
ALTER TABLE sent_activity
|
||||
DROP COLUMN send_inboxes,
|
||||
DROP COLUMN send_community_followers_of,
|
||||
DROP COLUMN send_all_instances,
|
||||
DROP COLUMN actor_apub_id,
|
||||
DROP COLUMN actor_type;
|
||||
|
||||
DROP TYPE actor_type_enum;
|
||||
|
||||
DROP TABLE federation_queue_state;
|
||||
|
||||
DROP INDEX idx_community_follower_published;
|
||||
|
@ -0,0 +1,32 @@
|
||||
CREATE TYPE actor_type_enum AS enum (
|
||||
'site',
|
||||
'community',
|
||||
'person'
|
||||
);
|
||||
|
||||
-- actor_apub_id only null for old entries before this migration
|
||||
ALTER TABLE sent_activity
|
||||
ADD COLUMN send_inboxes text[] NOT NULL DEFAULT '{}', -- list of specific inbox urls
|
||||
ADD COLUMN send_community_followers_of integer DEFAULT NULL,
|
||||
ADD COLUMN send_all_instances boolean NOT NULL DEFAULT FALSE,
|
||||
ADD COLUMN actor_type actor_type_enum NOT NULL DEFAULT 'person',
|
||||
ADD COLUMN actor_apub_id text DEFAULT NULL;
|
||||
|
||||
ALTER TABLE sent_activity
|
||||
ALTER COLUMN send_inboxes DROP DEFAULT,
|
||||
ALTER COLUMN send_community_followers_of DROP DEFAULT,
|
||||
ALTER COLUMN send_all_instances DROP DEFAULT,
|
||||
ALTER COLUMN actor_type DROP DEFAULT,
|
||||
ALTER COLUMN actor_apub_id DROP DEFAULT;
|
||||
|
||||
CREATE TABLE federation_queue_state (
|
||||
id serial PRIMARY KEY,
|
||||
instance_id integer NOT NULL UNIQUE REFERENCES instance (id),
|
||||
last_successful_id bigint NOT NULL,
|
||||
fail_count integer NOT NULL,
|
||||
last_retry timestamptz NOT NULL
|
||||
);
|
||||
|
||||
-- for incremental fetches of followers
|
||||
CREATE INDEX idx_community_follower_published ON community_follower (published);
|
||||
|
Loading…
Reference in New Issue