From 608bb6b1b4099fe0d7a4c3fad118a4cd65226d72 Mon Sep 17 00:00:00 2001 From: Nutomic Date: Fri, 13 Oct 2023 02:36:02 +0200 Subject: [PATCH] Convert titles for posts from Mastodon to plaintext (fixes #3828) (#4033) * Convert titles for posts from Mastodon to plaintext (fixes #3828) * Fix prettier. * Trigger build * Convert titles for posts from Mastodon to plaintext (fixes #3828) * Fix prettier. * Fix sanizize. --------- Co-authored-by: Dessalines Co-authored-by: Dessalines --- Cargo.lock | 8 +++ crates/apub/Cargo.toml | 2 + crates/apub/assets/lemmy/objects/group.json | 2 +- crates/apub/assets/mastodon/objects/page.json | 36 +++++++------- crates/apub/src/api/user_settings_backup.rs | 7 +-- crates/apub/src/objects/post.rs | 49 ++++++++++++++++--- 6 files changed, 77 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 41f132612..36bbfce8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2719,6 +2719,7 @@ dependencies = [ "enum_delegate", "futures", "html2md", + "html2text", "http", "itertools 0.11.0", "lemmy_api_common", @@ -2734,6 +2735,7 @@ dependencies = [ "serde_json", "serde_with", "serial_test", + "stringreader", "strum_macros", "task-local-extensions", "tokio", @@ -4950,6 +4952,12 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "stringreader" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913e7b03d63752f6cdd2df77da36749d82669904798fe8944b9ec3d23f159905" + [[package]] name = "strsim" version = "0.10.0" diff --git a/crates/apub/Cargo.toml b/crates/apub/Cargo.toml index ebdf3245e..748fe3335 100644 --- a/crates/apub/Cargo.toml +++ b/crates/apub/Cargo.toml @@ -38,6 +38,8 @@ anyhow = { workspace = true } reqwest = { workspace = true } once_cell = { workspace = true } html2md = "0.2.14" +html2text = "0.6.0" +stringreader = "0.1.1" serde_with = { workspace = true } enum_delegate = "0.2.0" moka = { version = "0.11", features = ["future"] } diff --git 
a/crates/apub/assets/lemmy/objects/group.json b/crates/apub/assets/lemmy/objects/group.json index 3870daf75..1b848a866 100644 --- a/crates/apub/assets/lemmy/objects/group.json +++ b/crates/apub/assets/lemmy/objects/group.json @@ -1,7 +1,7 @@ { "id": "https://enterprise.lemmy.ml/c/tenforward", "type": "Group", - "preferredUsername": "main", + "preferredUsername": "tenforward", "name": "Ten Forward", "summary": "

Lounge and recreation facility

\n
\n

Welcome to the Enterprise!.

\n", "source": { diff --git a/crates/apub/assets/mastodon/objects/page.json b/crates/apub/assets/mastodon/objects/page.json index d2195dbf3..ec4c13080 100644 --- a/crates/apub/assets/mastodon/objects/page.json +++ b/crates/apub/assets/mastodon/objects/page.json @@ -11,40 +11,42 @@ "votersCount": "toot:votersCount" } ], - "id": "https://mastodon.madrid/users/felix/statuses/107224289116410645", + "id": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519", "type": "Note", "summary": null, - "published": "2021-11-05T11:46:50Z", - "url": "https://mastodon.madrid/@felix/107224289116410645", - "attributedTo": "https://mastodon.madrid/users/felix", - "to": ["https://mastodon.madrid/users/felix/followers"], + "inReplyTo": null, + "published": "2023-08-04T09:55:39Z", + "url": "https://dice.camp/@thekernelinyellow/110830743680706519", + "attributedTo": "https://dice.camp/users/thekernelinyellow", + "to": ["https://www.w3.org/ns/activitystreams#Public"], "cc": [ - "https://www.w3.org/ns/activitystreams#Public", - "https://mamot.fr/users/retiolus" + "https://dice.camp/users/thekernelinyellow/followers", + "https://enterprise.lemmy.ml/c/tenforward", + "https://enterprise.lemmy.ml/c/tenforward/followers" ], "sensitive": false, - "atomUri": "https://mastodon.madrid/users/felix/statuses/107224289116410645", - "inReplyToAtomUri": "https://mamot.fr/users/retiolus/statuses/107224244380204526", - "conversation": "tag:mamot.fr,2021-11-05:objectId=64635960:objectType=Conversation", - "content": "

@retiolus i have never been disappointed by a thinkpad. if you want to save money, get a model from a few years ago, there isnt a huge difference anyway.

", + "atomUri": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519", + "inReplyToAtomUri": null, + "conversation": "tag:dice.camp,2023-08-04:objectId=29969291:objectType=Conversation", + "content": "

@tenforward Variable never resetting at refresh

Hi! I'm using a variable to count elements in my generator but every time I generate a new character, the counter's value carries on from the previous one. Is there a function to reset it (I set it to 0 at the beginning of the file)

", "contentMap": { - "en": "

@retiolus i have never been disappointed by a thinkpad. if you want to save money, get a model from a few years ago, there isnt a huge difference anyway.

" + "it": "

@tenforward Variable never resetting at refresh

Hi! I'm using a variable to count elements in my generator but every time I generate a new character, the counter's value carries on from the previous one. Is there a function to reset it (I set it to 0 at the beginning of the file)

" }, "attachment": [], "tag": [ { "type": "Mention", - "href": "https://mamot.fr/users/retiolus", - "name": "@retiolus@mamot.fr" + "href": "https://enterprise.lemmy.ml/c/tenforward", + "name": "@tenforward@enterprise.lemmy.ml" } ], "replies": { - "id": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies", + "id": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies", "type": "Collection", "first": { "type": "CollectionPage", - "next": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies?only_other_accounts=true&page=true", - "partOf": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies", + "next": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies?only_other_accounts=true&page=true", + "partOf": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies", "items": [] } } diff --git a/crates/apub/src/api/user_settings_backup.rs b/crates/apub/src/api/user_settings_backup.rs index 0349171a9..c1102791d 100644 --- a/crates/apub/src/api/user_settings_backup.rs +++ b/crates/apub/src/api/user_settings_backup.rs @@ -7,7 +7,7 @@ use crate::objects::{ use activitypub_federation::{config::Data, fetch::object_id::ObjectId}; use actix_web::web::Json; use futures::{future::try_join_all, StreamExt}; -use lemmy_api_common::{context::LemmyContext, utils::sanitize_html_api_opt, SuccessResponse}; +use lemmy_api_common::{context::LemmyContext, SuccessResponse}; use lemmy_db_schema::{ newtypes::DbUrl, source::{ @@ -20,6 +20,7 @@ use lemmy_db_schema::{ post::{PostSaved, PostSavedForm}, }, traits::{Blockable, Crud, Followable, Saveable}, + utils::diesel_option_overwrite, }; use lemmy_db_views::structs::LocalUserView; use lemmy_utils::{ @@ -96,8 +97,8 @@ pub async fn import_settings( local_user_view: LocalUserView, context: Data, ) -> Result, LemmyError> { - let display_name = Some(sanitize_html_api_opt(&data.display_name)); - let bio = 
Some(sanitize_html_api_opt(&data.bio)); + let display_name = diesel_option_overwrite(data.display_name.clone()); + let bio = diesel_option_overwrite(data.bio.clone()); let person_form = PersonUpdateForm { display_name, diff --git a/crates/apub/src/objects/post.rs b/crates/apub/src/objects/post.rs index c25f988dc..4aa398bc2 100644 --- a/crates/apub/src/objects/post.rs +++ b/crates/apub/src/objects/post.rs @@ -21,7 +21,7 @@ use activitypub_federation::{ }; use anyhow::anyhow; use chrono::{DateTime, Utc}; -use html2md::parse_html; +use html2text::{from_read_with_decorator, render::text_renderer::TrivialDecorator}; use lemmy_api_common::{ context::LemmyContext, request::fetch_site_data, @@ -48,6 +48,7 @@ use lemmy_utils::{ }, }; use std::ops::Deref; +use stringreader::StringReader; use url::Url; const MAX_TITLE_LENGTH: usize = 200; @@ -171,11 +172,21 @@ impl Object for ApubPost { .name .clone() .or_else(|| { + // Posts coming from Mastodon or similar platforms don't have a title. Instead we take the + // first line of the content and convert it from HTML to plaintext. We also remove mentions + // of the community name. 
page .content - .clone() - .as_ref() - .and_then(|c| parse_html(c).lines().next().map(ToString::to_string)) + .as_deref() + .map(StringReader::new) + .map(|c| from_read_with_decorator(c, MAX_TITLE_LENGTH, TrivialDecorator::new())) + .and_then(|c| { + c.lines().next().map(|s| { + s.replace(&format!("@{}", community.name), "") + .trim() + .to_string() + }) + }) }) .ok_or_else(|| anyhow!("Object must have name or content"))?; if name.chars().count() > MAX_TITLE_LENGTH { @@ -288,8 +299,9 @@ mod tests { use super::*; use crate::{ objects::{ - community::tests::parse_lemmy_community, - person::tests::parse_lemmy_person, + community::{tests::parse_lemmy_community, ApubCommunity}, + instance::ApubSite, + person::{tests::parse_lemmy_person, ApubPerson}, post::ApubPost, tests::init_context, }, @@ -318,6 +330,31 @@ mod tests { assert!(!post.featured_community); assert_eq!(context.request_count(), 0); + cleanup(&context, person, site, community, post).await; + } + + #[tokio::test] + #[serial] + async fn test_convert_mastodon_post_title() { + let context = init_context().await; + let (person, site) = parse_lemmy_person(&context).await; + let community = parse_lemmy_community(&context).await; + + let json = file_to_json_object("assets/mastodon/objects/page.json").unwrap(); + let post = ApubPost::from_json(json, &context).await.unwrap(); + + assert_eq!(post.name, "Variable never resetting at refresh"); + + cleanup(&context, person, site, community, post).await; + } + + async fn cleanup( + context: &Data, + person: ApubPerson, + site: ApubSite, + community: ApubCommunity, + post: ApubPost, + ) { Post::delete(&mut context.pool(), post.id).await.unwrap(); Person::delete(&mut context.pool(), person.id) .await