use crate::{ context::LemmyContext, post::{LinkMetadata, OpenGraphData}, utils::proxy_image_link, }; use encoding::{all::encodings, DecoderTrap}; use lemmy_db_schema::newtypes::DbUrl; use lemmy_utils::{ error::{LemmyError, LemmyErrorType}, settings::structs::{PictrsImageMode, Settings}, version::VERSION, REQWEST_TIMEOUT, }; use mime::Mime; use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder}; use reqwest_middleware::ClientWithMiddleware; use serde::Deserialize; use tracing::info; use url::Url; use urlencoding::encode; use webpage::HTML; pub fn client_builder(settings: &Settings) -> ClientBuilder { let user_agent = format!( "Lemmy/{}; +{}", VERSION, settings.get_protocol_and_hostname() ); Client::builder() .user_agent(user_agent.clone()) .timeout(REQWEST_TIMEOUT) .connect_timeout(REQWEST_TIMEOUT) } /// Fetches metadata for the given link and optionally generates thumbnail. #[tracing::instrument(skip_all)] pub async fn fetch_link_metadata( url: &Url, generate_thumbnail: bool, context: &LemmyContext, ) -> Result { info!("Fetching site metadata for url: {}", url); let response = context.client().get(url.as_str()).send().await?; let content_type: Option = response .headers() .get(CONTENT_TYPE) .and_then(|h| h.to_str().ok()) .and_then(|h| h.parse().ok()); // Can't use .text() here, because it only checks the content header, not the actual bytes // https://github.com/LemmyNet/lemmy/issues/1964 let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); let opengraph_data = extract_opengraph_data(&html_bytes, url).unwrap_or_default(); let thumbnail = extract_thumbnail_from_opengraph_data( url, &opengraph_data, &content_type, generate_thumbnail, context, ) .await; Ok(LinkMetadata { opengraph_data, content_type: content_type.map(|c| c.to_string()), thumbnail, }) } #[tracing::instrument(skip_all)] pub async fn fetch_link_metadata_opt( url: Option<&Url>, generate_thumbnail: bool, context: &LemmyContext, ) -> LinkMetadata { match &url { Some(url) => fetch_link_metadata(url, generate_thumbnail, context) .await .unwrap_or_default(), _ => Default::default(), } } /// Extract site metadata from HTML Opengraph attributes. fn extract_opengraph_data(html_bytes: &[u8], url: &Url) -> Result { let html = String::from_utf8_lossy(html_bytes); // Make sure the first line is doctype html let first_line = html .trim_start() .lines() .next() .ok_or(LemmyErrorType::NoLinesInHtml)? .to_lowercase(); if !first_line.starts_with("") { Err(LemmyErrorType::SiteMetadataPageIsNotDoctypeHtml)? } let mut page = HTML::from_string(html.to_string(), None)?; // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8 // version. if let Some(charset) = page.meta.get("charset") { if charset.to_lowercase() != "utf-8" { if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) { if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) { page = HTML::from_string(html_with_encoding, None)?; } } } } let page_title = page.title; let page_description = page.description; let og_description = page .opengraph .properties .get("description") .map(std::string::ToString::to_string); let og_title = page .opengraph .properties .get("title") .map(std::string::ToString::to_string); let og_image = page .opengraph .images .first() // join also works if the target URL is absolute .and_then(|ogo| url.join(&ogo.url).ok()); let og_embed_url = page .opengraph .videos .first() // join also works if the target URL is absolute .and_then(|v| url.join(&v.url).ok()); Ok(OpenGraphData { title: og_title.or(page_title), description: og_description.or(page_description), image: og_image.map(Into::into), embed_video_url: og_embed_url.map(Into::into), }) } #[tracing::instrument(skip_all)] pub async fn extract_thumbnail_from_opengraph_data( url: &Url, opengraph_data: &OpenGraphData, content_type: &Option, generate_thumbnail: bool, context: &LemmyContext, ) -> Option { let is_image = content_type.as_ref().unwrap_or(&mime::TEXT_PLAIN).type_() == mime::IMAGE; if generate_thumbnail && is_image { let image_url = opengraph_data .image .as_ref() .map(lemmy_db_schema::newtypes::DbUrl::inner) .unwrap_or(url); generate_pictrs_thumbnail(image_url, context) .await .ok() .map(Into::into) } else { None } } #[derive(Deserialize, Debug)] struct PictrsResponse { files: Vec, msg: String, } #[derive(Deserialize, Debug)] struct PictrsFile { file: String, #[allow(dead_code)] delete_token: String, } #[derive(Deserialize, Debug)] struct PictrsPurgeResponse { msg: String, } /// Purges an image from pictrs /// Note: This should often be coerced from a Result to .ok() in order to fail softly, because: /// - It might fail due to image being not local /// - It might not be an image /// - Pictrs might not be set up pub async fn purge_image_from_pictrs( image_url: &Url, context: &LemmyContext, ) -> Result<(), LemmyError> { is_image_content_type(context.client(), image_url).await?; let alias = image_url .path_segments() .ok_or(LemmyErrorType::ImageUrlMissingPathSegments)? .next_back() .ok_or(LemmyErrorType::ImageUrlMissingLastPathSegment)?; let pictrs_config = context.settings().pictrs_config()?; let purge_url = format!("{}internal/purge?alias={}", pictrs_config.url, alias); let pictrs_api_key = pictrs_config .api_key .ok_or(LemmyErrorType::PictrsApiKeyNotProvided)?; let response = context .client() .post(&purge_url) .timeout(REQWEST_TIMEOUT) .header("x-api-token", pictrs_api_key) .send() .await?; let response: PictrsPurgeResponse = response.json().await.map_err(LemmyError::from)?; match response.msg.as_str() { "ok" => Ok(()), _ => Err(LemmyErrorType::PictrsPurgeResponseError(response.msg))?, } } pub async fn delete_image_from_pictrs( alias: &str, delete_token: &str, context: &LemmyContext, ) -> Result<(), LemmyError> { let pictrs_config = context.settings().pictrs_config()?; let url = format!( "{}image/delete/{}/{}", pictrs_config.url, &delete_token, &alias ); context .client() .delete(&url) .timeout(REQWEST_TIMEOUT) .send() .await .map_err(LemmyError::from)?; Ok(()) } /// Retrieves the image with local pict-rs and generates a thumbnail. Returns the thumbnail url. #[tracing::instrument(skip_all)] async fn generate_pictrs_thumbnail( image_url: &Url, context: &LemmyContext, ) -> Result { let pictrs_config = context.settings().pictrs_config()?; if pictrs_config.image_mode() == PictrsImageMode::ProxyAllImages { return Ok(proxy_image_link(image_url.clone(), context).await?.into()); } // fetch remote non-pictrs images for persistent thumbnail link // TODO: should limit size once supported by pictrs let fetch_url = format!( "{}image/download?url={}", pictrs_config.url, encode(image_url.as_str()) ); let response = context .client() .get(&fetch_url) .timeout(REQWEST_TIMEOUT) .send() .await?; let response: PictrsResponse = response.json().await?; if response.msg == "ok" { let thumbnail_url = Url::parse(&format!( "{}/pictrs/image/{}", context.settings().get_protocol_and_hostname(), response.files.first().expect("missing pictrs file").file ))?; Ok(thumbnail_url) } else { Err(LemmyErrorType::PictrsResponseError(response.msg))? } } // TODO: get rid of this by reading content type from db #[tracing::instrument(skip_all)] async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> { let response = client.get(url.as_str()).send().await?; if response .headers() .get("Content-Type") .ok_or(LemmyErrorType::NoContentTypeHeader)? .to_str()? .starts_with("image/") { Ok(()) } else { Err(LemmyErrorType::NotAnImageType)? } } #[cfg(test)] mod tests { #![allow(clippy::unwrap_used)] #![allow(clippy::indexing_slicing)] use crate::{ context::LemmyContext, request::{extract_opengraph_data, fetch_link_metadata}, }; use pretty_assertions::assert_eq; use serial_test::serial; use url::Url; // These helped with testing #[tokio::test] #[serial] async fn test_link_metadata() { let context = LemmyContext::init_test_context_with_networking().await; let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap(); let sample_res = fetch_link_metadata(&sample_url, false, &context) .await .unwrap(); assert_eq!( Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()), sample_res.opengraph_data.title ); assert_eq!( Some("The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()), sample_res.opengraph_data.description ); assert_eq!( Some( Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png") .unwrap() .into() ), sample_res.opengraph_data.image ); assert_eq!(None, sample_res.opengraph_data.embed_video_url); assert_eq!( Some(mime::TEXT_HTML_UTF_8.to_string()), sample_res.content_type ); assert_eq!(None, sample_res.thumbnail); } // #[test] // fn test_pictshare() { // let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg"); // assert!(res.is_ok()); // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu"); // assert!(res_other.is_err()); // } #[test] fn test_resolve_image_url() { // url that lists the opengraph fields let url = Url::parse("https://example.com/one/two.html").unwrap(); // root relative url let html_bytes = b""; let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://example.com/image.jpg").unwrap().into()) ); // base relative url let html_bytes = b""; let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some( Url::parse("https://example.com/one/image.jpg") .unwrap() .into() ) ); // absolute url let html_bytes = b""; let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into()) ); // protocol relative url let html_bytes = b""; let metadata = extract_opengraph_data(html_bytes, &url).expect("Unable to parse metadata"); assert_eq!( metadata.image, Some(Url::parse("https://example.com/image.jpg").unwrap().into()) ); } }