use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult}; use itertools::Itertools; use once_cell::sync::Lazy; use regex::{Regex, RegexBuilder, RegexSet}; use url::{ParseError, Url}; // From here: https://github.com/vector-im/element-android/blob/develop/matrix-sdk-android/src/main/java/org/matrix/android/sdk/api/MatrixPatterns.kt#L35 static VALID_MATRIX_ID_REGEX: Lazy = Lazy::new(|| { Regex::new(r"^@[A-Za-z0-9\x21-\x39\x3B-\x7F]+:[A-Za-z0-9.-]+(:[0-9]{2,5})?$") .expect("compile regex") }); // taken from https://en.wikipedia.org/wiki/UTM_parameters static CLEAN_URL_PARAMS_REGEX: Lazy = Lazy::new(|| { Regex::new(r"^utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid$") .expect("compile regex") }); const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"]; const BODY_MAX_LENGTH: usize = 10000; const POST_BODY_MAX_LENGTH: usize = 50000; const BIO_MAX_LENGTH: usize = 300; const ALT_TEXT_MAX_LENGTH: usize = 300; const SITE_NAME_MAX_LENGTH: usize = 20; const SITE_NAME_MIN_LENGTH: usize = 1; const SITE_DESCRIPTION_MAX_LENGTH: usize = 150; //Invisible unicode characters, taken from https://invisible-characters.com/ const FORBIDDEN_DISPLAY_CHARS: [char; 53] = [ '\u{0009}', '\u{00a0}', '\u{00ad}', '\u{034f}', '\u{061c}', '\u{115f}', '\u{1160}', '\u{17b4}', '\u{17b5}', '\u{180e}', '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}', '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200a}', '\u{200b}', '\u{200c}', '\u{200d}', '\u{200e}', '\u{200f}', '\u{202f}', '\u{205f}', '\u{2060}', '\u{2061}', '\u{2062}', '\u{2063}', '\u{2064}', '\u{206a}', '\u{206b}', '\u{206c}', '\u{206d}', '\u{206e}', '\u{206f}', '\u{3000}', '\u{2800}', '\u{3164}', '\u{feff}', '\u{ffa0}', '\u{1d159}', '\u{1d173}', '\u{1d174}', '\u{1d175}', '\u{1d176}', '\u{1d177}', '\u{1d178}', '\u{1d179}', '\u{1d17a}', ]; fn has_newline(name: &str) -> bool { name.contains('\n') } pub fn is_valid_actor_name(name: &str, actor_name_max_length: usize) -> LemmyResult<()> { static VALID_ACTOR_NAME_REGEX_EN: Lazy = Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_]{3,}$").expect("compile regex")); static VALID_ACTOR_NAME_REGEX_AR: Lazy = Lazy::new(|| Regex::new(r"^[\p{Arabic}0-9_]{3,}$").expect("compile regex")); static VALID_ACTOR_NAME_REGEX_RU: Lazy = Lazy::new(|| Regex::new(r"^[\p{Cyrillic}0-9_]{3,}$").expect("compile regex")); let check = name.chars().count() <= actor_name_max_length && !has_newline(name); // Only allow characters from a single alphabet per username. This avoids problems with lookalike // characters like `o` which looks identical in Latin and Cyrillic, and can be used to imitate // other users. Checks for additional alphabets can be added in the same way. let lang_check = VALID_ACTOR_NAME_REGEX_EN.is_match(name) || VALID_ACTOR_NAME_REGEX_AR.is_match(name) || VALID_ACTOR_NAME_REGEX_RU.is_match(name); if !check || !lang_check { Err(LemmyErrorType::InvalidName.into()) } else { Ok(()) } } fn has_3_permitted_display_chars(name: &str) -> bool { let mut num_non_fdc: i8 = 0; for c in name.chars() { if !FORBIDDEN_DISPLAY_CHARS.contains(&c) { num_non_fdc += 1; if num_non_fdc >= 3 { break; } } } if num_non_fdc >= 3 { return true; } false } // Can't do a regex here, reverse lookarounds not supported pub fn is_valid_display_name(name: &str, actor_name_max_length: usize) -> LemmyResult<()> { let check = !name.starts_with('@') && !name.starts_with(FORBIDDEN_DISPLAY_CHARS) && name.chars().count() <= actor_name_max_length && !has_newline(name) && has_3_permitted_display_chars(name); if !check { Err(LemmyErrorType::InvalidDisplayName.into()) } else { Ok(()) } } pub fn is_valid_matrix_id(matrix_id: &str) -> LemmyResult<()> { let check = VALID_MATRIX_ID_REGEX.is_match(matrix_id) && !has_newline(matrix_id); if !check { Err(LemmyErrorType::InvalidMatrixId.into()) } else { Ok(()) } } pub fn is_valid_post_title(title: &str) -> LemmyResult<()> { let length = title.trim().chars().count(); let check = (3..=200).contains(&length) && !has_newline(title); if !check { Err(LemmyErrorType::InvalidPostTitle.into()) } else { Ok(()) } } /// This could be post bodies, comments, or any description field pub fn is_valid_body_field(body: &Option, post: bool) -> LemmyResult<()> { if let Some(body) = body { if post { max_length_check(body, POST_BODY_MAX_LENGTH, LemmyErrorType::InvalidBodyField)?; } else { max_length_check(body, BODY_MAX_LENGTH, LemmyErrorType::InvalidBodyField)?; }; } Ok(()) } pub fn is_valid_bio_field(bio: &str) -> LemmyResult<()> { max_length_check(bio, BIO_MAX_LENGTH, LemmyErrorType::BioLengthOverflow) } pub fn is_valid_alt_text_field(alt_text: &Option) -> LemmyResult<()> { if let Some(alt_text) = alt_text { max_length_check( alt_text, ALT_TEXT_MAX_LENGTH, LemmyErrorType::AltTextLengthOverflow, ) } else { Ok(()) } } /// Checks the site name length, the limit as defined in the DB. pub fn site_name_length_check(name: &str) -> LemmyResult<()> { min_length_check(name, SITE_NAME_MIN_LENGTH, LemmyErrorType::SiteNameRequired)?; max_length_check( name, SITE_NAME_MAX_LENGTH, LemmyErrorType::SiteNameLengthOverflow, ) } /// Checks the site description length, the limit as defined in the DB. pub fn site_description_length_check(description: &str) -> LemmyResult<()> { max_length_check( description, SITE_DESCRIPTION_MAX_LENGTH, LemmyErrorType::SiteDescriptionLengthOverflow, ) } /// Check minimum and maximum length of input string. If the string is too short or too long, the /// corresponding error is returned. /// /// HTML frontends specify maximum input length using `maxlength` attribute. /// For consistency we use the same counting method (UTF-16 code units). /// https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/maxlength fn max_length_check(item: &str, max_length: usize, max_msg: LemmyErrorType) -> LemmyResult<()> { let len = item.encode_utf16().count(); if len > max_length { Err(max_msg.into()) } else { Ok(()) } } fn min_length_check(item: &str, min_length: usize, min_msg: LemmyErrorType) -> LemmyResult<()> { let len = item.encode_utf16().count(); if len < min_length { Err(min_msg.into()) } else { Ok(()) } } /// Attempts to build a regex and check it for common errors before inserting into the DB. pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult> { regex_str_opt.map_or_else( || Ok(None::), |regex_str| { if regex_str.is_empty() { // If the proposed regex is empty, return as having no regex at all; this is the same // behavior that happens downstream before the write to the database. return Ok(None::); } RegexBuilder::new(regex_str) .case_insensitive(true) .build() .with_lemmy_type(LemmyErrorType::InvalidRegex) .and_then(|regex| { // NOTE: It is difficult to know, in the universe of user-crafted regex, which ones // may match against any string text. To keep it simple, we'll match the regex // against an innocuous string - a single number - which should help catch a regex // that accidentally matches against all strings. if regex.is_match("1") { Err(LemmyErrorType::PermissiveRegex.into()) } else { Ok(Some(regex)) } }) }, ) } pub fn clean_url_params(url: &Url) -> Url { let mut url_out = url.clone(); if url.query().is_some() { let new_query = url .query_pairs() .filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(&q.0)) .map(|q| format!("{}={}", q.0, q.1)) .join("&"); url_out.set_query(Some(&new_query)); } url_out } pub fn check_site_visibility_valid( current_private_instance: bool, current_federation_enabled: bool, new_private_instance: &Option, new_federation_enabled: &Option, ) -> LemmyResult<()> { let private_instance = new_private_instance.unwrap_or(current_private_instance); let federation_enabled = new_federation_enabled.unwrap_or(current_federation_enabled); if private_instance && federation_enabled { Err(LemmyErrorType::CantEnablePrivateInstanceAndFederationTogether.into()) } else { Ok(()) } } pub fn check_url_scheme(url: &Option) -> LemmyResult<()> { if let Some(url) = url { if !ALLOWED_POST_URL_SCHEMES.contains(&url.scheme()) { Err(LemmyErrorType::InvalidUrlScheme.into()) } else { Ok(()) } } else { Ok(()) } } pub fn is_url_blocked(url: &Option, blocklist: &RegexSet) -> LemmyResult<()> { if let Some(url) = url { if blocklist.is_match(url.as_str()) { Err(LemmyErrorType::BlockedUrl)? } } Ok(()) } pub fn check_urls_are_valid(urls: &Vec) -> LemmyResult> { let mut parsed_urls = vec![]; for url in urls { let url = Url::parse(url).or_else(|e| { if e == ParseError::RelativeUrlWithoutBase { Url::parse(&format!("https://{url}")) } else { Err(e) } })?; parsed_urls.push(url.to_string()); } Ok(parsed_urls) } #[cfg(test)] #[allow(clippy::unwrap_used)] #[allow(clippy::indexing_slicing)] mod tests { use crate::{ error::LemmyErrorType, utils::validation::{ build_and_check_regex, check_site_visibility_valid, check_url_scheme, check_urls_are_valid, clean_url_params, is_url_blocked, is_valid_actor_name, is_valid_bio_field, is_valid_display_name, is_valid_matrix_id, is_valid_post_title, site_description_length_check, site_name_length_check, BIO_MAX_LENGTH, SITE_DESCRIPTION_MAX_LENGTH, SITE_NAME_MAX_LENGTH, }, }; use pretty_assertions::assert_eq; use url::Url; #[test] fn test_clean_url_params() { let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&username=randomuser&id=123").unwrap(); let cleaned = clean_url_params(&url); let expected = Url::parse("https://example.com/path/123?username=randomuser&id=123").unwrap(); assert_eq!(expected.to_string(), cleaned.to_string()); let url = Url::parse("https://example.com/path/123").unwrap(); let cleaned = clean_url_params(&url); assert_eq!(url.to_string(), cleaned.to_string()); } #[test] fn regex_checks() { assert!(is_valid_post_title("hi").is_err()); assert!(is_valid_post_title("him").is_ok()); assert!(is_valid_post_title(" him ").is_ok()); assert!(is_valid_post_title("n\n\n\n\nanother").is_err()); assert!(is_valid_post_title("hello there!\n this is a test.").is_err()); assert!(is_valid_post_title("hello there! this is a test.").is_ok()); assert!(is_valid_post_title(("12345".repeat(40) + "x").as_str()).is_err()); assert!(is_valid_post_title("12345".repeat(40).as_str()).is_ok()); assert!(is_valid_post_title((("12345".repeat(40)) + " ").as_str()).is_ok()); } #[test] fn test_valid_actor_name() { let actor_name_max_length = 20; assert!(is_valid_actor_name("Hello_98", actor_name_max_length).is_ok()); assert!(is_valid_actor_name("ten", actor_name_max_length).is_ok()); assert!(is_valid_actor_name("تجريب", actor_name_max_length).is_ok()); assert!(is_valid_actor_name("تجريب_123", actor_name_max_length).is_ok()); assert!(is_valid_actor_name("Владимир", actor_name_max_length).is_ok()); // mixed scripts assert!(is_valid_actor_name("تجريب_abc", actor_name_max_length).is_err()); assert!(is_valid_actor_name("Влад_abc", actor_name_max_length).is_err()); // dash assert!(is_valid_actor_name("Hello-98", actor_name_max_length).is_err()); // too short assert!(is_valid_actor_name("a", actor_name_max_length).is_err()); // empty assert!(is_valid_actor_name("", actor_name_max_length).is_err()); } #[test] fn test_valid_display_name() { let actor_name_max_length = 20; assert!(is_valid_display_name("hello @there", actor_name_max_length).is_ok()); assert!(is_valid_display_name("@hello there", actor_name_max_length).is_err()); assert!(is_valid_display_name("\u{200d}hello", actor_name_max_length).is_err()); assert!(is_valid_display_name( "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}Name", actor_name_max_length ) .is_ok()); assert!(is_valid_display_name("\u{2003}1\u{ffa0}2\u{200d}", actor_name_max_length).is_err()); // Make sure zero-space with an @ doesn't work assert!( is_valid_display_name(&format!("{}@my name is", '\u{200b}'), actor_name_max_length).is_err() ); } #[test] fn test_valid_post_title() { assert!(is_valid_post_title("Post Title").is_ok()); assert!(is_valid_post_title( "აშშ ითხოვს ირანს დაუყოვნებლივ გაანთავისუფლოს დაკავებული ნავთობის ტანკერი" ) .is_ok()); assert!(is_valid_post_title(" POST TITLE 😃😃😃😃😃").is_ok()); assert!(is_valid_post_title("\n \n \n \n ").is_err()); // tabs/spaces/newlines } #[test] fn test_valid_matrix_id() { assert!(is_valid_matrix_id("@dess:matrix.org").is_ok()); assert!(is_valid_matrix_id("@dess_:matrix.org").is_ok()); assert!(is_valid_matrix_id("@dess:matrix.org:443").is_ok()); assert!(is_valid_matrix_id("dess:matrix.org").is_err()); assert!(is_valid_matrix_id(" @dess:matrix.org").is_err()); assert!(is_valid_matrix_id("@dess:matrix.org t").is_err()); assert!(is_valid_matrix_id("@dess:matrix.org t").is_err()); } #[test] fn test_valid_site_name() { let valid_names = [ (0..SITE_NAME_MAX_LENGTH).map(|_| 'A').collect::(), String::from("A"), ]; let invalid_names = [ ( &(0..SITE_NAME_MAX_LENGTH + 1) .map(|_| 'A') .collect::(), LemmyErrorType::SiteNameLengthOverflow, ), (&String::new(), LemmyErrorType::SiteNameRequired), ]; valid_names.iter().for_each(|valid_name| { assert!( site_name_length_check(valid_name).is_ok(), "Expected {} of length {} to be Ok.", valid_name, valid_name.len() ) }); invalid_names .iter() .for_each(|(invalid_name, expected_err)| { let result = site_name_length_check(invalid_name); assert!(result.is_err()); assert!( result.unwrap_err().error_type.eq(&expected_err.clone()), "Testing {}, expected error {}", invalid_name, expected_err ); }); } #[test] fn test_valid_bio() { assert!(is_valid_bio_field(&(0..BIO_MAX_LENGTH).map(|_| 'A').collect::()).is_ok()); let invalid_result = is_valid_bio_field(&(0..BIO_MAX_LENGTH + 1).map(|_| 'A').collect::()); assert!( invalid_result.is_err() && invalid_result .unwrap_err() .error_type .eq(&LemmyErrorType::BioLengthOverflow) ); } #[test] fn test_valid_site_description() { assert!(site_description_length_check( &(0..SITE_DESCRIPTION_MAX_LENGTH) .map(|_| 'A') .collect::() ) .is_ok()); let invalid_result = site_description_length_check( &(0..SITE_DESCRIPTION_MAX_LENGTH + 1) .map(|_| 'A') .collect::(), ); assert!( invalid_result.is_err() && invalid_result .unwrap_err() .error_type .eq(&LemmyErrorType::SiteDescriptionLengthOverflow) ); } #[test] fn test_valid_slur_regex() { let valid_regexes = [&None, &Some(""), &Some("(foo|bar)")]; valid_regexes.iter().for_each(|regex| { let result = build_and_check_regex(regex); assert!(result.is_ok(), "Testing regex: {:?}", regex); }); } #[test] fn test_too_permissive_slur_regex() { let match_everything_regexes = [ (&Some("["), LemmyErrorType::InvalidRegex), (&Some("(foo|bar|)"), LemmyErrorType::PermissiveRegex), (&Some(".*"), LemmyErrorType::PermissiveRegex), ]; match_everything_regexes .iter() .for_each(|(regex_str, expected_err)| { let result = build_and_check_regex(regex_str); assert!(result.is_err()); assert!( result.unwrap_err().error_type.eq(&expected_err.clone()), "Testing regex {:?}, expected error {}", regex_str, expected_err ); }); } #[test] fn test_check_site_visibility_valid() { assert!(check_site_visibility_valid(true, true, &None, &None).is_err()); assert!(check_site_visibility_valid(true, false, &None, &Some(true)).is_err()); assert!(check_site_visibility_valid(false, true, &Some(true), &None).is_err()); assert!(check_site_visibility_valid(false, false, &Some(true), &Some(true)).is_err()); assert!(check_site_visibility_valid(true, false, &None, &None).is_ok()); assert!(check_site_visibility_valid(false, true, &None, &None).is_ok()); assert!(check_site_visibility_valid(false, false, &Some(true), &None).is_ok()); assert!(check_site_visibility_valid(false, false, &None, &Some(true)).is_ok()); } #[test] fn test_check_url_scheme() { assert!(check_url_scheme(&None).is_ok()); assert!(check_url_scheme(&Some(Url::parse("http://example.com").unwrap())).is_ok()); assert!(check_url_scheme(&Some(Url::parse("https://example.com").unwrap())).is_ok()); assert!(check_url_scheme(&Some(Url::parse("https://example.com").unwrap())).is_ok()); assert!(check_url_scheme(&Some(Url::parse("ftp://example.com").unwrap())).is_err()); assert!(check_url_scheme(&Some(Url::parse("javascript:void").unwrap())).is_err()); let magnet_link="magnet:?xt=urn:btih:4b390af3891e323778959d5abfff4b726510f14c&dn=Ravel%20Complete%20Piano%20Sheet%20Music%20-%20Public%20Domain&tr=udp%3A%2F%2Fopen.tracker.cl%3A1337%2Fannounce"; assert!(check_url_scheme(&Some(Url::parse(magnet_link).unwrap())).is_ok()); } #[test] fn test_url_block() { let set = regex::RegexSet::new(vec![ r"(https://)?example\.org/page/to/article", r"(https://)?example\.net/?", r"(https://)?example\.com/?", ]) .unwrap(); assert!(is_url_blocked(&Some(Url::parse("https://example.blog").unwrap()), &set).is_ok()); assert!(is_url_blocked(&Some(Url::parse("https://example.org").unwrap()), &set).is_ok()); assert!(is_url_blocked(&None, &set).is_ok()); assert!(is_url_blocked(&Some(Url::parse("https://example.com").unwrap()), &set).is_err()); } #[test] fn test_url_parsed() { assert_eq!( vec![String::from("https://example.com/")], check_urls_are_valid(&vec![String::from("example.com")]).unwrap() ); assert!(check_urls_are_valid(&vec![ String::from("example.com"), String::from("https://example.blog") ]) .is_ok()); assert!(check_urls_are_valid(&vec![String::from("https://example .com"),]).is_err()); } }