lemmy/crates/utils/src/utils/validation.rs

use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
use itertools::Itertools;
use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder, RegexSet};
use url::{ParseError, Url};

// From here: https://github.com/vector-im/element-android/blob/develop/matrix-sdk-android/src/main/java/org/matrix/android/sdk/api/MatrixPatterns.kt#L35
static VALID_MATRIX_ID_REGEX: Lazy<Regex> = Lazy::new(|| {
  Regex::new(r"^@[A-Za-z0-9\x21-\x39\x3B-\x7F]+:[A-Za-z0-9.-]+(:[0-9]{2,5})?$")
    .expect("compile regex")
});
// taken from https://en.wikipedia.org/wiki/UTM_parameters
static CLEAN_URL_PARAMS_REGEX: Lazy<Regex> = Lazy::new(|| {
  Regex::new(r"^utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid$")
    .expect("compile regex")
});
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];

const BODY_MAX_LENGTH: usize = 10000;
const POST_BODY_MAX_LENGTH: usize = 50000;
const BIO_MAX_LENGTH: usize = 300;
const ALT_TEXT_MAX_LENGTH: usize = 300;
const SITE_NAME_MAX_LENGTH: usize = 20;
const SITE_NAME_MIN_LENGTH: usize = 1;
const SITE_DESCRIPTION_MAX_LENGTH: usize = 150;
//Invisible unicode characters, taken from https://invisible-characters.com/
const FORBIDDEN_DISPLAY_CHARS: [char; 53] = [
  '\u{0009}',
  '\u{00a0}',
  '\u{00ad}',
  '\u{034f}',
  '\u{061c}',
  '\u{115f}',
  '\u{1160}',
  '\u{17b4}',
  '\u{17b5}',
  '\u{180e}',
  '\u{2000}',
  '\u{2001}',
  '\u{2002}',
  '\u{2003}',
  '\u{2004}',
  '\u{2005}',
  '\u{2006}',
  '\u{2007}',
  '\u{2008}',
  '\u{2009}',
  '\u{200a}',
  '\u{200b}',
  '\u{200c}',
  '\u{200d}',
  '\u{200e}',
  '\u{200f}',
  '\u{202f}',
  '\u{205f}',
  '\u{2060}',
  '\u{2061}',
  '\u{2062}',
  '\u{2063}',
  '\u{2064}',
  '\u{206a}',
  '\u{206b}',
  '\u{206c}',
  '\u{206d}',
  '\u{206e}',
  '\u{206f}',
  '\u{3000}',
  '\u{2800}',
  '\u{3164}',
  '\u{feff}',
  '\u{ffa0}',
  '\u{1d159}',
  '\u{1d173}',
  '\u{1d174}',
  '\u{1d175}',
  '\u{1d176}',
  '\u{1d177}',
  '\u{1d178}',
  '\u{1d179}',
  '\u{1d17a}',
];

fn has_newline(name: &str) -> bool {
  name.contains('\n')
}

pub fn is_valid_actor_name(name: &str, actor_name_max_length: usize) -> LemmyResult<()> {
  static VALID_ACTOR_NAME_REGEX_EN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^[a-zA-Z0-9_]{3,}$").expect("compile regex"));
  static VALID_ACTOR_NAME_REGEX_AR: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^[\p{Arabic}0-9_]{3,}$").expect("compile regex"));
  static VALID_ACTOR_NAME_REGEX_RU: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^[\p{Cyrillic}0-9_]{3,}$").expect("compile regex"));

  let check = name.chars().count() <= actor_name_max_length && !has_newline(name);

  // Only allow characters from a single alphabet per username. This avoids problems with lookalike
  // characters like `o` which looks identical in Latin and Cyrillic, and can be used to imitate
  // other users. Checks for additional alphabets can be added in the same way.
  let lang_check = VALID_ACTOR_NAME_REGEX_EN.is_match(name)
    || VALID_ACTOR_NAME_REGEX_AR.is_match(name)
    || VALID_ACTOR_NAME_REGEX_RU.is_match(name);

  if !check || !lang_check {
    Err(LemmyErrorType::InvalidName.into())
  } else {
    Ok(())
  }
}

fn has_3_permitted_display_chars(name: &str) -> bool {
  let mut num_non_fdc: i8 = 0;
  for c in name.chars() {
    if !FORBIDDEN_DISPLAY_CHARS.contains(&c) {
      num_non_fdc += 1;
      if num_non_fdc >= 3 {
        break;
      }
    }
  }
  if num_non_fdc >= 3 {
    return true;
  }
  false
}

// Can't do a regex here, reverse lookarounds not supported
pub fn is_valid_display_name(name: &str, actor_name_max_length: usize) -> LemmyResult<()> {
  let check = !name.starts_with('@')
    && !name.starts_with(FORBIDDEN_DISPLAY_CHARS)
    && name.chars().count() <= actor_name_max_length
    && !has_newline(name)
    && has_3_permitted_display_chars(name);
  if !check {
    Err(LemmyErrorType::InvalidDisplayName.into())
  } else {
    Ok(())
  }
}

pub fn is_valid_matrix_id(matrix_id: &str) -> LemmyResult<()> {
  let check = VALID_MATRIX_ID_REGEX.is_match(matrix_id) && !has_newline(matrix_id);
  if !check {
    Err(LemmyErrorType::InvalidMatrixId.into())
  } else {
    Ok(())
  }
}

pub fn is_valid_post_title(title: &str) -> LemmyResult<()> {
  let length = title.trim().chars().count();
  let check = (3..=200).contains(&length) && !has_newline(title);
  if !check {
    Err(LemmyErrorType::InvalidPostTitle.into())
  } else {
    Ok(())
  }
}

/// This could be post bodies, comments, or any description field
pub fn is_valid_body_field(body: &Option<String>, post: bool) -> LemmyResult<()> {
  if let Some(body) = body {
    if post {
      max_length_check(body, POST_BODY_MAX_LENGTH, LemmyErrorType::InvalidBodyField)?;
    } else {
      max_length_check(body, BODY_MAX_LENGTH, LemmyErrorType::InvalidBodyField)?;
    };
  }
  Ok(())
}

pub fn is_valid_bio_field(bio: &str) -> LemmyResult<()> {
  max_length_check(bio, BIO_MAX_LENGTH, LemmyErrorType::BioLengthOverflow)
}

pub fn is_valid_alt_text_field(alt_text: &Option<String>) -> LemmyResult<()> {
  if let Some(alt_text) = alt_text {
    max_length_check(
      alt_text,
      ALT_TEXT_MAX_LENGTH,
      LemmyErrorType::AltTextLengthOverflow,
    )
  } else {
    Ok(())
  }
}

/// Checks the site name length, the limit as defined in the DB.
pub fn site_name_length_check(name: &str) -> LemmyResult<()> {
  min_length_check(name, SITE_NAME_MIN_LENGTH, LemmyErrorType::SiteNameRequired)?;
  max_length_check(
    name,
    SITE_NAME_MAX_LENGTH,
    LemmyErrorType::SiteNameLengthOverflow,
  )
}

/// Checks the site description length, the limit as defined in the DB.
pub fn site_description_length_check(description: &str) -> LemmyResult<()> {
  max_length_check(
    description,
    SITE_DESCRIPTION_MAX_LENGTH,
    LemmyErrorType::SiteDescriptionLengthOverflow,
  )
}

/// Check minimum and maximum length of input string. If the string is too short or too long, the
/// corresponding error is returned.
///
/// HTML frontends specify maximum input length using `maxlength` attribute.
/// For consistency we use the same counting method (UTF-16 code units).
/// https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/maxlength
fn max_length_check(item: &str, max_length: usize, max_msg: LemmyErrorType) -> LemmyResult<()> {
  let len = item.encode_utf16().count();
  if len > max_length {
    Err(max_msg.into())
  } else {
    Ok(())
  }
}

fn min_length_check(item: &str, min_length: usize, min_msg: LemmyErrorType) -> LemmyResult<()> {
  let len = item.encode_utf16().count();
  if len < min_length {
    Err(min_msg.into())
  } else {
    Ok(())
  }
}

/// Attempts to build a regex and check it for common errors before inserting into the DB.
pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult<Option<Regex>> {
  regex_str_opt.map_or_else(
    || Ok(None::<Regex>),
    |regex_str| {
      if regex_str.is_empty() {
        // If the proposed regex is empty, return as having no regex at all; this is the same
        // behavior that happens downstream before the write to the database.
        return Ok(None::<Regex>);
      }

      RegexBuilder::new(regex_str)
        .case_insensitive(true)
        .build()
        .with_lemmy_type(LemmyErrorType::InvalidRegex)
        .and_then(|regex| {
          // NOTE: It is difficult to know, in the universe of user-crafted regex, which ones
          // may match against any string text. To keep it simple, we'll match the regex
          // against an innocuous string - a single number - which should help catch a regex
          // that accidentally matches against all strings.
          if regex.is_match("1") {
            Err(LemmyErrorType::PermissiveRegex.into())
          } else {
            Ok(Some(regex))
          }
        })
    },
  )
}

pub fn clean_url_params(url: &Url) -> Url {
  let mut url_out = url.clone();
  if url.query().is_some() {
    let new_query = url
      .query_pairs()
      .filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(&q.0))
      .map(|q| format!("{}={}", q.0, q.1))
      .join("&");
    url_out.set_query(Some(&new_query));
  }
  url_out
}

pub fn check_site_visibility_valid(
  current_private_instance: bool,
  current_federation_enabled: bool,
  new_private_instance: &Option<bool>,
  new_federation_enabled: &Option<bool>,
) -> LemmyResult<()> {
  let private_instance = new_private_instance.unwrap_or(current_private_instance);
  let federation_enabled = new_federation_enabled.unwrap_or(current_federation_enabled);

  if private_instance && federation_enabled {
    Err(LemmyErrorType::CantEnablePrivateInstanceAndFederationTogether.into())
  } else {
    Ok(())
  }
}

pub fn check_url_scheme(url: &Option<Url>) -> LemmyResult<()> {
  if let Some(url) = url {
    if !ALLOWED_POST_URL_SCHEMES.contains(&url.scheme()) {
      Err(LemmyErrorType::InvalidUrlScheme.into())
    } else {
      Ok(())
    }
  } else {
    Ok(())
  }
}

pub fn is_url_blocked(url: &Option<Url>, blocklist: &RegexSet) -> LemmyResult<()> {
  if let Some(url) = url {
    if blocklist.is_match(url.as_str()) {
      Err(LemmyErrorType::BlockedUrl)?
    }
  }

  Ok(())
}

pub fn check_urls_are_valid(urls: &Vec<String>) -> LemmyResult<Vec<String>> {
  let mut parsed_urls = vec![];
  for url in urls {
    let url = Url::parse(url).or_else(|e| {
      if e == ParseError::RelativeUrlWithoutBase {
        Url::parse(&format!("https://{url}"))
      } else {
        Err(e)
      }
    })?;

    parsed_urls.push(url.to_string());
  }

  Ok(parsed_urls)
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::indexing_slicing)]
mod tests {

  use crate::{
    error::LemmyErrorType,
    utils::validation::{
      build_and_check_regex,
      check_site_visibility_valid,
      check_url_scheme,
      check_urls_are_valid,
      clean_url_params,
      is_url_blocked,
      is_valid_actor_name,
      is_valid_bio_field,
      is_valid_display_name,
      is_valid_matrix_id,
      is_valid_post_title,
      site_description_length_check,
      site_name_length_check,
      BIO_MAX_LENGTH,
      SITE_DESCRIPTION_MAX_LENGTH,
      SITE_NAME_MAX_LENGTH,
    },
  };
  use pretty_assertions::assert_eq;
  use url::Url;

  #[test]
  fn test_clean_url_params() {
    let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&username=randomuser&id=123").unwrap();
    let cleaned = clean_url_params(&url);
    let expected = Url::parse("https://example.com/path/123?username=randomuser&id=123").unwrap();
    assert_eq!(expected.to_string(), cleaned.to_string());

    let url = Url::parse("https://example.com/path/123").unwrap();
    let cleaned = clean_url_params(&url);
    assert_eq!(url.to_string(), cleaned.to_string());
  }

  #[test]
  fn regex_checks() {
    assert!(is_valid_post_title("hi").is_err());
    assert!(is_valid_post_title("him").is_ok());
    assert!(is_valid_post_title("  him  ").is_ok());
    assert!(is_valid_post_title("n\n\n\n\nanother").is_err());
    assert!(is_valid_post_title("hello there!\n this is a test.").is_err());
    assert!(is_valid_post_title("hello there! this is a test.").is_ok());
    assert!(is_valid_post_title(("12345".repeat(40) + "x").as_str()).is_err());
    assert!(is_valid_post_title("12345".repeat(40).as_str()).is_ok());
    assert!(is_valid_post_title((("12345".repeat(40)) + "  ").as_str()).is_ok());
  }

  #[test]
  fn test_valid_actor_name() {
    let actor_name_max_length = 20;
    assert!(is_valid_actor_name("Hello_98", actor_name_max_length).is_ok());
    assert!(is_valid_actor_name("ten", actor_name_max_length).is_ok());
    assert!(is_valid_actor_name("تجريب", actor_name_max_length).is_ok());
    assert!(is_valid_actor_name("تجريب_123", actor_name_max_length).is_ok());
    assert!(is_valid_actor_name("Владимир", actor_name_max_length).is_ok());

    // mixed scripts
    assert!(is_valid_actor_name("تجريب_abc", actor_name_max_length).is_err());
    assert!(is_valid_actor_name("Влад_abc", actor_name_max_length).is_err());
    // dash
    assert!(is_valid_actor_name("Hello-98", actor_name_max_length).is_err());
    // too short
    assert!(is_valid_actor_name("a", actor_name_max_length).is_err());
    // empty
    assert!(is_valid_actor_name("", actor_name_max_length).is_err());
  }

  #[test]
  fn test_valid_display_name() {
    let actor_name_max_length = 20;
    assert!(is_valid_display_name("hello @there", actor_name_max_length).is_ok());
    assert!(is_valid_display_name("@hello there", actor_name_max_length).is_err());
    assert!(is_valid_display_name("\u{200d}hello", actor_name_max_length).is_err());
    assert!(is_valid_display_name(
      "\u{1f3f3}\u{fe0f}\u{200d}\u{26a7}\u{fe0f}Name",
      actor_name_max_length
    )
    .is_ok());
    assert!(is_valid_display_name("\u{2003}1\u{ffa0}2\u{200d}", actor_name_max_length).is_err());

    // Make sure zero-space with an @ doesn't work
    assert!(
      is_valid_display_name(&format!("{}@my name is", '\u{200b}'), actor_name_max_length).is_err()
    );
  }

  #[test]
  fn test_valid_post_title() {
    assert!(is_valid_post_title("Post Title").is_ok());
    assert!(is_valid_post_title(
      "აშშ ითხოვს ირანს დაუყოვნებლივ გაანთავისუფლოს დაკავებული ნავთობის ტანკერი"
    )
    .is_ok());
    assert!(is_valid_post_title("   POST TITLE 😃😃😃😃😃").is_ok());
    assert!(is_valid_post_title("\n \n \n \n    		").is_err()); // tabs/spaces/newlines
  }

  #[test]
  fn test_valid_matrix_id() {
    assert!(is_valid_matrix_id("@dess:matrix.org").is_ok());
    assert!(is_valid_matrix_id("@dess_:matrix.org").is_ok());
    assert!(is_valid_matrix_id("@dess:matrix.org:443").is_ok());
    assert!(is_valid_matrix_id("dess:matrix.org").is_err());
    assert!(is_valid_matrix_id(" @dess:matrix.org").is_err());
    assert!(is_valid_matrix_id("@dess:matrix.org t").is_err());
    assert!(is_valid_matrix_id("@dess:matrix.org t").is_err());
  }

  #[test]
  fn test_valid_site_name() {
    let valid_names = [
      (0..SITE_NAME_MAX_LENGTH).map(|_| 'A').collect::<String>(),
      String::from("A"),
    ];
    let invalid_names = [
      (
        &(0..SITE_NAME_MAX_LENGTH + 1)
          .map(|_| 'A')
          .collect::<String>(),
        LemmyErrorType::SiteNameLengthOverflow,
      ),
      (&String::new(), LemmyErrorType::SiteNameRequired),
    ];

    valid_names.iter().for_each(|valid_name| {
      assert!(
        site_name_length_check(valid_name).is_ok(),
        "Expected {} of length {} to be Ok.",
        valid_name,
        valid_name.len()
      )
    });

    invalid_names
      .iter()
      .for_each(|(invalid_name, expected_err)| {
        let result = site_name_length_check(invalid_name);

        assert!(result.is_err());
        assert!(
          result.unwrap_err().error_type.eq(&expected_err.clone()),
          "Testing {}, expected error {}",
          invalid_name,
          expected_err
        );
      });
  }

  #[test]
  fn test_valid_bio() {
    assert!(is_valid_bio_field(&(0..BIO_MAX_LENGTH).map(|_| 'A').collect::<String>()).is_ok());

    let invalid_result =
      is_valid_bio_field(&(0..BIO_MAX_LENGTH + 1).map(|_| 'A').collect::<String>());

    assert!(
      invalid_result.is_err()
        && invalid_result
          .unwrap_err()
          .error_type
          .eq(&LemmyErrorType::BioLengthOverflow)
    );
  }

  #[test]
  fn test_valid_site_description() {
    assert!(site_description_length_check(
      &(0..SITE_DESCRIPTION_MAX_LENGTH)
        .map(|_| 'A')
        .collect::<String>()
    )
    .is_ok());

    let invalid_result = site_description_length_check(
      &(0..SITE_DESCRIPTION_MAX_LENGTH + 1)
        .map(|_| 'A')
        .collect::<String>(),
    );

    assert!(
      invalid_result.is_err()
        && invalid_result
          .unwrap_err()
          .error_type
          .eq(&LemmyErrorType::SiteDescriptionLengthOverflow)
    );
  }

  #[test]
  fn test_valid_slur_regex() {
    let valid_regexes = [&None, &Some(""), &Some("(foo|bar)")];

    valid_regexes.iter().for_each(|regex| {
      let result = build_and_check_regex(regex);

      assert!(result.is_ok(), "Testing regex: {:?}", regex);
    });
  }

  #[test]
  fn test_too_permissive_slur_regex() {
    let match_everything_regexes = [
      (&Some("["), LemmyErrorType::InvalidRegex),
      (&Some("(foo|bar|)"), LemmyErrorType::PermissiveRegex),
      (&Some(".*"), LemmyErrorType::PermissiveRegex),
    ];

    match_everything_regexes
      .iter()
      .for_each(|(regex_str, expected_err)| {
        let result = build_and_check_regex(regex_str);

        assert!(result.is_err());
        assert!(
          result.unwrap_err().error_type.eq(&expected_err.clone()),
          "Testing regex {:?}, expected error {}",
          regex_str,
          expected_err
        );
      });
  }

  #[test]
  fn test_check_site_visibility_valid() {
    assert!(check_site_visibility_valid(true, true, &None, &None).is_err());
    assert!(check_site_visibility_valid(true, false, &None, &Some(true)).is_err());
    assert!(check_site_visibility_valid(false, true, &Some(true), &None).is_err());
    assert!(check_site_visibility_valid(false, false, &Some(true), &Some(true)).is_err());
    assert!(check_site_visibility_valid(true, false, &None, &None).is_ok());
    assert!(check_site_visibility_valid(false, true, &None, &None).is_ok());
    assert!(check_site_visibility_valid(false, false, &Some(true), &None).is_ok());
    assert!(check_site_visibility_valid(false, false, &None, &Some(true)).is_ok());
  }

  #[test]
  fn test_check_url_scheme() {
    assert!(check_url_scheme(&None).is_ok());
    assert!(check_url_scheme(&Some(Url::parse("http://example.com").unwrap())).is_ok());
    assert!(check_url_scheme(&Some(Url::parse("https://example.com").unwrap())).is_ok());
    assert!(check_url_scheme(&Some(Url::parse("https://example.com").unwrap())).is_ok());
    assert!(check_url_scheme(&Some(Url::parse("ftp://example.com").unwrap())).is_err());
    assert!(check_url_scheme(&Some(Url::parse("javascript:void").unwrap())).is_err());

    let magnet_link="magnet:?xt=urn:btih:4b390af3891e323778959d5abfff4b726510f14c&dn=Ravel%20Complete%20Piano%20Sheet%20Music%20-%20Public%20Domain&tr=udp%3A%2F%2Fopen.tracker.cl%3A1337%2Fannounce";
    assert!(check_url_scheme(&Some(Url::parse(magnet_link).unwrap())).is_ok());
  }

  #[test]
  fn test_url_block() {
    let set = regex::RegexSet::new(vec![
      r"(https://)?example\.org/page/to/article",
      r"(https://)?example\.net/?",
      r"(https://)?example\.com/?",
    ])
    .unwrap();

    assert!(is_url_blocked(&Some(Url::parse("https://example.blog").unwrap()), &set).is_ok());

    assert!(is_url_blocked(&Some(Url::parse("https://example.org").unwrap()), &set).is_ok());

    assert!(is_url_blocked(&None, &set).is_ok());

    assert!(is_url_blocked(&Some(Url::parse("https://example.com").unwrap()), &set).is_err());
  }

  #[test]
  fn test_url_parsed() {
    assert_eq!(
      vec![String::from("https://example.com/")],
      check_urls_are_valid(&vec![String::from("example.com")]).unwrap()
    );

    assert!(check_urls_are_valid(&vec![
      String::from("example.com"),
      String::from("https://example.blog")
    ])
    .is_ok());

    assert!(check_urls_are_valid(&vec![String::from("https://example .com"),]).is_err());
  }
}