diff --git a/src/bin/hacktoberfest.rs b/src/bin/hacktoberfest.rs
new file mode 100644
index 0000000..b888010
--- /dev/null
+++ b/src/bin/hacktoberfest.rs
@@ -0,0 +1,283 @@
+// Helper tool to dump all repos in awesome-rust that are tagged with "hacktoberfest"
+
+use chrono::{DateTime, Duration, Local};
+use failure::{format_err, Error, Fail};
+use futures::future::{select_all, BoxFuture, FutureExt};
+use lazy_static::lazy_static;
+use log::{debug, warn};
+use pulldown_cmark::{Event, Parser, Tag};
+use regex::Regex;
+use reqwest::redirect::Policy;
+use reqwest::Client;
+use serde::{Deserialize, Serialize};
+use std::collections::{BTreeMap, BTreeSet};
+use std::env;
+use std::fs;
+use std::io::Write;
+use std::time;
+use std::u8;
+use tokio::sync::Semaphore;
+use tokio::sync::SemaphorePermit;
+
+#[derive(Debug, Fail, Serialize, Deserialize)]
+enum CheckerError {
+    #[fail(display = "http error: {}", status)]
+    HttpError {
+        status: u16,
+        location: Option<String>,
+    },
+}
+
+struct MaxHandles {
+    remaining: Semaphore,
+}
+
+struct Handle<'a> {
+    _permit: SemaphorePermit<'a>,
+}
+
+impl MaxHandles {
+    fn new(max: usize) -> MaxHandles {
+        MaxHandles {
+            remaining: Semaphore::new(max),
+        }
+    }
+
+    async fn get<'a>(&'a self) -> Handle<'a> {
+        let permit = self.remaining.acquire().await.unwrap();
+        return Handle { _permit: permit };
+    }
+}
+
+impl<'a> Drop for Handle<'a> {
+    fn drop(&mut self) {
+        debug!("Dropping");
+    }
+}
+
+lazy_static! {
+    static ref CLIENT: Client = Client::builder()
+        .danger_accept_invalid_certs(true) // because some certs are out of date
+        .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0") // so some sites (e.g. sciter.com) don't reject us
+        .redirect(Policy::none())
+        .pool_max_idle_per_host(0)
+        .timeout(time::Duration::from_secs(20))
+        .build().unwrap();
+
+    // This is to avoid errors with running out of file handles, so we only do 20 requests at a time
+    static ref HANDLES: MaxHandles = MaxHandles::new(20);
+}
+
+lazy_static! {
+    static ref GITHUB_REPO_REGEX: Regex =
+        Regex::new(r"^https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/?$").unwrap();
+    static ref GITHUB_API_REGEX: Regex = Regex::new(r"https://api.github.com/").unwrap();
+}
+
+#[derive(Deserialize, Debug)]
+struct RepoInfo {
+    full_name: String,
+    description: Option<String>,
+    topics: Vec<String>,
+}
+
+async fn get_hacktoberfest_core(github_url: String) -> Result<Info, CheckerError> {
+    warn!("Downloading Hacktoberfest label for {}", github_url);
+    let rewritten = GITHUB_REPO_REGEX
+        .replace_all(&github_url, "https://api.github.com/repos/$org/$repo")
+        .to_string();
+    let mut req = CLIENT.get(&rewritten);
+    if let Ok(username) = env::var("GITHUB_USERNAME") {
+        if let Ok(password) = env::var("GITHUB_TOKEN") {
+            // needs a token with at least public_repo scope
+            req = req.basic_auth(username, Some(password));
+        }
+    }
+
+    let resp = req.send().await;
+    match resp {
+        Err(err) => {
+            warn!("Error while getting {}: {}", github_url, err);
+            return Err(CheckerError::HttpError {
+                status: err.status().unwrap().as_u16(),
+                location: Some(github_url.to_string()),
+            });
+        }
+        Ok(ok) => {
+            if !ok.status().is_success() {
+                return Err(CheckerError::HttpError {
+                    status: ok.status().as_u16(),
+                    location: None,
+                });
+            }
+            let raw = ok.text().await.unwrap();
+            match serde_json::from_str::<RepoInfo>(&raw) {
+                Ok(val) => Ok(Info {
+                    name: val.full_name,
+                    description: val.description.unwrap_or_default(),
+                    hacktoberfest: val.topics.iter().find(|t| *t == "hacktoberfest").is_some(),
+                }),
+                Err(_) => {
+                    panic!("{}", raw);
+                }
+            }
+        }
+    }
+}
+
+fn get_hacktoberfest(url: String) -> BoxFuture<'static, (String, Result<Info, CheckerError>)> {
+    debug!("Need handle for {}", url);
+    async move {
+        let _handle = HANDLES.get().await;
+        return (url.clone(), get_hacktoberfest_core(url).await);
+    }
+    .boxed()
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Info {
+    hacktoberfest: bool,
+    name: String,
+    description: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct Link {
+    updated_at: DateTime<Local>,
+    info: Info,
+}
+
+type Results = BTreeMap<String, Link>;
+
+#[tokio::main]
+async fn main() -> Result<(), Error> {
+    env_logger::init();
+    let markdown_input = fs::read_to_string("README.md").expect("Can't read README.md");
+    let parser = Parser::new(&markdown_input);
+
+    let mut used: BTreeSet<String> = BTreeSet::new();
+    let mut results: Results = fs::read_to_string("results/hacktoberfest.yaml")
+        .map_err(|e| format_err!("{}", e))
+        .and_then(|x| serde_yaml::from_str(&x).map_err(|e| format_err!("{}", e)))
+        .unwrap_or(Results::new());
+
+    let mut url_checks = vec![];
+
+    let mut do_check = |url: String| {
+        if !url.starts_with("http") {
+            return;
+        }
+        if used.contains(&url) {
+            return;
+        }
+        used.insert(url.clone());
+        if let Some(_) = results.get(&url) {
+            return;
+        }
+        let check = get_hacktoberfest(url).boxed();
+        url_checks.push(check);
+    };
+
+    let mut to_check: Vec<String> = vec![];
+
+    #[derive(Debug)]
+    struct ListInfo {
+        location: usize,
+        data: Vec<String>,
+    }
+
+    for (event, _) in parser.into_offset_iter() {
+        match event {
+            Event::Start(tag) => match tag {
+                Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => {
+                    if GITHUB_REPO_REGEX.is_match(&url) {
+                        to_check.push(url.to_string());
+                    }
+                }
+                _ => {}
+            },
+            _ => {}
+        }
+    }
+
+    for url in to_check {
+        do_check(url)
+    }
+
+    let results_keys = results.keys().cloned().collect::<BTreeSet<String>>();
+    let old_links = results_keys.difference(&used);
+    for link in old_links {
+        results.remove(link).unwrap();
+    }
+    fs::write("results/hacktoberfest.yaml", serde_yaml::to_string(&results)?)?;
+
+    let mut not_written = 0;
+    let mut last_written = Local::now();
+
+    let mut failed: u32 = 0;
+    while url_checks.len() > 0 {
+        debug!("Waiting for {}", url_checks.len());
+        let ((url, res), _index, remaining) = select_all(url_checks).await;
+        url_checks = remaining;
+        match res {
+            Ok(info) => {
+                print!("\u{2714} ");
+                if let Some(link) = results.get_mut(&url) {
+                    link.updated_at = Local::now();
+                    link.info = info
+                } else {
+                    results.insert(
+                        url.clone(),
+                        Link {
+                            updated_at: Local::now(),
+                            info: info,
+                        },
+                    );
+                }
+            }
+            Err(_) => {
+                print!("\u{2718} ");
+                println!("{}", url);
+                failed += 1;
+            }
+        }
+        std::io::stdout().flush().unwrap();
+
+        not_written += 1;
+        let duration = Local::now() - last_written;
+        if duration > Duration::seconds(5) || not_written > 20 {
+            fs::write(
+                "results/hacktoberfest.yaml",
+                serde_yaml::to_string(&results)?,
+            )?;
+            not_written = 0;
+            last_written = Local::now();
+        }
+    }
+    fs::write(
+        "results/hacktoberfest.yaml",
+        serde_yaml::to_string(&results)?,
+    )?;
+    println!("");
+
+    if failed == 0 {
+        println!("All awesome-rust repos tagged with 'hacktoberfest'");
+        let mut sorted_repos = results
+            .keys()
+            .map(|s| s.to_string())
+            .collect::<Vec<String>>();
+        sorted_repos.sort_by_key(|a| a.to_lowercase());
+        for name in sorted_repos {
+            let link = results.get(&name).unwrap();
+            if link.info.hacktoberfest {
+                println!(
+                    "* [{}]({}) - {}",
+                    link.info.name, name, link.info.description
+                )
+            }
+        }
+        Ok(())
+    } else {
+        Err(format_err!("{} urls with errors", failed))
+    }
+}