diff --git a/src/main.rs b/src/main.rs index e0324ce..61d1873 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,5 @@ use pulldown_cmark::{Parser, Event, Tag}; -use std::fs; +use std::{cmp::Ordering, fs}; use futures::future::{select_all, BoxFuture, FutureExt}; use std::collections::{BTreeSet, BTreeMap}; use serde::{Serialize, Deserialize}; @@ -26,6 +26,9 @@ enum CheckerError { location: Option, }, + #[fail(display = "too many requests")] + TooManyRequests, + #[fail(display = "reqwest error: {}", error)] ReqwestError { error: String, @@ -191,6 +194,12 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr return (url, res); } + if status == StatusCode::TOO_MANY_REQUESTS { + // We get a lot of these, and we should not retry as they'll just fail again + warn!("Error while getting {}: {}", url, status); + return (url, Err(CheckerError::TooManyRequests)); + } + warn!("Error while getting {}, retrying: {}", url, status); if status.is_redirection() { res = Err(CheckerError::HttpError {status: status.as_u16(), location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string())}); @@ -284,12 +293,14 @@ async fn main() -> Result<(), Error> { url_checks.push(check); }; + let mut to_check: Vec = vec![]; + for (event, _range) in parser.into_offset_iter() { match event { Event::Start(tag) => { match tag { Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => { - do_check(url.to_string()); + to_check.push(url.to_string()); } _ => {} } @@ -301,6 +312,38 @@ async fn main() -> Result<(), Error> { } } + to_check.sort_by(|a,b| { + let get_time = |k| { + let res = results.get(k); + if let Some(link) = res { + if let Some(last_working) = link.last_working { + Some(last_working) + } else { + None + } + } else { + None + } + }; + let res_a = get_time(a); + let res_b = get_time(b); + if res_a.is_none() { + if res_b.is_none() { + return a.cmp(b); + } else { + Ordering::Less + } + } else if res_b.is_none() { + Ordering::Greater + } else { + res_a.unwrap().cmp(&res_b.unwrap()) + } + }); + + for url in to_check { + do_check(url) + } + let results_keys = results.keys().cloned().collect::>(); let old_links = results_keys.difference(&used); for link in old_links { @@ -365,6 +408,13 @@ async fn main() -> Result<(), Error> { failed +=1; continue; } + CheckerError::TooManyRequests => { + // too many tries + if link.last_working.is_some() { + info!("Ignoring 429 failure on {} as we've seen success before", url); + continue; + } + } _ => {} }; if let Some(last_working) = link.last_working {