From 7bc2d0bb567acd3ce06af643a181c44fb15f0c29 Mon Sep 17 00:00:00 2001 From: Tom Parker-Shemilt Date: Thu, 24 Jun 2021 16:28:08 +0100 Subject: [PATCH 1/4] Don't retry 429's and don't worry about them if we've got a previous success --- src/main.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main.rs b/src/main.rs index e0324ce..8861ca3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,6 +26,9 @@ enum CheckerError { location: Option, }, + #[fail(display = "too many requests")] + TooManyRequests, + #[fail(display = "reqwest error: {}", error)] ReqwestError { error: String, @@ -191,6 +194,12 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr return (url, res); } + if status == StatusCode::TOO_MANY_REQUESTS { + // We get a lot of these, and we should not retry as they'll just fail again + warn!("Error while getting {}: {}", url, status); + return (url, Err(CheckerError::TooManyRequests)); + } + warn!("Error while getting {}, retrying: {}", url, status); if status.is_redirection() { res = Err(CheckerError::HttpError {status: status.as_u16(), location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string())}); @@ -365,6 +374,13 @@ async fn main() -> Result<(), Error> { failed +=1; continue; } + CheckerError::TooManyRequests => { + // too many tries + if link.last_working.is_some() { + info!("Ignoring 429 failure on {} as we've seen success before", url); + continue; + } + } _ => {} }; if let Some(last_working) = link.last_working { From e3cd6c6a107e80bffec8f634af44be59a7a0b13d Mon Sep 17 00:00:00 2001 From: Tom Parker-Shemilt Date: Tue, 29 Jun 2021 20:48:00 +0100 Subject: [PATCH 2/4] Check the oldest/unchecked ones first --- src/main.rs | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8861ca3..aa92607 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,5 @@ use pulldown_cmark::{Parser, Event, Tag}; -use std::fs; +use std::{cmp::Ordering, fs}; use futures::future::{select_all, BoxFuture, FutureExt}; use std::collections::{BTreeSet, BTreeMap}; use serde::{Serialize, Deserialize}; @@ -10,7 +10,7 @@ use std::io::Write; use reqwest::{Client, redirect::Policy, StatusCode, header, Url}; use regex::Regex; use failure::{Fail, Error, format_err}; -use chrono::{Local, DateTime, Duration}; +use chrono::{DateTime, Duration, Local}; use std::env; use tokio::sync::Semaphore; use tokio::sync::SemaphorePermit; @@ -121,7 +121,7 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr async move { let mut res = Err(CheckerError::NotTried); for _ in 0..5u8 { - debug!("Running {}", url); + info!("Running {}", url); lazy_static! { static ref GITHUB_REPO_REGEX: Regex = Regex::new(r"^https://github.com/(?P[^/]+)/(?P[^/]+)$").unwrap(); static ref GITHUB_API_REGEX: Regex = Regex::new(r"https://api.github.com/").unwrap(); @@ -293,12 +293,14 @@ async fn main() -> Result<(), Error> { url_checks.push(check); }; + let mut to_check: Vec = vec![]; + for (event, _range) in parser.into_offset_iter() { match event { Event::Start(tag) => { match tag { Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => { - do_check(url.to_string()); + to_check.push(url.to_string()); } _ => {} } @@ -310,6 +312,38 @@ async fn main() -> Result<(), Error> { } } + to_check.sort_by(|a,b| { + let get_time = |k| { + let res = results.get(k); + if let Some(link) = res { + if let Some(last_working) = link.last_working { + Some(last_working) + } else { + None + } + } else { + None + } + }; + let res_a = get_time(a); + let res_b = get_time(b); + if res_a.is_none() { + if res_b.is_none() { + return a.cmp(b); + } else { + Ordering::Greater + } + } else if res_b.is_none() { + Ordering::Less + } else { + res_a.unwrap().cmp(&res_b.unwrap()) + } + }); + + for url in to_check { + do_check(url) + } + let results_keys = results.keys().cloned().collect::>(); let old_links = results_keys.difference(&used); for link in old_links { From d3260a60d609203ffbae643a7931c17a5a1126e3 Mon Sep 17 00:00:00 2001 From: Tom Parker-Shemilt Date: Tue, 29 Jun 2021 20:49:32 +0100 Subject: [PATCH 3/4] Reset some unneeded changes --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index aa92607..984673e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,7 +10,7 @@ use std::io::Write; use reqwest::{Client, redirect::Policy, StatusCode, header, Url}; use regex::Regex; use failure::{Fail, Error, format_err}; -use chrono::{DateTime, Duration, Local}; +use chrono::{Local, DateTime, Duration}; use std::env; use tokio::sync::Semaphore; use tokio::sync::SemaphorePermit; @@ -121,7 +121,7 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr async move { let mut res = Err(CheckerError::NotTried); for _ in 0..5u8 { - info!("Running {}", url); + debug!("Running {}", url); lazy_static! { static ref GITHUB_REPO_REGEX: Regex = Regex::new(r"^https://github.com/(?P[^/]+)/(?P[^/]+)$").unwrap(); static ref GITHUB_API_REGEX: Regex = Regex::new(r"https://api.github.com/").unwrap(); From 90edcef35a7fc08ae5f1fd39054d80db58f00580 Mon Sep 17 00:00:00 2001 From: Tom Parker-Shemilt Date: Tue, 29 Jun 2021 21:07:56 +0100 Subject: [PATCH 4/4] Ordering was in the wrong order... --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 984673e..61d1873 100644 --- a/src/main.rs +++ b/src/main.rs @@ -331,10 +331,10 @@ async fn main() -> Result<(), Error> { if res_b.is_none() { return a.cmp(b); } else { - Ordering::Greater + Ordering::Less } } else if res_b.is_none() { - Ordering::Less + Ordering::Greater } else { res_a.unwrap().cmp(&res_b.unwrap()) }