#!/usr/bin/env bash
###             _   _     _      _
###  __ _  ___ | |_| |__ | | ___| |_ _   _
### / _` |/ _ \| __| '_ \| |/ _ \ __| | | |
###| (_| | (_) | |_| |_) | |  __/ |_| |_| |
### \__, |\___/ \__|_.__/|_|\___|\__|\__,_|
### |___/
###       https://www.youtube.com/user/gotbletu
###       https://twitter.com/gotbletu
###       https://github.com/gotbletu
###       gotbletu@gmail.com
###
### Name        : 404urlcleaner
### Version     : 0.3
### Date        : 20190508
### Description : Remove all dead urls with HTTP status code 404 (mainly for use with newsboat/newsbeuter)
### Depends On  : bash curl coreutils grep moreutils
### Video Demo  : https://youtu.be/dTpEuQZRRDM
|
|
|
|
|
|
|
|
# ANSI escape sequences used to colorize the script's status messages.
# The values are stored as literal backslash sequences, so they only take
# effect when printed with `echo -e` (or another command that interprets
# backslash escapes).

# Reset
Color_Off='\e[0m' # Text Reset

# Regular Colors
Black='\e[0;30m'  # Black
Red='\e[0;31m'    # Red
Green='\e[0;32m'  # Green
Yellow='\e[0;33m' # Yellow
Blue='\e[0;34m'   # Blue
Purple='\e[0;35m' # Purple
Cyan='\e[0;36m'   # Cyan
White='\e[0;37m'  # White
|
|
|
|
|
|
|
|
#######################################
# Print a short description of the script and how to invoke it.
# Arguments: none (uses $0 as the script name shown in the examples)
# Outputs:   usage text on stdout
#######################################
display_usage() {
  printf '%s\n' "Remove all dead urls with HTTP status code 404 (mainly use with newsboat/newsbeuter)"
  # The script path is passed as a %s argument so that any backslashes in it
  # are printed literally instead of being interpreted as escape sequences.
  printf '\n Usage:\n %s [file]\n' "$0"
  printf '\n Example:\n %s ~/.newsboat/urls\n' "$0"
}
|
|
|
|
# If no argument was supplied, display usage and exit with an error.
if [[ $# -lt 1 ]]; then
  display_usage
  exit 1
fi

# If the first argument is -h or --help, display usage and exit successfully.
# Only "$1" is inspected: inside [[ ]] an unquoted $@ is joined into a single
# string, so a help flag followed by any other argument would not be matched.
case "$1" in
  -h | --help)
    display_usage
    exit 0
    ;;
esac
|
|
|
|
|
|
|
|
# Scratch files for the scan results. They are created with mktemp so the
# names are unpredictable and each run starts with empty files (results are
# appended to them, so leftovers from an aborted run would otherwise be
# picked up again).

# all url status code
TMPFILE1=$(mktemp "${TMPDIR:-/tmp}/urlstatus.XXXXXX") || exit 1

# only 404 status code
TMPFILE2=$(mktemp "${TMPDIR:-/tmp}/urlstatus2nd.XXXXXX") || {
  rm -f -- "$TMPFILE1"
  exit 1
}

# Remove both scratch files on any exit path, including interruption.
cleanup_tmpfiles() {
  rm -f -- "$TMPFILE1" "$TMPFILE2"
}
trap cleanup_tmpfiles EXIT
|
|
|
|
|
|
|
|
# Create a timestamped backup copy next to the input file before it is
# modified. The script stops if the copy fails, so the file is never edited
# without a backup.
# NOTE(review): the timestamp format labels the minutes field with "s"
# (%Hh%Ms%S); it is kept unchanged so backup names stay as before.
echo -e "${Green}>>>Creating Backup File${Color_Off}"
if ! cp -aiv -- "$1" "${1}_$(date +'%F_%Hh%Ms%S')"; then
  echo "Backup failed; aborting without modifying $1" >&2
  exit 1
fi
|
|
|
|
|
|
|
|
# Check every link and record its HTTP status code.
# Lines of the input file that start with "http" (case-insensitive) are
# selected, their first whitespace-separated field is taken as the URL, and a
# HEAD request is sent to it. Each result is printed and appended to
# $TMPFILE1 as "<url> C0DE-<status>".
echo -e "${Green}>>>Checking HTTP Status Code${Color_Off}"
grep -i '^http' -- "$1" \
  | awk '{print $1}' \
  | while IFS= read -r url; do
    # -r keeps backslashes in the URL intact.
    urlstatus=$(curl -o /dev/null --silent --head --write-out '%{http_code}' "$url")
    printf '%s\n' "$url C0DE-$urlstatus" | tee -a "$TMPFILE1"
  done
|
|
|
|
|
|
|
|
# Double check the 404 links (sometimes the first check is not accurate).
# Every URL recorded with C0DE-404 in $TMPFILE1 is requested a second time
# and the new result is printed and appended to $TMPFILE2.
echo -e "${Blue}>>>Double Checking ONLY DEAD HTTP Status Code${Color_Off}"
# -s keeps grep quiet when the first pass produced no results file.
grep -s -i 'C0DE-404' -- "$TMPFILE1" \
  | awk '{print $1}' \
  | while IFS= read -r url; do
    urlstatus=$(curl -o /dev/null --silent --head --write-out '%{http_code}' "$url")
    printf '%s\n' "$url C0DE-$urlstatus" | tee -a "$TMPFILE2"
  done
|
|
|
|
|
|
|
|
#######################################
# Print the contents of a file without the entries for one URL.
# A line is dropped only when its first whitespace-separated field equals the
# URL exactly, so other entries that merely contain the URL as a substring
# (for example a longer path on the same host) are kept.
# Arguments: $1 - file to read
#            $2 - URL whose entries are dropped
# Outputs:   the remaining lines on stdout
#######################################
strip_dead_url() {
  # The URL is handed to awk through the environment so it is compared as a
  # plain string, without escape or pattern interpretation.
  dead_url="$2" awk '$1 != ENVIRON["dead_url"]' "$1"
}

# Remove every URL that still returned 404 on the second check.
grep -s -i 'C0DE-404' -- "$TMPFILE2" \
  | awk '{print $1}' \
  | while IFS= read -r url; do
    # sponge soaks up all input before writing, so the file can be rewritten
    # in place.
    strip_dead_url "$1" "$url" | sponge "$1"
    echo -e "${Red}>>>REMOVING $url ${Color_Off}"
  done
|
|
|
|
|
2019-05-08 20:11:38 +00:00
|
|
|
#######################################
# Delete each given path if it exists and is a regular file.
# Arguments: any number of file paths
# Returns:   0 unless an rm call fails
#######################################
remove_if_regular_file() {
  local path
  for path in "$@"; do
    if [[ -f "$path" ]]; then
      rm -- "$path" || return
    fi
  done
  return 0
}

# Remove the temp files if they exist.
remove_if_regular_file "$TMPFILE1" "$TMPFILE2"

echo -e "${Green}>>>Scan Completed${Color_Off}"
|
2019-05-03 05:26:41 +00:00
|
|
|
|
|
|
|
|