diff --git a/listsofwikis/checkalive.pl b/listsofwikis/checkalive.pl index e69de29..3100472 100644 --- a/listsofwikis/checkalive.pl +++ b/listsofwikis/checkalive.pl @@ -0,0 +1,113 @@ +#!/usr/bin/perl + +# Name: checkalive.pl +# Description: This script will go thru a list of wiki URLs & determine +# if they are online & if they are Mediawiki wikis. It should work with +# "index.php/Main_Page", "index.php" and "api.php". + +# Created: 12/14/2013 +# Most recently updated: 01/21/2014 +# Copyright (c) 2013-2014 Scott D. Boyd - scottdb56@gmail.com + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# NOTE: The following three Perl modules need to be installed on your computer. +# Search for them on cpan.org or use your Linux distro's package manager. +use LWP::Simple; +use LWP::UserAgent; +use Crypt::SSLeay; +my $slp=2; # You can change this number for seconds to sleep between requests (currently 2) +my $urllist="URL-list.txt"; +my $alivelist="alive-wikis.txt"; +my $deadlist="dead-wikis.txt"; +my $pwrdby1="Powered by MediaWiki"; +my $pwrdby2="poweredby_mediawiki"; +my $mwapi="MediaWiki API documentation page"; +my $lw=0; +my $dw=0; +my $flag=0; +my $ua = LWP::UserAgent->new; +$ua->agent("Mozilla/5.0"); # using this user-agent gets around the wikis that don't allow scripts +$ua->timeout(30); +$ua->show_progress(1); + +# Here's where most of the work takes place: +open (MYURLLIST,"<$urllist"); +open (ALIVEFILE,">$alivelist"); +open (DEADFILE,">$deadlist"); +while () { + if (/\#(.*?)/) { # check to see if line is commented-out + next; # if so - skip it + } else { + $url=$_; # assign the current line to $url + chomp $url; + $req = HTTP::Request->new(GET => $url); # --| + $req->header('Accept' => 'text/html'); # | -- some of these lines + $res = $ua->request($req); # send request | -- were adapted from + if ($res->is_success) { # if the URL still exists | -- lwpcook.pod + print "Got it! "; # | + $doc=$res->content; + &ParsePage; # go to "ParsePage" sub-routine --| + } else { + $errormsg=$res->status_line; + if (substr($errormsg,0,3) eq "500") { # if response-code 500 + &reCheck; # try the fetch once more + } elsif (substr($errormsg,0,3) eq "403") { # if URL is forbidden (code 403) + print DEADFILE "$url is alive but forbidding the script\n"; $dw++; # print to the dead-file + } elsif (substr($errormsg,0,3) eq "404") { # if URL is dead (code 404) + print DEADFILE "$url\n"; $dw++; # print to the dead-file + } elsif (substr($errormsg,0,3) eq "406") { # if Not Acceptable (code 406) + print DEADFILE "$url\n"; $dw++; # print to the dead-file + } + } + print "Pausing for $slp seconds...\n"; sleep $slp; + } +} +close DEADFILE; close ALIVEFILE; close MYURLLIST; +print "I found $lw live wikis and $dw dead or non-Mediawiki wikis.\n"; + +# Here's the sub-routines +# ======================= +sub ParsePage { + print "Parsing the document... "; + if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" + print "It's alive and running Mediawiki.\n"; # or: "poweredby_mediawiki" + print ALIVEFILE "$url\n"; + $flag=1;$lw++; + } elsif ($doc=~/$mwapi/i) { # if the api.php contains: "MediaWiki API documentation page" + print "It's alive and running Mediawiki.\n"; + print ALIVEFILE "$url\n"; + $flag=1;$lw++; + } + unless ($flag) { + print "It's alive but NOT running MediaWiki.\n"; + print DEADFILE "$url is alive but NOT running Mediawiki.\n"; $dw++; + } + $flag=0; +} + +sub reCheck { + print "Sleeping for 15 seconds - then re-trying...\n"; sleep 15; + $req = HTTP::Request->new(GET => $url); + $req->header('Accept' => 'text/html'); + $res = $ua->request($req); + if ($res->is_success) { + print "Got it! "; + $doc=$res->content; + &ParsePage; # go to "ParsePage" sub-routine + } else { + print DEADFILE "$url\n"; $dw++; + } +} +