#!/usr/bin/perl # Name: checkalive.pl v2.01 # Description: This script will go thru a list of URLs & determine if they are online & if they are Mediawiki wikis. # It should work with: "/index.php/Main_Page", "index.php", "api.php" and even pages such as: "/wiki/Pagina_principale". # If the URl is not "api.php", it will look for it, check it, and output it if found to be a valid api.php. If not found, # it will output the URL with "index.php" if that's available. # # Created: 12/14/2013 # Most recently updated: 04/11/2014 # Copyright (c) 2013-2014 by Scott D. Boyd - scottdb56@gmail.com # # =========================================================================================================================== # This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License # as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along with this program. If not, # see . # =========================================================================================================================== # # NOTE: The following four Perl modules need to be installed on your computer. # Search for them on cpan.org or use your Linux distro's package manager. use LWP::Simple; use LWP::UserAgent; use Crypt::SSLeay; use Mojo::URL; my $slp=2; # You can change this number for seconds to sleep between requests (currently 2 seconds) my $urllist="URL-list.txt"; my $alivelist="alive-wikis.txt"; my $deadlist="dead-wikis.txt"; my $pwrdby1="Powered by MediaWiki"; my $pwrdby2="poweredby_mediawiki"; my $genmw="meta name\=\"generator\" content\=\"MediaWiki"; my $mwapi="MediaWiki API documentation page"; my $mwapi2="API Home Page"; # found in an older version of the api my $indexphp="index.php"; my $apiphp="api.php"; my $wapiphp="w\/api.php"; my $wikiapiphp="wiki\/api.php"; my $apiurl=""; my $live=0; my $dead=0; my $a=1; my $b=0; my $c=0; my $flag=0; my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0"); # use this user-agent to get into wikis that block spiders & robots $ua->timeout(30); $ua->show_progress(1); open (MYURLLIST,"<$urllist") or die "Cannot open the URL-list file: $!"; open (ALIVEFILE,">$alivelist"); open (DEADFILE,">$deadlist"); while () { if ((/\#(.*?)/) || (/^\s*$/)) { # check to see if line is a comment or a blank line next; # if so - skip it } else { $url=$_; # assign the current line to $url chomp $url; $req = HTTP::Request->new(GET => $url); # --| $req->header('Accept' => 'text/html'); # |-- some of these lines $res = $ua->request($req); # send request |-- were adapted from if ($res->is_success) { # if the URL still exists |-- lwpcook.pod print "Got it! "; # | $doc=$res->content; # | print "Parsing the document... "; if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" print "It's alive and powered by Mediawiki\n"; # or: "poweredby_mediawiki" $flag=1;$live++; # then it's a MediaWiki wiki & Check4api; } elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki $flag=1;$live++; & Check4api; } elsif ($doc=~/$mwapi/i) { # if the api.php contains: "MediaWiki API documentation page" print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki print ALIVEFILE "$url\n"; $flag=1;$live++; } elsif ($doc=~/$mwapi2/i) { # if the api.php contains: "API Home Page" (older version) print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki print ALIVEFILE "$url\n"; $flag=1;$live++; } unless ($flag) { print "It's alive but NOT powered by MediaWiki\n"; print DEADFILE "$url is alive but NOT powered by Mediawiki\n"; $dead++; } $flag=0; } else { $errormsg=$res->status_line; if (substr($errormsg,0,3) eq "500") { # if response-code 500 print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "401") { # if Unauthorized (code 401) print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "403") { # if forbidden (code 403) print DEADFILE "$url is alive but access is denied.\n"; $dead++; } elsif (substr($errormsg,0,3) eq "404") { # if URL is dead (code 404) print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "406") { # if Not Acceptable (code 406) print DEADFILE "$url\n"; $dead++; } } $c++; $b=$c/10; if ($b==$a) { print "Checked $c URLs -- "; # print the progress every 10 URLs $a++; } &PauseRoutine; } } close DEADFILE; close ALIVEFILE; close MYURLLIST; print "\nFinished! I found $live live wikis and $dead dead or non-Mediawiki wikis.\n"; # Here's the sub-routines # ======================= sub Check4api { $pos=rindex($url,"\/"); # $pos will contain the position of the final "/" (counting from zero) $base_plus=substr($url,0,($pos+1)); # $base_plus will contain everything up to & including the final "/" my $len1=length($url); my $len2=length($base_plus); if ($len2 < 10) { # if $base_plus contains only "http://" or "https://" $base_plus=$url; # then assign $url to $base_plus my $tail=substr $base_plus, -1; if (!($tail=~/\//)) { # if the last character of $base_plus is not a "/" $base_plus=$base_plus."\/" ; # then add it } } $apiurl=$base_plus.$apiphp; # $apiurl is our new URL with api.php tacked on the end &PauseRoutine; & Fetch_api; # pause & then try to get api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ # if no api.php... $apiurl=$base_plus.$wapiphp; # modify the URL &PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ # if no /w/api.php... $apiurl=$base_plus.$wikiapiphp; # modify the URL &PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ if (/https:\/\//) { $scheme="https://"; } else { $scheme="http://"; } $url = Mojo::URL->new($url); $base = $url->host; # extract just the host from $url & assign it to $base $base=$scheme.$base; my $tail=substr $base, -1; if (!($tail=~/\//)) { # if the last character of $base_plus is not a "/" $base=$base."\/" ; # then add it } $apiurl=$base.$apiphp; # $apiurl is our new URL with api.php tacked on the end &PauseRoutine; & Fetch_api; # pause & then try to get api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ # if no api.php... $apiurl=$base.$wapiphp; # modify the URL &PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ # if no /w/api.php... $apiurl=$base.$wikiapiphp; # modify the URL &PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php if ($res->is_success) { print "Found api.php... "; $doc=$res->content; &Parse_api; }else{ if (!($url=~/index.php/i)) { # if the URL does not end with index.php... print "There is no api.php -- I'll try index.php...\n"; $indexurl=$base_plus.$indexphp; # then tack on index.php... $req = HTTP::Request->new(GET => $indexurl); # and try to get it $req->header('Accept' => 'text/html'); $res = $ua->request($req); # send request if ($res->is_success) { $doc=$res->content; if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" print ALIVEFILE "$indexurl\n"; # or: "poweredby_mediawiki" }elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki print ALIVEFILE "$indexurl\n"; }else{ print "There is no api.php OR index.php for this URL\n"; print ALIVEFILE "$url\n"; } }else{ print ALIVEFILE "$url\n"; } }else{ print "There is no api.php for this URL\n"; print ALIVEFILE "$url\n"; } } } } } } } } sub Fetch_api { $req = HTTP::Request->new(GET => $apiurl); $req->header('Accept' => 'text/html'); $res = $ua->request($req); # send request } sub Parse_api { print "Parsing the document...\n"; if ($doc=~/$mwapi/i) { # if the api.php contains: "MediaWiki API documentation page" print "Found a valid api.php and writing it to the list\n"; print ALIVEFILE "$apiurl\n"; # then it's a MediaWiki wiki - print it to the list }elsif ($doc=~/$mwapi2/i) { # if the api.php contains: "API Home Page" (older version) print "Found a valid api.php and writing it to the list\n"; print ALIVEFILE "$apiurl\n"; # then it's a MediaWiki wiki - print it to the list }else{ print "This api.php is not valid.\n"; # then try to get index.php $indexurl=$base_plus.$indexphp; print "Trying to get $indexurl...\n"; $req = HTTP::Request->new(GET => $indexurl); $req->header('Accept' => 'text/html'); $res = $ua->request($req); # send request if ($res->is_success) { $doc=$res->content; if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" print "Found a good index.php and writing it to the list\n"; print ALIVEFILE "$indexurl\n"; # or: "poweredby_mediawiki" }elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki print "Found a good index.php and writing it to the list\n"; print ALIVEFILE "$indexurl\n"; }else{ print "There is no api.php OR index.php for $url\n"; print ALIVEFILE "$url\n"; } } } } sub PauseRoutine { if ($slp > 0) { print "Pausing for $slp seconds...\n\n"; sleep $slp; } else { } # don't pause - go on to the next URL }