2014-01-23 00:29:43 +00:00
|
|
|
#!/usr/bin/perl
|
|
|
|
|
2014-04-24 03:03:16 +00:00
|
|
|
# Name: checkalive.pl v2.01
|
2014-02-27 05:50:28 +00:00
|
|
|
# Description: This script will go thru a list of URLs & determine if they are online & if they are Mediawiki wikis.
|
|
|
|
# It should work with: "/index.php/Main_Page", "index.php", "api.php" and even pages such as: "/wiki/Pagina_principale".
|
|
|
|
# If the URL is not "api.php", it will look for it, check it, and output it if found to be a valid api.php. If not found,
|
|
|
|
# it will output the URL with "index.php" if that's available.
|
2014-01-26 21:58:05 +00:00
|
|
|
#
|
2014-01-23 00:29:43 +00:00
|
|
|
# Created: 12/14/2013
|
2014-04-24 03:03:16 +00:00
|
|
|
# Most recently updated: 04/11/2014
|
2014-01-26 21:58:05 +00:00
|
|
|
# Copyright (c) 2013-2014 by Scott D. Boyd - scottdb56@gmail.com
|
2014-01-23 00:29:43 +00:00
|
|
|
#
|
2014-02-27 05:50:28 +00:00
|
|
|
# ===========================================================================================================================
|
|
|
|
# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License
|
|
|
|
# as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
|
2014-01-23 00:29:43 +00:00
|
|
|
#
|
2014-02-27 05:50:28 +00:00
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
|
|
|
|
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
2014-01-26 21:58:05 +00:00
|
|
|
#
|
2014-02-27 05:50:28 +00:00
|
|
|
# You should have received a copy of the GNU General Public License along with this program. If not,
|
|
|
|
# see <http://www.gnu.org/licenses/>.
|
|
|
|
# ===========================================================================================================================
|
|
|
|
#
|
|
|
|
# NOTE: The following four Perl modules need to be installed on your computer.
|
|
|
|
# Search for them on cpan.org or use your Linux distro's package manager.
|
2014-01-23 00:29:43 +00:00
|
|
|
use LWP::Simple;
|
|
|
|
use LWP::UserAgent;
|
|
|
|
use Crypt::SSLeay;
|
2014-02-27 05:50:28 +00:00
|
|
|
use Mojo::URL;
|
|
|
|
my $slp=2;                          # seconds to sleep between requests (0 disables the pause)
my $urllist="URL-list.txt";         # input: one URL per line; blank lines and #-comments allowed
my $alivelist="alive-wikis.txt";    # output: confirmed-MediaWiki URLs
my $deadlist="dead-wikis.txt";      # output: dead or non-MediaWiki URLs

# Fingerprints that identify a MediaWiki installation.
my $pwrdby1="Powered by MediaWiki";
my $pwrdby2="poweredby_mediawiki";
my $genmw="meta name\=\"generator\" content\=\"MediaWiki";
my $mwapi="MediaWiki API documentation page";
my $mwapi2="API Home Page";         # found in an older version of the api

# Path fragments used when probing for the API entry point.
my $indexphp="index.php";
my $apiphp="api.php";
my $wapiphp="w\/api.php";
my $wikiapiphp="wiki\/api.php";
my $apiurl="";                      # candidate api.php URL (shared with the subs below)

my $live=0; my $dead=0;             # running totals for the final report
my $count=0;                        # URLs processed so far; replaces the old $a/$b/$c trio
                                    # ($a and $b also shadow Perl's sort variables)
my $flag=0;                         # set when the current URL is confirmed as MediaWiki

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0");          # use this user-agent to get into wikis that block spiders & robots
$ua->timeout(30);
$ua->show_progress(1);

open (MYURLLIST,"<$urllist")
	or die "Cannot open the URL-list file: $!";
open (ALIVEFILE,">$alivelist")
	or die "Cannot open $alivelist for writing: $!";    # was previously unchecked
open (DEADFILE,">$deadlist")
	or die "Cannot open $deadlist for writing: $!";     # was previously unchecked

while (<MYURLLIST>) {
	# Skip comment and blank lines.  The original pattern /\#(.*?)/ matched
	# a "#" ANYWHERE on the line, silently discarding any URL containing a
	# fragment (e.g. http://host/page#section); anchor it to the line start.
	next if (/^\s*#/) || (/^\s*$/);

	$url=$_;                        # global on purpose: the subroutines read $url
	chomp $url;

	$req = HTTP::Request->new(GET => $url);    # --|
	$req->header('Accept' => 'text/html');     #   |-- some of these lines
	$res = $ua->request($req);      # send request |-- were adapted from
	if ($res->is_success) {         # if the URL still exists   |-- lwpcook.pod
		print "Got it! ";
		$doc=$res->content;         # global: re-read by the subroutines
		print "Parsing the document... ";
		if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) {  # page contains "Powered by MediaWiki"
			print "It's alive and powered by Mediawiki\n"; # or "poweredby_mediawiki"
			$flag=1; $live++;       # then it's a MediaWiki wiki
			Check4api();            # was "& Check4api" (the &-call form passes @_)
		} elsif ($doc=~/$genmw/i) { # content generator is MediaWiki
			print "It's alive and powered by Mediawiki\n";
			$flag=1; $live++;
			Check4api();
		} elsif ($doc=~/$mwapi/i) { # api.php itself: "MediaWiki API documentation page"
			print "It's alive and powered by Mediawiki\n";
			print ALIVEFILE "$url\n";
			$flag=1; $live++;
		} elsif ($doc=~/$mwapi2/i) {# older api.php: "API Home Page"
			print "It's alive and powered by Mediawiki\n";
			print ALIVEFILE "$url\n";
			$flag=1; $live++;
		}
		unless ($flag) {            # alive, but no MediaWiki fingerprint found
			print "It's alive but NOT powered by MediaWiki\n";
			print DEADFILE "$url is alive but NOT powered by Mediawiki\n"; $dead++;
		}
		$flag=0;                    # reset for the next URL
	} else {
		$errormsg=$res->status_line;
		# NOTE(review): any status code not listed below (e.g. 503) is
		# silently dropped -- same behavior as the original; confirm intended.
		my $code = substr($errormsg,0,3);
		if ($code eq "500") {       # Internal Server Error
			print DEADFILE "$url\n"; $dead++;
		} elsif ($code eq "401") {  # Unauthorized
			print DEADFILE "$url\n"; $dead++;
		} elsif ($code eq "403") {  # Forbidden -- the site exists but blocks us
			print DEADFILE "$url is alive but access is denied.\n"; $dead++;
		} elsif ($code eq "404") {  # Not Found -- the URL is dead
			print DEADFILE "$url\n"; $dead++;
		} elsif ($code eq "406") {  # Not Acceptable
			print DEADFILE "$url\n"; $dead++;
		}
	}
	# Progress indicator: report every 10 URLs.  Replaces the original
	# three-counter arithmetic ($c++; $b=$c/10; if ($b==$a) {...; $a++}).
	if (++$count % 10 == 0) {
		print "Checked $count URLs -- ";
	}
	PauseRoutine();                 # be polite between requests
}

close DEADFILE; close ALIVEFILE; close MYURLLIST;
print "\nFinished! I found $live live wikis and $dead dead or non-Mediawiki wikis.\n";
|
2014-01-23 00:29:43 +00:00
|
|
|
|
2014-02-27 05:50:28 +00:00
|
|
|
# Here's the sub-routines
|
|
|
|
# =======================
|
|
|
|
# Check4api: given a page already confirmed as MediaWiki (globals $url/$doc),
# probe for a working api.php endpoint and write the best URL to ALIVEFILE.
# Probe order: <base>/api.php, <base>/w/api.php, <base>/wiki/api.php, then the
# same three against the bare scheme+host, then fall back to index.php.
# Communicates entirely through file-scoped globals: reads $url, $apiphp,
# $wapiphp, $wikiapiphp, $indexphp, $ua; writes $base_plus, $apiurl, $req,
# $res, $doc, $indexurl.
sub Check4api {
$pos=rindex($url,"\/"); # $pos will contain the position of the final "/" (counting from zero)
$base_plus=substr($url,0,($pos+1)); # $base_plus will contain everything up to & including the final "/"
my $len1=length($url); my $len2=length($base_plus); # NOTE(review): $len1 is never used
if ($len2 < 10) { # if $base_plus contains only "http://" or "https://"
$base_plus=$url; # then assign $url to $base_plus
my $tail=substr $base_plus, -1;
if (!($tail=~/\//)) { # if the last character of $base_plus is not a "/"
$base_plus=$base_plus."\/" ; # then add it
}
}
$apiurl=$base_plus.$apiphp; # $apiurl is our new URL with api.php tacked on the end
&PauseRoutine; & Fetch_api; # pause & then try to get api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{ # if no api.php...
$apiurl=$base_plus.$wapiphp; # modify the URL: try <base>/w/api.php
&PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{ # if no /w/api.php...
$apiurl=$base_plus.$wikiapiphp; # modify the URL: try <base>/wiki/api.php
&PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{
# All path-relative probes failed; retry against the bare host.
# NOTE(review): this match tests $_ (the caller's current input line),
# not $url -- it works because the while loop set $_ to this URL, but
# it is fragile; confirm before refactoring.
if (/https:\/\//) {
$scheme="https://";
} else {
$scheme="http://";
}
# NOTE(review): $url is overwritten with a Mojo::URL object here; later
# prints of "$url" rely on Mojo::URL's string overloading -- confirm intended.
$url = Mojo::URL->new($url);
$base = $url->host; # extract just the host from $url & assign it to $base
$base=$scheme.$base;
my $tail=substr $base, -1;
if (!($tail=~/\//)) { # if the last character of $base is not a "/"
$base=$base."\/" ; # then add it
}
$apiurl=$base.$apiphp; # $apiurl is our new URL with api.php tacked on the end
&PauseRoutine; & Fetch_api; # pause & then try to get api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{ # if no api.php...
$apiurl=$base.$wapiphp; # modify the URL: try <host>/w/api.php
&PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{ # if no /w/api.php...
$apiurl=$base.$wikiapiphp; # modify the URL: try <host>/wiki/api.php
&PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php
if ($res->is_success) {
print "Found api.php... "; $doc=$res->content;
&Parse_api;
}else{
# No api.php anywhere; last resort is index.php at the original base path.
if (!($url=~/index.php/i)) { # if the URL does not end with index.php...
print "There is no api.php -- I'll try index.php...\n";
# NOTE(review): uses $base_plus (original path) rather than the host-only
# $base built above -- looks inconsistent with this branch; confirm intended.
$indexurl=$base_plus.$indexphp; # then tack on index.php...
$req = HTTP::Request->new(GET => $indexurl); # and try to get it
$req->header('Accept' => 'text/html');
$res = $ua->request($req); # send request
if ($res->is_success) {
$doc=$res->content;
if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki"
print ALIVEFILE "$indexurl\n"; # or: "poweredby_mediawiki"
}elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki
print ALIVEFILE "$indexurl\n";
}else{
print "There is no api.php OR index.php for this URL\n";
print ALIVEFILE "$url\n"; # record the original URL anyway -- it IS a live wiki
}
}else{
print ALIVEFILE "$url\n"; # index.php fetch failed; keep the original URL
}
}else{
print "There is no api.php for this URL\n";
print ALIVEFILE "$url\n"; # URL already points at index.php; keep it
}
}
}
}
}
}
}
}
|
|
|
|
|
|
|
|
# Fetch_api: issue a GET for the candidate URL held in the global $apiurl.
# Results are handed back through the globals $req (the request object,
# kept for parity with the rest of the script) and $res (the HTTP::Response).
sub Fetch_api {
	my $get = HTTP::Request->new(GET => $apiurl);
	$get->header('Accept' => 'text/html');
	$req = $get;                 # keep the global in step, as before
	$res = $ua->request($req);   # send request
}
|
|
|
|
|
|
|
|
# Parse_api: decide whether the api.php page just fetched into $doc is a
# genuine MediaWiki API page; if so, record $apiurl in ALIVEFILE.  Otherwise
# fall back to <base>/index.php and record whichever URL checks out.
# Reads/writes the file-scoped globals $doc, $apiurl, $indexurl, $req, $res.
sub Parse_api {
	print "Parsing the document...\n";
	# Current or legacy API fingerprint -- both were handled identically.
	if ($doc=~/$mwapi/i or $doc=~/$mwapi2/i) {
		print "Found a valid api.php and writing it to the list\n";
		print ALIVEFILE "$apiurl\n";
		return;
	}
	# Not a real api.php; try index.php at the same base path instead.
	print "This api.php is not valid.\n";
	$indexurl=$base_plus.$indexphp;
	print "Trying to get $indexurl...\n";
	$req = HTTP::Request->new(GET => $indexurl);
	$req->header('Accept' => 'text/html');
	$res = $ua->request($req);   # send request
	return unless $res->is_success;   # fetch failed: nothing recorded (as before)
	$doc=$res->content;
	# "Powered by MediaWiki" / "poweredby_mediawiki" / generator meta tag --
	# all three branches printed the same lines, so they are merged here.
	if ($doc=~/$pwrdby1/i or $doc=~/$pwrdby2/i or $doc=~/$genmw/i) {
		print "Found a good index.php and writing it to the list\n";
		print ALIVEFILE "$indexurl\n";
	} else {
		print "There is no api.php OR index.php for $url\n";
		print ALIVEFILE "$url\n";
	}
}
|
|
|
|
|
|
|
|
# PauseRoutine: sleep between requests when a positive delay is configured
# in the file-scoped $slp; a zero/negative setting moves straight on.
sub PauseRoutine {
	return if $slp <= 0;         # no delay configured - go on to the next URL
	print "Pausing for $slp seconds...\n\n";
	sleep $slp;
}
|
|
|
|
|