diff --git a/listsofwikis/checkalive.pl b/listsofwikis/checkalive.pl
index 8866867..f662196 100644
--- a/listsofwikis/checkalive.pl
+++ b/listsofwikis/checkalive.pl
@@ -1,117 +1,247 @@
 #!/usr/bin/perl
-# Name: checkalive.pl v1.2
-# Description: This script will go thru a list of URLs & determine
-# if they are online & if they are Mediawiki wikis. It should work
-# with: "/index.php/Main_Page", "index.php", "api.php" and even pages
-# such as: "/wiki/Pagina_principale".
+# Name: checkalive.pl v2.0
+# Description: This script will go thru a list of URLs & determine if they are online & if they are Mediawiki wikis.
+# It should work with: "/index.php/Main_Page", "index.php", "api.php" and even pages such as: "/wiki/Pagina_principale".
+# If the URL is not "api.php", the script will look for api.php, check it, and output it if it is found to be valid. If
+# it is not found, the script will output the URL with "index.php" if that's available.
 #
 # Created: 12/14/2013
-# Most recently updated: 01/26/2014 (It's a work-in-progress...)
+# Most recently updated: 02/25/2014
 # Copyright (c) 2013-2014 by Scott D. Boyd - scottdb56@gmail.com
-# ====================================================================
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
+# ===========================================================================================================================
+# This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 #
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-# ====================================================================
+# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 #
-# NOTE: The following three Perl modules need to be installed on your computer.
-# Search for them on cpan.org or use your Linux distro's package manager.
+# You should have received a copy of the GNU General Public License along with this program. If not,
+# see <http://www.gnu.org/licenses/>.
+# ===========================================================================================================================
+#
+# NOTE: The following four Perl modules need to be installed on your computer.
+# Search for them on cpan.org or use your Linux distro's package manager.
 use LWP::Simple;
 use LWP::UserAgent;
 use Crypt::SSLeay;
-my $slp=2;          # You can change this number for seconds to sleep between requests (currently 2 seconds)
-my $urllist="my-URL-list.txt";
+use Mojo::URL;
+my $slp=2;              # You can change this number for seconds to sleep between requests (currently 2 seconds)
+my $urllist="URL-list.txt";
 my $alivelist="alive-wikis.txt";
 my $deadlist="dead-wikis.txt";
 my $pwrdby1="Powered by MediaWiki";
 my $pwrdby2="poweredby_mediawiki";
 my $genmw="meta name\=\"generator\" content\=\"MediaWiki";
 my $mwapi="MediaWiki API documentation page";
-my $lw=0; my $dw=0;
+my $mwapi2="API Home Page";     # found in an older version of the api
+my $indexphp="index.php";
+my $apiphp="api.php";
+my $wapiphp="w\/api.php";
+my $wikiapiphp="wiki\/api.php";
+my $apiurl="";
+my $live=0; my $dead=0;
 my $a=1; my $b=0; my $c=0; my $flag=0;
 my $ua = LWP::UserAgent->new;
-$ua->agent("Mozilla/5.0");      # use this user-agent to get into wikis that block spiders & robots
+$ua->agent("Mozilla/5.0");          # use this user-agent to get into wikis that block spiders & robots
 $ua->timeout(30);
 $ua->show_progress(1);
-# Here's where most of the work takes place:
 open (MYURLLIST,"<$urllist") or die "Cannot open the URL-list file: $!";
 open (ALIVEFILE,">$alivelist");
 open (DEADFILE,">$deadlist");
 while (<MYURLLIST>) {
-    if ((/\#(.*?)/) || (/^\s*$/)) {     # check to see if line is a comment or a blank line
-        next;                           # if so - skip it
+    if ((/\#(.*?)/) || (/^\s*$/)) {         # check to see if line is a comment or a blank line
+        next;                               # if so - skip it
     } else {
-        $url=$_;                        # assign the current line to $url
+        $url=$_;                            # assign the current line to $url
         chomp $url;
-        $req = HTTP::Request->new(GET => $url);     #                              --|
-        $req->header('Accept' => 'text/html');      #                                |-- some of these lines
-        $res = $ua->request($req);                  # send request                   |-- were adapted from
-        if ($res->is_success) {                     # if the URL still exists        |-- lwpcook.pod
-            print "Got it! ";                       #                                |
-            $doc=$res->content;                     #                                |
-            &ParsePage;                             # go to "ParsePage" sub-routine  |
-        } else {                                    #                                |
-            $errormsg=$res->status_line;            #                              --|
+        $req = HTTP::Request->new(GET => $url);     #                              --|
+        $req->header('Accept' => 'text/html');      #                                |-- some of these lines
+        $res = $ua->request($req);                  # send request                   |-- were adapted from
+        if ($res->is_success) {                     # if the URL still exists        |-- lwpcook.pod
+            print "Got it! ";                       #                                |
+            $doc=$res->content;                     #                                |
+            print "Parsing the document... \n";
"; + if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" + print "It's alive and powered by Mediawiki\n"; # or: "poweredby_mediawiki" + $flag=1;$live++; # then it's a MediaWiki wiki + & Check4api; + } elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki + print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki + $flag=1;$live++; + & Check4api; + } elsif ($doc=~/$mwapi/i) { # if the api.php contains: "MediaWiki API documentation page" + print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki + print ALIVEFILE "$url\n"; + $flag=1;$live++; + } elsif ($doc=~/$mwapi2/i) { # if the api.php contains: "API Home Page" (older version) + print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki + print ALIVEFILE "$url\n"; + $flag=1;$live++; + } + unless ($flag) { + print "It's alive but NOT powered by MediaWiki\n"; + print DEADFILE "$url is alive but NOT powered by Mediawiki\n"; $dead++; + } + $flag=0; + } else { + $errormsg=$res->status_line; if (substr($errormsg,0,3) eq "500") { # if response-code 500 - print DEADFILE "$url\n"; $dw++; + print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "401") { # if Unauthorized (code 401) - print DEADFILE "$url\n"; $dw++; + print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "403") { # if forbidden (code 403) - print DEADFILE "$url is alive but access is denied.\n"; $dw++; + print DEADFILE "$url is alive but access is denied.\n"; $dead++; } elsif (substr($errormsg,0,3) eq "404") { # if URL is dead (code 404) - print DEADFILE "$url\n"; $dw++; + print DEADFILE "$url\n"; $dead++; } elsif (substr($errormsg,0,3) eq "406") { # if Not Acceptable (code 406) - print DEADFILE "$url\n"; $dw++; + print DEADFILE "$url\n"; $dead++; } } $c++; $b=$c/10; if ($b==$a) { - print "Checked $c URLs -- "; # print the progress every 10 URLs + print "Checked $c URLs -- "; # print the progress every 10 URLs $a++; } - if ($slp > 0) { - print "Pausing for $slp seconds...\n\n"; sleep $slp; - } else { # don't pause - go on to the next URL - } + &PauseRoutine; } } close DEADFILE; close ALIVEFILE; close MYURLLIST; -print "\nFinished! I found $lw live wikis and $dw dead or non-Mediawiki wikis.\n"; +print "\nFinished! I found $live live wikis and $dead dead or non-Mediawiki wikis.\n"; -# Here's the sub-routine -# ====================== -sub ParsePage { - print "Parsing the document... 
"; - if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) { # if the page contains: "Powered by MediaWiki" - print "It's alive and powered by Mediawiki\n"; # or: "poweredby_mediawiki" - print ALIVEFILE "$url\n"; # then it's a MediaWiki wiki - $flag=1;$lw++; - } elsif ($doc=~/$genmw/i) { # if the content generator is MediaWiki - print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki - print ALIVEFILE "$url\n"; - $flag=1;$lw++; - } elsif ($doc=~/$mwapi/i) { # if the api.php contains: "MediaWiki API documentation page" - print "It's alive and powered by Mediawiki\n"; # then it's a MediaWiki wiki - print ALIVEFILE "$url\n"; - $flag=1;$lw++; +# Here's the sub-routines +# ======================= +sub Check4api { + $pos=rindex($url,"\/"); # $pos will contain the position of the final "/" (counting from zero) + $base_plus=substr($url,0,($pos+1)); # $base_plus will contain everything up to & including the final "/" + my $len1=length($url); my $len2=length($base_plus); + if ($len2 < 10) { # if $base_plus contains only "http://" or "https://" + $base_plus=$url; # then assign $url to $base_plus + my $tail=substr $base_plus, -1; + if (!($tail=~/\//)) { # if the last character of $base_plus is not a "/" + $base_plus=$base_plus."\/" ; # then add it + } } - unless ($flag) { - print "It's alive but NOT powered by MediaWiki\n"; - print DEADFILE "$url is alive but NOT powered by Mediawiki\n"; $dw++; + $apiurl=$base_plus.$apiphp; # $apiurl is our new URL with api.php tacked on the end + &PauseRoutine; & Fetch_api; # pause & then try to get api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ # if no api.php... + $apiurl=$base_plus.$wapiphp; # modify the URL + &PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ # if no /w/api.php... + $apiurl=$base_plus.$wikiapiphp; # modify the URL + &PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ + if (/https:\/\//) { + $scheme="https://"; + } else { + $scheme="http://"; + } + $url = Mojo::URL->new($url); + $base = $url->host; # extract just the host from $url & assign it to $base + $base=$scheme.$base; + my $tail=substr $base, -1; + if (!($tail=~/\//)) { # if the last character of $base_plus is not a "/" + $base=$base."\/" ; # then add it + } + $apiurl=$base.$apiphp; # $apiurl is our new URL with api.php tacked on the end + &PauseRoutine; & Fetch_api; # pause & then try to get api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ # if no api.php... + $apiurl=$base.$wapiphp; # modify the URL + &PauseRoutine; & Fetch_api; # pause & then try to get /w/api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ # if no /w/api.php... + $apiurl=$base.$wikiapiphp; # modify the URL + &PauseRoutine; & Fetch_api; # pause & then try to get /wiki/api.php + if ($res->is_success) { + print "Found api.php... "; $doc=$res->content; + &Parse_api; + }else{ + if (!($url=~/index.php/i)) { # if the URL does not end with index.php... + print "There is no api.php -- I'll try index.php...\n"; + $indexurl=$base_plus.$indexphp; # then tack on index.php... 
+                                $req = HTTP::Request->new(GET => $indexurl);    # and try to get it
+                                $req->header('Accept' => 'text/html');
+                                $res = $ua->request($req);          # send request
+                                if ($res->is_success) {
+                                    $doc=$res->content;
+                                    if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) {   # if the page contains: "Powered by MediaWiki"
+                                        print ALIVEFILE "$indexurl\n";                  # or: "poweredby_mediawiki"
+                                    }elsif ($doc=~/$genmw/i) {      # if the content generator is MediaWiki
+                                        print ALIVEFILE "$indexurl\n";
+                                    }else{
+                                        print "There is no api.php OR index.php for $url\n";
+                                        print ALIVEFILE "$url\n";
+                                    }
+                                }else{
+                                    print ALIVEFILE "$url\n";
+                                }
+                            }else{
+                                print "There is no api.php for $url\n";
+                                print ALIVEFILE "$url\n";
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
-    $flag=0;
+}
+
+sub Fetch_api {
+    $req = HTTP::Request->new(GET => $apiurl);
+    $req->header('Accept' => 'text/html');
+    $res = $ua->request($req);              # send request
+}
+
+sub Parse_api {
+    print "Parsing the document...\n ";
+    if ($doc=~/$mwapi/i) {                  # if the api.php contains: "MediaWiki API documentation page"
+        print ALIVEFILE "$apiurl\n";        # then it's a MediaWiki wiki - print it to the list
+    }elsif ($doc=~/$mwapi2/i) {             # if the api.php contains: "API Home Page" (older version)
+        print "Found a valid api.php and writing it to the list\n";    # delete this line after testing
+        print ALIVEFILE "$apiurl\n";        # then it's a MediaWiki wiki - print it to the list
+    }else{
+        print "This api.php is not valid.\n";   # then try to get index.php
+        $indexurl=$base_plus.$indexphp;
+        print "Trying to get $indexurl...\n";
+        $req = HTTP::Request->new(GET => $indexurl);
+        $req->header('Accept' => 'text/html');
+        $res = $ua->request($req);          # send request
+        if ($res->is_success) {
+            $doc=$res->content;
+            if (($doc=~/$pwrdby1/i) || ($doc=~/$pwrdby2/i)) {   # if the page contains: "Powered by MediaWiki"
+                print ALIVEFILE "$indexurl\n";                  # or: "poweredby_mediawiki"
+            }elsif ($doc=~/$genmw/i) {      # if the content generator is MediaWiki
+                print ALIVEFILE "$indexurl\n";
+            }else{
+                print "There is no api.php OR index.php for $url\n";
+                print ALIVEFILE "$url\n";
+            }
+        }
+    }
+}
+
+sub PauseRoutine {
+    if ($slp > 0) {
+        print "Pausing for $slp seconds...\n\n"; sleep $slp;
+    } else { }                              # don't pause - go on to the next URL
+}
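Check4api above hand-unrolls a fixed search order: api.php, w/api.php and wiki/api.php relative to the page's directory, then the same three candidates relative to the bare host, before falling back to index.php. The standalone sketch below shows that same search order in loop form; it is only an illustration of the patch's logic, and probe_api(), find_api() and the example URL are made-up names, not part of the script:

#!/usr/bin/perl
# Standalone sketch of the search order Check4api implements.
# probe_api(), find_api() and the example URL are illustrative only.
use strict;
use warnings;
use LWP::UserAgent;
use Mojo::URL;

my $ua = LWP::UserAgent->new(agent => "Mozilla/5.0", timeout => 30);

sub probe_api {                                 # does $candidate answer like a MediaWiki api.php?
    my ($candidate) = @_;
    my $res = $ua->get($candidate);
    return 0 unless $res->is_success;
    my $doc = $res->content;
    return ($doc =~ /MediaWiki API documentation page/i
         || $doc =~ /API Home Page/i);          # marker used by older MediaWiki releases
}

sub find_api {
    my ($pageurl) = @_;
    (my $dir = $pageurl) =~ s{[^/]*$}{};        # everything up to & including the final "/"
    my $u    = Mojo::URL->new($pageurl);
    my $root = $u->scheme . "://" . $u->host . "/";
    for my $base ($dir, $root) {                # page directory first, then bare host
        for my $tail ("api.php", "w/api.php", "wiki/api.php") {
            my $candidate = $base . $tail;
            return $candidate if probe_api($candidate);
            sleep 2;                            # be polite between requests
        }
    }
    return undef;                               # caller falls back to index.php
}

my $api = find_api("http://example.org/wiki/Pagina_principale");
print defined $api ? "Found: $api\n" : "No api.php found\n";

Unlike Check4api, which writes results to ALIVEFILE as a side effect, the sketch simply returns the first candidate whose response carries one of the two API markers ($mwapi or $mwapi2 in the script).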
diff --git a/listsofwikis/readme-checkalive.txt b/listsofwikis/readme-checkalive.txt
index c78af75..29e10b5 100644
--- a/listsofwikis/readme-checkalive.txt
+++ b/listsofwikis/readme-checkalive.txt
@@ -1,9 +1,11 @@
 Description
 -----------
-checkalive.pl is a Perl script that will go thru a list of URLs & determine if
-they are online and if they are Mediawiki wikis. It should work with
-"index.php/Main_Page", "index.php" and "api.php". As of 01/23/2014, I have
-started using version numbers.
+checkalive.pl is a Perl script that will go thru a list of URLs & determine if they are
+online & if they are Mediawiki wikis. It should work with: "/index.php/Main_Page",
+"index.php", "api.php" and even pages such as: "/wiki/Pagina_principale". If the URL is
+not "api.php", the script will look for api.php, check it, and output it if it is found
+to be valid. If it is not found, the script will output the URL with "index.php" if
+that's available. As of 01/23/2014, I have started using version numbers.
 
 Required programs and modules
 -----------------------------
@@ -12,10 +14,13 @@ be on your system. You will also need to have the following Perl modules install
 LWP::Simple
 LWP::UserAgent
 Crypt::SSLeay
+Mojo::URL
 The first two are contained in LWP - The World-Wide Web library for Perl
-(aka: libwww-perl-6.x), available at CPAN, or through your Linux distro's package manager.
+(aka: libwww-perl-6.x), available at CPAN (http://www.cpan.org), or through your Linux
+distro's package manager.
 Crypt::SSLeay (OpenSSL support for LWP) is also available at CPAN. This module is needed
 to properly handle any URLs beginning with "https".
+Mojo::URL is available at CPAN as well. It's needed to extract the domain name from a URL.
 
 Configuration
 -------------
@@ -31,6 +36,12 @@ There are several variables you can change, or you can just use them as-is:
    and will be noted as such. Any other variable that you want to change - you do so at
    your own risk.
 
+Starting the script
+-------------------
+If you want to use the default configuration noted above, at a command prompt, simply
+type: "perl checkalive.pl" (without the quotes). You must be in the same directory (or
+folder) as the script and the URL list that you want to check.
+
 Issues
 ------
 The script does NOT have a "resume" feature at this time. If you are running through a
@@ -38,8 +49,3 @@ list of 1000's of URLs, and the script crashes, or you kill it, your lists of al
 dead URLs will NOT BE SAVED TO DISK. I suggest breaking up your list into smaller lists
 of a few hundred URLs in each list until I can implement a resume feature.
 
-The LWP library does transparent redirect handling, so I can't capture the new URL that
-is displayed on screen as the script is running. Therefore, any of the URLs that get
-redirected to a new URL will have the original URL saved to the appropriate list(whether
-it's dead or alive).
-
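For reference, the page-level test that both the script and the readme describe comes down to three markers: the "Powered by MediaWiki" footer text, the poweredby_mediawiki footer id, and the MediaWiki generator meta tag ($pwrdby1, $pwrdby2 and $genmw in the script). Here is a minimal self-contained sketch of that test; the sub name is_mediawiki_page() and the sample HTML line are illustrative, not part of the script:

#!/usr/bin/perl
# Minimal sketch of the three page-level MediaWiki markers checkalive.pl looks for.
# is_mediawiki_page() and the sample HTML are illustrative only.
use strict;
use warnings;

sub is_mediawiki_page {
    my ($html) = @_;
    return 1 if $html =~ /Powered by MediaWiki/i;                      # footer text
    return 1 if $html =~ /poweredby_mediawiki/i;                       # footer image id
    return 1 if $html =~ /meta name="generator" content="MediaWiki/i;  # generator meta tag
    return 0;
}

my $sample = '<meta name="generator" content="MediaWiki 1.22.2"/>';
print is_mediawiki_page($sample) ? "looks like MediaWiki\n" : "not MediaWiki\n";

api.php responses are judged separately, against "MediaWiki API documentation page" or, for older MediaWiki releases, "API Home Page" ($mwapi and $mwapi2 in the script).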