I've made a Perl-based bot that searches the net for e-mail addresses (a.k.a. a spam bot). I couldn't help but grin when I saw that your e-mail address was "fravia(at)linuxmail(point)org". — Kevin Jobson
#!/usr/bin/perl
use LWP::UserAgent;
require
"subparseform.lib";
# Read the request parameters and start the HTML response.
# Parse_Form comes from subparseform.lib (not shown here); it presumably
# populates %formdata from the submitted form -- verify against that library.
Parse_Form();
$varurl  = $formdata{'varurl'};      # permanent base URL of the site (e.g. www.infinit.com)
$baseurl = $formdata{'base_url'};    # URL of the page to fetch (e.g. www.infinit.com/blabla/encorebla/)
$newurl  = $formdata{'newurl'};      # flag: 1 when a new URL was submitted
print "Content-type: text/html\n\n";
print "<HTML><BODY>";
# A newly submitted URL also becomes the permanent base URL: $varurl is what
# the link rewriting and link matching further down are keyed on.
if ($newurl == 1)
{
    $varurl = $baseurl;
}
# Not a new URL: pick the first queued URL (data/url.txt) that has not been
# checked yet (data/verifurl.txt).
# On success $continue is 1 and $baseurl holds the chosen queue line, left
# exactly as read (line ending included). When every queued URL has already
# been checked -- or the queue is empty -- the search is reported complete and
# the script exits, instead of reloading itself forever.
if ($newurl != 1)
{
    # Load the checked URLs once rather than rescanning the file per candidate.
    open(my $checked_fh, '<', 'data/verifurl.txt') or ErrorMsg();
    my %checked;
    while (defined(my $line = readline $checked_fh))
    {
        $line =~ s/\s+\z//;    # compare without line endings
        $checked{$line} = 1;
    }
    close($checked_fh);

    open(my $queue_fh, '<', 'data/url.txt') or ErrorMsg();
    $continue = 0;
    # The $continue test comes first so that, once a candidate is accepted,
    # no further line is read and $baseurl keeps the accepted URL.
    while ($continue == 0 && defined($baseurl = readline $queue_fh))
    {
        my $candidate = $baseurl;
        $candidate =~ s/\s+\z//;
        next if $candidate eq '';    # ignore blank lines in the queue
        $continue = 1 unless $checked{$candidate};
    }
    close($queue_fh);

    # Every line was examined (the last one included) and nothing is left to do.
    if ($continue == 0)
    {
        print "<center><h1>RECHERCHE COMPLETÉ</h1></center>";
        exit;
    }
}
if (($continue == 1) || ($newurl == 1))
### MAIN LOOP ###
{
    # Crawl one page: fetch $baseurl, queue the links found under $varurl,
    # record the e-mail addresses on the page, then mark the URL as checked.

    # Minimal HTML escaping for values echoed into the page. The URLs come
    # from form input or from fetched pages and must not inject markup.
    my $escape_html = sub
    {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };

    # A URL read from the queue file still carries its line ending, while one
    # taken from the form does not; work on a trimmed copy so both cases
    # behave the same.
    my $page_url = defined $baseurl ? $baseurl : '';
    $page_url =~ s/\s+\z//;

    ############### SEARCH URLS ######################
    my $ua   = LWP::UserAgent->new;
    my $req  = HTTP::Request->new('GET', $page_url);
    my $resp = $ua->request($req);

    # Keep the whole response (status line, headers and body) as one string.
    my $response_http = $resp->as_string();
    #print "$response_http";

    # Replace relative links (a href=/fr) with links that start with $varurl.
    # NOTE(review): the leading "/" is consumed by the match, so this
    # presumably expects $varurl to end with "/" -- confirm.
    $response_http =~ s/<a href=.?\//<a href="$varurl/gi;

    print "<h1>BaseURL: " . $escape_html->($page_url) . " <BR>file length: ";
    print length $response_http;
    print "<br></h1>";

    if ($resp->is_success)
    {
        # Search for links under $varurl. \Q...\E matches the base URL
        # literally: it is form input, so "." or "?" in it must not act as
        # regex operators, and the following "[" must not be read as a
        # subscript on the variable.
        my @http;
        while ($response_http =~ m{(\Q$varurl\E[a-zA-Z0-9:./%?=&]+)}gi)
        {
            my $link = $1;
            # Keep only links that look like pages or scripts.
            if ($link =~ m/\.(?:php|htm|shtm|asp|cgi)|\?/i)
            {
                print "FOUND: " . $escape_html->($link) . " <br>";
                push(@http, $link);
            }
        }

        open(my $queue_fh, '>>', 'data/url.txt') or ErrorMsg();    # append to data folder
        foreach my $url (@http)
        {
            print "URL AJOUTÉ: " . $escape_html->($url) . "<br>";
            print $queue_fh "$url\n";
        }
        close($queue_fh);

        ################### SEARCH EMAILS #########################
        # Scan the unmodified response for something@something.something.
        my $response_email = $resp->as_string();
        my @email;
        while ($response_email =~ m/(\w+\@\w+\.\w+)/g)
        {
            push(@email, $1);
        }
        print "<br><h3>@email</h3>";    # show the e-mail addresses that were found

        open(my $email_fh, '>>', 'data/email.txt') or ErrorMsg();    # append to data folder
        foreach my $single_email (@email)
        {
            print $email_fh "$single_email\n";
        }
        close($email_fh);
    }
    else
    {
        # Do not mine an error response for links or addresses.
        print "FETCH FAILED: " . $escape_html->($resp->status_line) . "<br>";
    }

    ############## WRITE CHECKED URL TO FILE ##############
    # Recorded even when the fetch failed, so a dead URL is not retried on
    # every reload. Always newline-terminated so entries never run together.
    open(my $checked_fh, '>>', 'data/verifurl.txt') or ErrorMsg();    # append to data folder
    print $checked_fh "$page_url\n";
    close($checked_fh);
} #### END IF ####
# Ask the browser to reload the spider at once so the next queued URL gets
# processed; the permanent base URL is carried along in the query string.
# Characters that could break out of the attribute or the query string are
# percent-encoded, while ordinary URL characters (letters, digits, ":", "/",
# ".", "-", "_", "~") pass through unchanged.
# NOTE(review): presumably Parse_Form percent-decodes field values -- verify
# against subparseform.lib.
my $next_base = defined $varurl ? $varurl : '';
$next_base =~ s/([^A-Za-z0-9\-._~:\/])/sprintf('%%%02X', ord($1))/ge;
print "<meta http-equiv=\"refresh\" content=\"0; URL=spider.cgi?newurl=0&varurl=$next_base\">";    # reload page
print "</BODY></HTML>";
####################### SUB #####################
# Abort the request after a file could not be opened.
# Prints the operating-system error text held in $! to the page and then
# terminates the script; this sub never returns to its caller.
sub ErrorMsg
{
    my $os_error = "$!";    # capture the error text before anything can reset it
    print "Server can't open file : ", $os_error;
    exit;
}