use Mojo::UserAgent;
use Bloom::Filter;
use Smart::Comments;
use DBI;
# Strictures are enabled from here on; every variable below is a file-scoped
# lexical that the response handler and the start-up code further down use.
use strict;

# MySQL connection settings.
my $dbname   = "bbs_url";
my $location = "localhost";
my $port     = "3306";
my $database = "DBI:mysql:$dbname:$location:$port";
my $db_user  = "root";
my $db_pass  = "toor";

# Stop right away with the driver's message when the connection fails,
# instead of failing later with a method call on an undefined handle.
my $dbh = DBI->connect($database, $db_user, $db_pass)
    or die "cannot connect to $database: $DBI::errstr\n";

# Responses whose depth is greater than this are not scanned for links.
my $dept_level = 4;

# Start URL: first command-line argument, or the built-in default.
my $baseUrl = Mojo::URL->new($ARGV[0] || 'http://bbs.xxxxx.cn/');

# Site the crawl is restricted to: the start URL's host without a leading
# "www.". Taking it from the parsed URL (rather than pattern-matching the
# URL string) also works for https and leaves out any port or user info.
my $base_host = $baseUrl->host;
die "start URL '$baseUrl' has no host; pass an absolute http(s) URL\n"
    if !defined $base_host || $base_host eq '';
(my $domain = $base_host) =~ s/\Awww\.//i;

# Set of URLs that have already been queued.
# NOTE(review): capacity presumably has to cover every URL a crawl can
# discover - confirm how Bloom::Filter behaves once it is exceeded.
my $filter = Bloom::Filter->new(capacity => 100000, error_rate => 0.0001);

my $ua = Mojo::UserAgent->new(max_redirects => 5);

# Table that receives the discovered URLs.
my $name = "xxxxx";
my $query = "CREATE TABLE $name("
    . " `No` int(100) NOT NULL auto_increment,"
    . " `depth` int(10) NOT NULL,"
    . " `Url` text NOT NULL, PRIMARY KEY (`No`) "
    . ") ENGINE=MyISAM DEFAULT CHARSET=utf8;";

# A failed CREATE TABLE is fatal. NOTE(review): this presumably includes the
# case where the table is left over from an earlier run - confirm whether
# re-runs are expected to drop it first.
my $sth = $dbh->prepare($query)
    or die "cannot prepare CREATE TABLE $name: " . $dbh->errstr . "\n";
$sth->execute()
    or die "cannot create table $name: " . $sth->errstr . "\n";
# Response handler shared by every request of the crawl.
#
# Called by Mojo::UserAgent as $callback->($ua, $tx) when a non-blocking
# request has finished. It reads the crawl depth of the page from the
# request's "dept" header, extracts every <a href> link, keeps the http(s)
# links that belong to the crawled site and do not point at media/archive
# files, stores newly seen links found at depth 3-5 in the database, and
# queues each newly seen link with this same handler at depth + 1.
#
# Uses the file-level $dept_level, $domain, $filter, $dbh and $name.
my $callback;
$callback = sub {
    # Warnings are enabled for this handler only, so this block does not
    # depend on the rest of the file being warning-clean.
    use warnings;

    my ($ua, $tx) = @_;

    # Nothing to parse when the request failed.
    # NOTE(review): `success` here and `attrs` below are the method names of
    # the Mojolicious release this script was written for; later releases
    # are understood to have replaced them - confirm against the installed
    # version before upgrading.
    return if !$tx->success;

    # The depth of a page travels in the "dept" header of its own request.
    my $dept = $tx->req->headers->header('dept') // 0;
    return if $dept > $dept_level;
    ++$dept;

    my $page_url = $tx->req->url;

    # Hosts are accepted when they equal the crawl domain or are a
    # sub-domain of it. Falls back to the current page's host if the
    # file-level $domain was never set.
    my $site = defined $domain ? $domain : ($page_url->host // '');

    $tx->res->dom->find("a[href]")->each(sub {
        my $attrs = shift->attrs;
        my $href  = $attrs->{href};
        return if !defined $href || $href eq '';

        # Resolve the link against the page it was found on, so relative
        # paths and non-default ports are handled, then drop the fragment
        # because it never identifies a different document.
        my $newUrl = Mojo::URL->new($href)->to_abs($page_url);
        $newUrl->fragment(undef);

        # `return` leaves this per-link sub and moves on to the next link;
        # a loop-control statement here would only work by reaching into
        # the loop inside each().
        my $scheme = lc($newUrl->scheme // '');
        return if $scheme ne 'http' && $scheme ne 'https';

        # \Q...\E keeps the dots of the domain literal; the anchors stop
        # look-alike hosts that merely contain the domain text.
        my $host = $newUrl->host // '';
        return if $host !~ m{(?:\A|\.)\Q$site\E\z}i;

        # Skip links to images, audio/video, archives and PDFs.
        return
            if $newUrl->path
            =~ /\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)\z/i;

        # Only URLs not seen before are recorded and queued.
        my $url_key = $newUrl->to_string;
        return if $filter->check($url_key);

        # Progress line each time the number of known URLs is a multiple
        # of 1000.
        my $known = $filter->key_count();
        print $known, " $dept ", $url_key, "\n" if $known % 1000 == 0;

        if ($dept == 3 || $dept == 4 || $dept == 5) {
            # Bind the values instead of pasting them into the statement:
            # the URL comes from a fetched page and may contain quotes.
            $dbh->do("insert into $name(depth,Url) values(?,?)",
                undef, $dept, $url_key);
        }

        $filter->add($url_key);
        $ua->get($newUrl => { dept => $dept } => $callback);
    });
};
# Queue the start URL at depth 1, then run the event loop so that the
# non-blocking requests (and the follow-up requests the handler queues)
# are actually performed.
$ua->get($baseUrl => { dept => 1 } => $callback);
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

# Reached once the event loop has returned: release the database connection
# explicitly instead of leaving it to process teardown.
$dbh->disconnect if $dbh;
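# (Web-page footer captured together with the script - originally "Please leave a comment" - not part of the program.)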