Saya telah menyelesaikan proyek kecil selama dua hari terakhir yang terdiri dari pembuatan crawler di Perl.
Saya tidak punya pengalaman nyata di Perl (hanya apa yang saya pelajari dalam dua hari terakhir). Skrip saya adalah sebagai berikut:
ACTC.pm:
#!/usr/bin/perl
use strict;
use URI;
use URI::http;
use File::Basename;
use DBI;
use HTML::Parser;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);
package Crawler;
sub new {
my $class = shift;
my $self = {
_url => shift,
_max_link => 0,
_local => 1,
};
bless $self, $class;
return $self;
}
sub trim{
my( $self, $string ) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub process_image {
my ($self, $process_image) = @_;
$self->{_process_image} = $process_image;
}
sub local {
my ($self, $local) = @_;
$self->{_local} = $local;
}
sub max_link {
my ($self, $max_link) = @_;
$self->{_max_link} = $max_link;
}
sub x_more {
my ($self, $x_more) = @_;
$self->{_x_more} = $x_more;
}
sub resolve_href {
my ( $self, $base, $href) = @_;
my $u = URI->new_abs($href, $base);
return $u->canonical;
}
sub write {
my ( $self, $ref, $data ) = @_;
open FILE, '>c:/perlscripts/' . $ref . '_' . $self->{_process_image} . '.txt';
foreach( $data ) {
print FILE $self->trim($_) . "\n";
}
close( FILE );
}
sub scrape {
my ( @m_error_array, @m_href_array, @href_array, $dbh, $query, $result, $array );
my ( $self, $DBhost, $DBuser, $DBpass, $DBname ) = @_;
if( defined( $self->{_process_image} ) && ( -e 'c:/perlscripts/href_w_' . $self->{_process_image} . ".txt" ) ) {
open ERROR_W, "<c:/perlscripts/error_w_" . $self->{_process_image} . ".txt";
open M_HREF_W, "<c:/perlscripts/m_href_w_" . $self->{_process_image} . ".txt";
open HREF_W, "<c:/perlscripts/href_w_" . $self->{_process_image} . ".txt";
@m_error_array = <ERROR_W>;
@m_href_array = <M_HREF_W>;
@href_array = <HREF_W>;
close ( ERROR_W );
close ( M_HREF_W );
close ( HREF_W );
}else{
@href_array = ( $self->{_url} );
}
my $z = 0;
while( @href_array ){
if( defined( $self->{_x_more} ) && $z == $self->{_x_more} ) {
print "died";
last;
}
my $href = shift( @href_array );
if( defined( $self->{_process_image} ) && scalar @href_array ne 0 ) {
$self->write( 'm_href_w', @m_href_array );
$self->write( 'href_w', @href_array );
$self->write( 'error_w', @m_error_array );
}
$self->{_link_count} = scalar @m_href_array;
my $info = URI::http->new($href);
if( ! defined( $info->host ) ) {
push( @m_error_array, $href );
}else{
my $host = $info->host;
$host =~ s/^www\.//;
$self->{_current_page} = $href;
my $redirect_limit = 10;
my $y = 0;
my( $response, $responseCode );
while( 1 && $y le $redirect_limit ) {
$response = $ua->get($href);
$responseCode = $response->code;
if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
if( $responseCode == 301 || $responseCode == 302 ) {
$href = $self->resolve_href( $href, $response->header('Location') );
}else{
last;
}
}else{
last;
}
$y++;
}
if( $y != $redirect_limit && $responseCode == 200 ) {
print $href . "\n";
if( ! defined( $self->{_url_list} ) ) {
my @url_list = ( $href );
}else{
my @url_list = $self->{_url_list};
push( @url_list, $href );
$self->{_url_list} = @url_list;
}
my $DNS = "dbi:mysql:$DBname:$DBhost:3306";
$dbh = DBI->connect($DNS, $DBuser, $DBpass ) or die $DBI::errstr;
$result = $dbh->prepare("INSERT INTO `". $host ."` (URL) VALUES ('$href')");
if( ! $result->execute() ){
$result = $dbh->prepare("CREATE TABLE `". $host ."` ( `ID` INT( 255 ) NOT NULL AUTO_INCREMENT , `URL` VARCHAR( 255 ) NOT NULL , PRIMARY KEY ( `ID` )) ENGINE = MYISAM ;");
$result->execute();
print "Host added: " . $host . "\n";
}
my $content = $response->content;
die "get failed: " . $href if (!defined $content);
my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g );
foreach( @pageLinksArray ) {
my $link = $self->trim($_);
if( $self->{_max_link} != 0 && scalar @m_href_array > $self->{_max_link} ) {
last;
}
my $new_href = $self->resolve_href( $href, $link );
if( $new_href =~ m/^http:\/\// ) {
if( substr( $new_href, -1 ) ne "#" ) {
my $base = $self->{_url};
my %values_index;
@values_index{@m_href_array} = ();
if( ! $new_href =~ m/$base/ ) {
if( $self->{_local} eq "true" && ! exists $values_index{$new_href} ) {
push( @m_href_array, $new_href );
push( @href_array, $new_href );
}
}elsif( $self->{_local} eq "true" && ! exists $values_index{$new_href} ) {
push( @m_href_array, $new_href );
push( @href_array, $new_href );
}
}
}
}
}else{
push( @m_error_array, $href );
}
}
}
}
1;
laba-laba_baru.pl:
#!/usr/bin/perl
use strict;
use warnings;
use ACTC;
my ($object, $url, $uri);
print "Starting Poing: (url): ";
chomp($url = <>);
$object = new Crawler( $url );
$object->process_image('process_image_name');
$object->local('true');
$object->max_link(0);
$object->x_more(9999999);
$object->scrape( 'localhost', 'root', '', 'crawl' );
#print $object->{_url} . "\n";
#print $object->{_process_image};
Sekarang belum lengkap beberapa fungsinya tidak berfungsi dengan benar tetapi setelah menjalankan skrip saya telah mengindeks 1500 halaman dalam waktu sekitar satu jam yang menurut saya cukup lambat.
Script mulai memeriksa hasilnya tetapi sekarang cukup lamban mengeluarkan satu url setiap detik.
Adakah yang bisa memberikan tip tentang cara meningkatkan kinerja?