#!/usr/bin/perl -w use strict; use carp; use LWP::UserAgent; use URI::URL; use HTML::Parse qw(parse_html); # Ecriture du log d'erreur BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>/log_erreur.txt") or die "Impossible d'ouvrir : $!\n"; carpout(*LOG); } my $VERSION = "Bot/1.01"; $| = 1; sub spider (%); spider URL => '$url'; sub spider (%) { my %args = @_; my @startlinks = ("http://www.free.fr"); @startlinks = delete_doublon(@startlinks); push(@startlinks, $args{URL}); # Call Lwp method my $ua = LWP::UserAgent->new; $ua->agent('Mozilla/5.0 (compatible;)'); WORKLOOP: while (my $link = shift @startlinks) { for (my $i = 0; $i< $#startlinks; $i++) { next WORKLOOP if $link eq $startlinks[$i]; } print ">>>>> working on $link\n"; #------------------------------------------------# # Nettoyage de $link et suppréssion des doublons # #------------------------------------------------# my @links = $link; @links = delete_doublon(@links); @links = grep { $_ ne "\$url" } @links; @links = explore(@links); print "$_\n" foreach @links; #-------------------------------------------------------------------------------- #------------------------------------------------# # Exploration des liens (sans doublon) :) # #------------------------------------------------# sub explore { my $ua = new LWP::UserAgent; $ua->agent('Mozilla/5.0 (compatible;)'); my($url, %saw, @urls); foreach $url ( @_ ) { my $res = $ua->request(HTTP::Request->new(GET => $url)); =pod unless ($res->is_success) { warn "$0: Bad URL: $url\n"; next; } =cut my $ht_tree = parse_html($res->content); my $base = $res->base; my($linkpair, $fqurl); foreach $linkpair (@{$ht_tree->extract_links(qw)}) { my($link,$elem) = @$linkpair; push(@urls, $fqurl) unless $saw{ $fqurl = url($link,$base)->abs->as_string }++; } } =pod push @urls, print join("\n", @urls), "\n"; push @urls, return @urls; use next line for uniq and sorted urls print join("\n", sort keys %saw), "\n"; =cut push @urls, return sort keys %saw; } #-------------------------------------------------------------------------------- } } sub delete_doublon { @_ = grep { defined } @_; my %h_unique; foreach my $ligne ( @_ ) { $h_unique{$ligne} = undef; } @_ = keys %h_unique; push @_, return grep { defined } @_; }