␡
- Introduction
- Fetching the Site and Checking Links
- Conclusion
- Complete Listing
< Back
Page 4 of 4
Like this article? We recommend
Complete Listing
01: #!/usr/bin/perl -w 02: use strict; 03: use HTML::Parser; 04: use LWP::UserAgent; 05: use URI::URL; 06: my %LINKS; 07: my %GOOD_LINKS; 08: my %BAD_LINKS; 09: my $BASE; 10: my @TO_CHECK; 11: my $URL = $ARGV[0] || "http://mydomain.com"; 12: { 13: package GetLinks; 14: use base 'HTML::Parser'; 15: sub start { 16: my $self = shift; 17: my ($tag, $tag_attr) = @_; 18: if ($tag eq 'a' and defined $tag_attr->{href}) { 19: $LINKS{$tag_attr->{href}} = 0; 20: } 21: if ($tag eq 'img' and defined $tag_attr->{src}) { 22: $LINKS{$tag_attr->{src}} = 0; 23: } 24: } 25: } 26: my $ua = new LWP::UserAgent; 27: $ua->agent("LinkCheck/0.1"); 28: print "Starting scan from $URL\n"; 29: my $req = new HTTP::Request('GET',$URL); 30: my $res = $ua->request($req); 31: if (!$res->is_success) { 32: die "Can't fetch $URL"; 33: } 34: $BASE = $res->base; 35: my $parser = GetLinks->new; 36: $parser->parse($res->content); 37: for my $link (keys %LINKS) { 38: my $true_url = url($link, $BASE)->abs; 39: push(@TO_CHECK, $true_url); 40: } 41: while (my $url = shift @TO_CHECK) { 42: next if exists $GOOD_LINKS{$url} or exists $BAD_LINKS{$url}; 43: $req = new HTTP::Request('GET', $url); 44: $res = $ua->request($req); 45: if ($res->is_success) { 46: if ($res->content_type =~ /text\/html/i && $url =~ { 47: my $parser = GetLinks->new; 48: $parser->parse($res->content); 49: for my $link (keys %LINKS) { 50: my $abs = url($link, $BASE)->abs; 51: unless(exists $GOOD_LINKS{$abs} or BAD_LINKS{$abs}) { 52: push(@TO_CHECK, $abs); 53: } 54: } 55: } 56: $GOOD_LINKS{$url}++; 57: } else { 58: $BAD_LINKS{$url}++; 59: } 60: } 61: print qq{Bad links\n}; 62: print qq{$_\n} for keys %BAD_LINKS;
< Back
Page 4 of 4