#! /opt/bin/perl
#
# Minimal web spider.
#
# Asks the user for a start page, fetches it over plain HTTP/1.0 and scans
# every line of the response for anchor tags.  Each link not seen before is
# appended to the work queue (@new_links) and recorded in links.html.  The
# queue is processed in order until no unvisited links remain.
#
# Known limitations (kept from the original design):
#   * relative links are resolved against the host root, not against the
#     directory of the page they were found on;
#   * every page is fetched on port 80, even when its URL says https://;
#   * response headers are scanned together with the body.

use strict;
use warnings;
use IO::Socket;

my $DEFAULT_START_LINK = "http://www.pitt.edu/";
my $LINKS_FILE         = "links.html";
my $HTTP_PORT          = 80;
my $CONNECT_TIMEOUT    = 30;    # seconds to wait for a TCP connect

# initialize program
print "\n\nWhere would you like to start searching ?\n";
print "(please type full path name)\n\n";
print "If you do not want to specify a start page simply press enter...\n";

my $start_link = <STDIN>;
$start_link = "" unless defined $start_link;    # EOF on STDIN
$start_link =~ s/\A\s+//;
$start_link =~ s/\s+\z//;
$start_link = $DEFAULT_START_LINK unless length $start_link;
$start_link = normalize_url($start_link);

my @new_links    = ($start_link);         # work queue; grows while we iterate
my %seen         = ($start_link => 1);    # every URL ever queued
my $link_counter = 0;                     # links discovered (start page excluded)
my $page_counter = 0;                     # index of the next queue entry to fetch

# Append mode is kept from the original, so results of earlier runs survive.
open(my $links_fh, '>>', $LINKS_FILE)
    or die "cannot open $LINKS_FILE for appending: $!\n";

# NOTE(review): the HTML markup of this header appears to have been stripped
# from the original source; reconstructed as a minimal page head - confirm.
print {$links_fh}
    "<HTML>\n<HEAD>\n\t<TITLE>LINKS FOUND</TITLE>\n</HEAD>\n<BODY>\n";

PAGE:
while ($page_counter < @new_links) {
    my $page_number = $page_counter++;
    my ($host, $document) = split_url($new_links[$page_number]);

    unless (defined $host) {
        print "$page_number: skipping unparsable link $new_links[$page_number]\n";
        next PAGE;
    }

    print "$page_number: fetching document $document from $host\n";

    my $remote = IO::Socket::INET->new(
        Proto    => "tcp",
        PeerAddr => $host,
        PeerPort => $HTTP_PORT,
        Timeout  => $CONNECT_TIMEOUT,
    );

    unless ($remote) {
        print "cannot connect to HTTP server\n $!\n";
        print "- SPIDER TERMINATED -\n" unless $page_counter < @new_links;
        next PAGE;    # never touch an undefined socket
    }

    $remote->autoflush(1);

    # HTTP requires CRLF line endings; the Host header lets name-based
    # virtual hosts pick the right site.
    print {$remote} "GET $document HTTP/1.0\r\nHost: $host\r\n\r\n";

    while (my $line = <$remote>) {

        # does the line contain any anchor tags? if yes queue their targets
        for my $href (extract_hrefs($line)) {
            print "link:=$href\n";

            my $link = absolutize_link($href, $host);
            next unless defined $link;    # mailto:, javascript:, empty, ...
            next if $seen{$link}++;       # already queued - avoid duplicates

            push @new_links, $link;
            $link_counter++;

            # NOTE(review): only the closing </A> survived in the original
            # source; the opening anchor and <BR> are reconstructed - confirm.
            print {$links_fh}
                "$link_counter: <A HREF=\"$link\">$link</A><BR>\n";
        }
    }

    close $remote;    # one socket per page; release it before the next fetch
}

print "Total number of new links collected:$link_counter\n";
close $links_fh or warn "error while closing $LINKS_FILE: $!\n";

# Return every href target found in anchor tags within $html (possibly none).
# Quotes around the attribute value are optional, as in the original pattern.
sub extract_hrefs {
    my ($html) = @_;
    return $html =~ m{<\s*a\b[^>]*?\bhref\s*=\s*["']?([^"'\s>]+)}gi;
}

# Turn an href found on a page served by $host into an absolute http URL.
# Returns nothing for links that cannot be fetched over HTTP (mailto:,
# javascript:, ftp:, ...) and for links that are empty once the fragment
# is removed.
sub absolutize_link {
    my ($href, $host) = @_;

    my $link = $href;
    $link =~ s/#.*\z//s;    # a fragment names a spot on a page, not a page
    return unless length $link;

    # already absolute, or a bare "www.host/path" without a scheme
    return normalize_url($link) if $link =~ m{\A(?:https?://|www\.)}i;

    # any other scheme is not something this spider can fetch
    return if $link =~ m{\A[a-z][a-z0-9+.\-]*:}i;

    # distinguish "index.html" from "/index.html": always root the path
    $link = "/$link" unless $link =~ m{\A/};
    return "http://$host$link";
}

# Give a URL a scheme and make sure it has at least the root path "/",
# so that "www.pitt.edu" and "http://www.pitt.edu/" compare equal.
sub normalize_url {
    my ($url) = @_;
    $url =~ s/#.*\z//s;
    $url = "http://$url" unless $url =~ m{\A[a-z][a-z0-9+.\-]*://}i;
    $url .= "/" unless $url =~ m{\A[^:]+://[^/]+/};
    return $url;
}

# Split a URL into (host, document path).  Returns an empty list when no
# host can be found.  The document defaults to "/".
sub split_url {
    my ($url) = @_;
    return unless $url =~ m{\A(?:https?://)?([^/\s]+)(/\S*)?}i;
    my ($host, $document) = ($1, $2);
    $document = "/" unless defined $document;
    return ($host, $document);
}