--- makenh.orig Tue Jul 28 03:21:30 1998 +++ makenh Mon Nov 2 19:55:02 1998 @@ -68,6 +68,7 @@ $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*'; $NumLocalCollected = 0; $NumRemoteCollected = 0; +$max_redir = 6; # LOGFILE, ERRFILE -- files for logging ### *TO CHANGE TRAVERSAL* @@ -187,22 +188,22 @@ # Initialize variables to avoid warnings ($title, $urlpath, $traverse_type, $explicit_only, $numhops, - $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) = - ('','','','','','','','','','',''); + $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset) = + ('','','','','','','','','','','','',''); ($title, $urlpath, $traverse_type, $explicit_only, $numhops, - $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @urllist) = ReadConfig($archivepwd); + $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, $locale, $charset, @urllist) = ReadConfig($archivepwd); # open logs &open_logs(); print LOGFILE "From Configuration:\n"; my(@configlist) = qw(title urlpath traverse_type explicit_only numhops - nhhops local_limit remote_limit addboxes vhost usemaxmem ) ; + nhhops local_limit remote_limit addboxes vhost usemaxmem locale charset) ; foreach $item (@configlist) { $value = ''; - eval "$value = \$$item"; + eval "\$value = \$$item"; print LOGFILE " $item: $value\n"; } print LOGFILE " urllist: @urllist\n\n"; @@ -398,7 +399,7 @@ &close_logs(); # remove the robots file -system("rm -rf $TEMPROBOTFILE"); +unlink($TEMPROBOTFILE); #---------------------- #change the dir back @@ -751,7 +752,7 @@ my($prot, $host, $port, $path) = &url::parse_url($url); # if the protocol isn't http, assume it's good - if($prot!~/http/i){ + if(!defined($prot) || $prot!~/http/i){ return 1; } @@ -800,6 +801,7 @@ my($output); my($olddata, $newdata); my($newprot, $newhost, $newport, $newpath, $url); + my($redcount)=0; # make the $url $url = "http://$host:$port/robots.txt"; @@ -815,6 +817,7 @@ while($output ne ""){ # more for error? if($output=~/^error/i){ + truncate($TEMPROBOTFILE,0); print ERRFILE "Error with getting $url\n"; # print LOGFILE "Error with getting $url\n"; last; @@ -822,7 +825,13 @@ # look at output for redirect -- store redirects in file, too if($output=~/^Redirect: (.*)$/){ - print LOGFILE "Redirected to: $1..."; + if ($redcount >= $max_redir) { + truncate($TEMPROBOTFILE,0); + print ERRFILE "Too many redirections with $url\n"; + last; + } + $redcount++; + print LOGFILE "Redirected to: $1...\n"; # see if we have the redirected server ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1); @@ -843,6 +852,7 @@ } }else{ # we've got it, or there's an error... + truncate($TEMPROBOTFILE,0); last; } } @@ -894,6 +904,7 @@ sub geturl2file{ my($url) = @_; my($output, $link, $file, $oldfile, @aliases); + my($redcount)=0; # check if we have that in stock (we know it's not local) if (defined($URL2FILE{$url})) { @@ -930,6 +941,7 @@ while($output ne ""){ # more for error? if($output=~/^error/i){ + truncate($file,0); print ERRFILE "Error with getting $url: $output\n"; # print LOGFILE "Error with getting $url\n"; last; @@ -937,6 +949,12 @@ # look at output for redirect -- store redirects in file, too if($output=~/^Redirect: (.*)$/){ + if ($redcount >= $max_redir) { + truncate($file,0); + print ERRFILE "Too many redirections with $url\n"; + last; + } + $redcount++; &ungetnewname(); # rewind the name counter # The next get will overwrite the unnecessary file @@ -970,6 +988,7 @@ } }else{ # we've got it, or there's an error... + truncate($file,0); last; } } @@ -1159,6 +1178,15 @@ ($prot, $host, $port, $path) = &url::parse_url($url); #print "URL after parsing: $prot://$host:$port$path\n"; + next if !defined($prot); + if (!defined($port) || + ($port eq '80' && $prot =~ /^https?$/) || + ($port eq '21' && $prot eq 'ftp')) { + $port = ''; + } else { + $port = ":$port"; + } + # make sure the path has a preceding / $path = "/$path" if $path!~/^\//; @@ -1177,7 +1205,7 @@ # $host = "$a.$b.$c.$d"; # } - $url = "$prot://$host:$port$path"; + $url = "$prot://$host$port$path"; #print "URL after normalization: $url\n"; # strip off any #text