#!/usr/local/bin/perl -w ########################################################################## # Created March/April 1996, Michael D. Smith # Part of WebGlimpse (GlimpseHTTP) research with Udi Manber # Glimpse mailing list: glimpse@cs.arizona.edu # WebGlimpse home page: http://glimpse.cs.arizona.edu/webglimpse # All documentations are there. # # Modified by Dachuan Zhang, May 23, 1996 # . Use arrays instead of associative arrays for IndexAD & AddSearchAD # to maintain the order of patterns. # . Perl subroutine is used instead of binary 'get_href' # . Unused procs like old_traverse are gone. # . Deal differently with Sub-directory option. # # Modified by Michael Smith, Sept 23, 1996 # - in-lined libraries for efficiency # - cleaned up some code # # Modified into version 1.1b1 by Michael Smith, Burra Gopal, and Udi Manber # November 22, 1996 # lots of added features. # # Modified into version 1.5 by Golda Bernstein, Peter Bigot, Burra Gopal and Udi Manber # 7/4/97 Added code to escape special characters in paths array # 7/23 Added Virtual Host support. # 9/16 Fixed neighborhood search by saving file paths to .nh files as per Burra # Several other minor fixes # # 12/25/97 Now using revision notation for each file separately # # Revision 2.01 # ########################################################################## ########################################################################## ## GLOBALS ########################################################################## $archivedir = $ARGV[0]; $quiet = defined($ARGV[1]) && ($ARGV[1] eq "-q"); undef %NEIGHBORHOOD; # stored as LOCAL files, value: # of times %NEIGHBORHOOD = (); undef %LINKS; # file->links (as FILES) ## REALLY as URLS! Noted 9/15/97 --GB %LINKS = (); undef %FILELINKS; # REALLY has file->links as FILES. Added 7/5/98 --GB %FILELINKS = (); undef %URL2FILE; # url->files %URL2FILE = (); undef %ROBOTDATA; # cached data from sites -- robot permissions %ROBOTDATA = (); undef %TOINDEX; # list of files to index %TOINDEX = (); #%IndexAD={}; # allow/deny of files to index #%AddSearchAD={}; # allow/deny of files to add search box ### unused # @LocalFiles=(); $archivepwd = ""; $archiveprot =""; $archivehost =""; $archiveport =""; $archivepath =""; $archiveurl = "http://www.myserver.xxx/path/to/archive"; $globalfilenum=0; $HTMLFILE_RE = "((.s?html)|(.sht)|(.htm))\$"; $SITE_RE = '[^:]+:\/\/([^\/]+)\/.*'; $NumLocalCollected = 0; $NumRemoteCollected = 0; # LOGFILE, ERRFILE -- files for logging ### *TO CHANGE TRAVERSAL* ### SET THIS VARIABLE TO 0 FOR MORE 'LENIENT' TRAVERSAL $LIMIT_TRAVERSAL = 1; ########################################################################## ## SETTINGS ########################################################################## # to be changed $WEBGLIMPSE_HOME = "/home/onetan/onetan-www"; # static $nh_pre = ".nh."; $WEBGLIMPSE_LIB = "$WEBGLIMPSE_HOME/lib"; #Changed to httpget.pl, which uses url_get 8/5/97 --GB #$GETHTTP_CMD = "$WEBGLIMPSE_LIB/httpget"; $GETHTTP_CMD = "$WEBGLIMPSE_LIB/httpget"; $GETURL_CMD = "$WEBGLIMPSE_LIB/url_get"; # name of config file # $CONFIGFILE = "archive.cfg"; # files and dirs in the archivepwd $TEMPROBOTFILE = "robots.tmp"; $MAPFILE= ".wgmapfile"; $REMOTEDIR = ".remote"; $WGINDEX = ".wgfilter-index"; $MADENH = ".wg_madenh"; $FLISTFNAME = ".wg_toindex"; $ERRFILENAME = ".wg_err"; $LOGFILENAME = ".wg_log"; # $STARTFILE = ".wgstart"; $WGADDSEARCH = ".wgfilter-box"; $ROBOTNAME = "HTTPGET"; # Hash of file patterns to allow and deny -GB 6/30/98 local(%IndexPAT) = (); local(%IndexAD) = (); ########################################################################## ## ENTRY POINT ########################################################################## $| = 1; #--------------------------------- # make my libraries more important unshift(@INC, "$WEBGLIMPSE_LIB"); require "URL.pl"; require "siteconf.pl"; require "config.pl"; ## included below -- not needed any more # require "webgutils.pl"; # require "normalize.pl"; # require "get_href.pl"; #--------------------------------- if($archivedir eq "") { $archivedir = "."; # make it current dir } # try to change the directory to indexdir $startpwd = `pwd`; $retval = chdir ($archivedir); if($retval==0){ print "Cannot change directory to $archivedir. Quitting.\n"; exit -3; } # get the 'real' path $archivepwd = $archivedir; if ($archivepwd !~ /^\//) { $archivepwd = `pwd`; chomp $archivepwd; print STDERR "Warning: overriding archive dir $archivedir with $archivepwd\n"; } # make sure it has a configuration file if(&TestConfig($archivepwd)==0){ print "Cannot find configuration file for archive in $archivepwd. Quitting.\n"; exit -4; } #---------------------- # Avoid error messages from -w about using things only once $urlstat = $URL_TRAVERSE; $urlstat = $URL_ERROR; $urlstat = $URL_LOCAL; $urlstat = $URL_SCRIPT; $urlstat = $URL_REMOTE; #---------------------- # make the .remote directory if it doesn't exist if(!(-d $REMOTEDIR)){ mkdir($REMOTEDIR, 0755); chmod(0755, $REMOTEDIR); }else{ # clean out the directory `rm -rf $REMOTEDIR/*`; } # get the settings from the configuration file # there should be no problem opening this file -- we know it exists # read the settings # ($title, $url, $traverse_type, $numhops, $nhhops, # $addboxes) = &ReadConfig($archivepwd); # Initialize variables to avoid warnings ($title, $urlpath, $traverse_type, $explicit_only, $numhops, $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) = ('','','','','','','','','','',''); ($title, $urlpath, $traverse_type, $explicit_only, $numhops, $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem, @urllist) = ReadConfig($archivepwd); # open logs &open_logs(); print LOGFILE "From Configuration:\n"; my(@configlist) = qw(title urlpath traverse_type explicit_only numhops nhhops local_limit remote_limit addboxes vhost usemaxmem ) ; foreach $item (@configlist) { $value = ''; eval "$value = \$$item"; print LOGFILE " $item: $value\n"; } print LOGFILE " urllist: @urllist\n\n"; if (!$quiet) { if($traverse_type==1){ print "Getting remote links, $numhops hops...\n"; print "Neighborhood will be $nhhops hops.\n"; }elsif ($traverse_type==2){ print "Getting local files by directory...\n"; print "Neighborhoods correspond to subdirectories\n"; }else{ print "Getting local links by hops, $numhops hops...\n"; print "Neighborhood will be $nhhops hops.\n"; } } # set the robots file to the archivepwd $REMOTEDIR = "$archivepwd/$REMOTEDIR"; $WGINDEX = "$archivepwd/$WGINDEX"; $MADENH = "$archivepwd/$MADENH"; $FLISTFNAME = "$archivepwd/$FLISTFNAME"; $ERRFILENAME = "$archivepwd/$ERRFILENAME"; $LOGFILENAME = "$archivepwd/$LOGFILENAME"; $MAPFILE = "$archivepwd/$MAPFILE"; $TEMPROBOTFILE = "$archivepwd/$TEMPROBOTFILE"; $WGADDSEARCH = "$archivepwd/$WGADDSEARCH"; ($archiveprot, $archivehost, $archiveport, $archivepath) = &url::parse_url($archiveurl); # open map &open_map(); # open the file of files to index open(FLIST, ">$FLISTFNAME") || die "Cannot open file $FLISTFNAME. Aborting."; # open the MADENH file open(NEIGH, ">$MADENH") || die "Cannot open file $MADENH. Aborting."; # read in the .wgfilter-index into IndexPAT, IndexAD added params --GB 6/30/98 &open_indexallowdeny("$WGINDEX", *IndexPAT, *IndexAD); # read in the .wgfilter-box into AddSearchAllowDeny &open_searchallowdeny("$WGADDSEARCH"); # read in the site configuration &siteconf::ReadConf($vhost); &siteconf::LoadCache(); ############### ### PHASE 1 ### ############### # TRAVERSE SITE # first, clear the .remote dir system("/bin/rm -f $REMOTEDIR/*"); # for each file in the .wgstart # open(WGSTART, "$STARTFILE") || die "Cannot open $STARTFILE"; # @startlist = ; foreach $url(@urllist){ chomp($url); if ($traverse_type == 2) { # For sub-directory option. # Starting with v2.0, we accept directory paths as well as url's --GB 12/25/97 # If $url starts with a /, assume it is a directory and not a url. if ($url =~ /^\//) { $file = $url; $url = &siteconf::LocalFile2Url($file); } else { $file = &siteconf::LocalUrl2File($url); } # The URL's in the list may be for directories or domains # Only chop off the file if matches HTML_RE -GB 7/24/97 ($_ = $file) =~ tr/A-Z/a-z/; if ($file =~ /$HTMLFILE_RE/) { $file =~ s/(\/)[^\/]+$/\//; } if ($url =~ /$HTMLFILE_RE/) { $url =~ s/(\/)[^\/]+$/\//; } # If doesn't end with a /, add one. -GB 7/31/97 if ($file !~ /.+\/$/) { ($file .= '/');} if ($url !~ /.+\/$/) { ($url .= '/');} # Removed because the directory is not actually a file we can index --GB 7/1/98 #$URL2FILE{$url}=$file; print LOGFILE "Starting from url: $url as dir: $file\n"; &IndexDir($url, $file); } else { # For # of hops option. # This used to assume that the first link was always local! May not be the case. # $file = &siteconf::LocalUrl2File($url); # Now we check first if the url is local or remote. --GB 7/27/98 if (CheckServer($url) == $URL_LOCAL) { # just get the local file name $file = &siteconf::LocalUrl2File($url); $NumLocalCollected++; if(!(-e $file)){ print ERRFILE "Cannot find $url as $file. Not traversing.\n"; next; } } else { # if remote file, go get it! $file = &geturl2file($url); # geturl2file puts it into URL2FILE map } print LOGFILE "Starting from url: $url, file: $file\n"; # Traverse the files... # &traverse($url, $file); # puts lists in global variables &new_traverse($url, $file, $numhops); } } &siteconf::SaveCache(); # output some indexing data if (!$quiet) { print "\n\n------------------------------------------------------\nCollected $NumLocalCollected local pages and $NumRemoteCollected remote pages.\n------------------------------------------------------\n\n"; } ############### ### PHASE 2 ### ############### # store the data we got from the traversal # write out the TOINDEX array to .wg_toindex # while(($file, $url) = (each %TOINDEX)) { # print FLIST "$file\n"; # } while(($url, $file) = (each %URL2FILE)) { print FLIST "$file $url\n"; } close(FLIST); ############### ### PHASE 3 ### ############### # IF INDEXING BY SUBDIR, neighborhoods are controlled by filtering at the time of the search for subdirectory. # IF TRAVERSING BY LINKS, each file has a neighborhood --GB 12/25/97 # Currently, the only way to use the neighborhood search is by adding search boxes to each page if (($traverse_type != 2) && ($addboxes == 1)) { # for each local file, if writable, create neighborhood and search box while(($file, $junk) = each %TOINDEX){ # if it's not a remote file, try to create a neighborhood if($file !~ /^$REMOTEDIR/){ # check if not excluded by the wgfilter-box file if(&okay_to_addsearch($file)==1){ # check if we can write the file if(-w $file){ if(create_neighborhood($file, $nhhops)==1) { if(store_neighborhood($file)!=0) { # if we wrote something into a neighborhood file for it, # write the file to the .neighborhooded file # this info will be used by addsearch print NEIGH "$file\n"; } else { print LOGFILE "Unable to store neighborhood for $file.\n"; } } else { print LOGFILE "No neighborhood for $file: cannot create a neighborhood.\n"; } }else{ print LOGFILE "No neighborhood for $file: cannot write the .nh file.\n"; } } else { print LOGFILE "No neighborhood for $file: excluded by wgfilter-box.\n"; } } else { print LOGFILE "No neighborhood for $file; it's remote.\n"; } } } ### CLOSE UP SHOP ### close NEIGH; # close the neighborhooded file &close_map(); &close_logs(); # remove the robots file system("rm -rf $TEMPROBOTFILE"); #---------------------- #change the dir back chdir($startpwd); #Added by bgopal, 12:45pm, Nov 13 1996 #&DB'perlprof if defined &DB'perlprof; ########################################################################## ### PROCEDURES ########################################################################## ########################################################################## sub okay_to_addsearch{ my($file)=@_; my($index, $found, $pattern, $allowdeny, $i); # first, check if it's excluded $index=1; # by default, it's accepted # this loop is hacked because while/each doesn't re-enter correctly $found=0; # print "$file\n"; foreach $i (0 .. $#AddSearchPAT) { $pattern = $AddSearchPAT[$i]; $allowdeny= $AddSearchAD[$i]; # print "$pattern $allowdeny\n"; if($file=~/$pattern/){ $index=$allowdeny; last; } } return $index; } ########################################################################## # takes 2 params -- file name and assoc array # uses both BY REFERENCE sub open_searchallowdeny{ my($lineno, $AD, $pat, $i); # read in the info from file eval{ open(FILE, "$_[0]"); }; if($@){ warn "Cannot open file $_[0]\n"; return; } $lineno=0; $i = 0; while(){ $lineno++; /(\S+)\s*(\S+)/; $AD = $1; $pat = $2; if($AD=~/Allow/i){ $AddSearchPAT[$i] = $pat; $AddSearchAD[$i] = 1; }elsif ($AD=~/Deny/i){ $AddSearchPAT[$i] = $pat; $AddSearchAD[$i] = 0; }else{ print "Syntax error in $_[0], line $lineno\n"; } $i ++; } close FILE; } ########################################################################## sub store_neighborhood{ my($origfile)=@_; my($name, $num, $file); $file = $origfile; # prepend the .nh_ $file =~ s/([^\/]+)$/$nh_pre$1/; eval{ open(FILE, ">$file"); }; if ($@) { print LOGFILE "Cannot open neighborhood file $file.\n"; # failure return 0; } # go through the NEIGHBORHOOD and print all entries # # We need FILES, not URLs, so look up URL2FILE on each. 9/15/97 --GB # Now this is taken care of at the time we make the hash; so remove this lookup. 7/5/98 --GB $num=0; while(($name, $junk)=each %NEIGHBORHOOD){ $num++; # if (defined($URL2FILE{$name})) { # print FILE "$URL2FILE{$name}\n"; # } print FILE "$name\n"; } close(FILE); chmod(0644, $file); if($num==0){ print LOGFILE "No neighborhood for $origfile. Not adding search box to it.\n"; unlink($file); # just delete the neighborhood } return $num; # returns the number in the neighborhood } ########################################################################## sub create_neighborhood{ my($file, $hops)=@_; my($i, @links,@nextlinks); if (!$quiet) { print "Creating neighborhood of $hops hops for $file.\n"; } # clear it undef %NEIGHBORHOOD; %NEIGHBORHOOD = (); ### $hops is never < 0; not sure what this section is for --GB 12/28/97 if($hops<0){ my($dir); # just strip the file name from the $file $dir = $file; $dir =~ s/[^\/]+$//; $NEIGHBORHOOD{$dir}=1; return 1; # success ### possibly delete section above }else{ # create the initial list of entries @links = (); # if (defined($LINKS{$file})) { # @links = split(",", $LINKS{$file}); # } # Replaced with FILELINKS that contains filenames rather than URLS --GB 7/5/98 # Attempting to fix bug where neighborhoods over size 1 didn't work. if (defined($FILELINKS{$file})) { @links = split(",", $FILELINKS{$file}); } # put all these links in the hash table foreach $link(@links){ if($link ne ""){ $NEIGHBORHOOD{$link} = 1; } } # go n hops in for($i=1; $i<$hops; $i++){ # clear the 'nextlinks' array undef @nextlinks; @nextlinks = (); # get all the links for each link foreach $link(@links){ # if (defined($LINKS{$link})) { # # get the list of links for this link and add this to the list # push(@nextlinks, split(",",$LINKS{$link})); # } if (defined($FILELINKS{$link})) { # get the list of links AS FILENAMES for this link and add this to the list push(@nextlinks, split(",",$FILELINKS{$link})); } } # clear the list for the next round undef @links; @links = (); # add all the elements to the hash table foreach $link(@nextlinks){ if(!defined($NEIGHBORHOOD{$link}) || ($NEIGHBORHOOD{$link}!=1)){ # if it's not already in the table, # add it, and traverse next time $NEIGHBORHOOD{$link}=1; push(@links, $link); } } my($numlinks); $numlinks = @links; if($numlinks==0){ last; } } ## split it all up and add to neighborhood # @links = split(",", $linkstring); # foreach $link(@links){ # if($link ne ""){ # $NEIGHBORHOOD{$link} = 1; # } # } # Added by bgopal, 11/14/96 #undef @nextlinks; #undef @links; return 1; # success } } ########################################################################## sub close_logs{ close ERRFILE; close LOGFILE; } ########################################################################## sub open_logs{ open(ERRFILE, ">$ERRFILENAME"); open(LOGFILE, ">$LOGFILENAME"); } ########################################################################## sub open_map{ open(MAP, ">$MAPFILE") || die "Cannot open map file: "; ### TO DO -- read map file? } ########################################################################## sub close_map{ while (($key, $value)=each %URL2FILE){ print MAP "$key $value\n"; } close(MAP); # change permissions chmod (0644, "$MAPFILE"); } ########################################################################## # Make sure the url ends with a complete filename (eg index.html) ########################################################################## sub fixurl{ my($file, $url) =@_; my($hfile); $file =~ /([^\/]*)$/; $hfile = $1; if ($url !~ /$hfile$/) { if ($url !~ /\/$/) { $url .= '/'; } $url .= $hfile; } return $url; } ########################################################################## sub getlinks{ my($file, $url) = @_; my($links, @output); # check if it's in the lookup table $links = $LINKS{$file}; if(defined($links) && ($links ne "")){ return $links; } # The problem is, we can't tell a non-html file from a directory on remote servers # # Make sure the url ends with a complete filename (eg index.html) # # Otherwise normalize() will incorrectly set the basepath # Changed to make normilize use HTML_RE instead of assuming local file! --GB 7/27/98 # $url = fixurl($file, $url); # if not in table, # get the hrefs # @output = `$GETHREF_CMD $file`; # chop(@output); # remove the \n @output = &get_href($file); # print "Output from get_href: @output\n"; # absolutify the links @output = &normalize($url, @output); # print "Output from normalization: @output\n"; #print "Links from url $url, file $file are: @output\n"; # remove dups and mailtos my(%THISLIST, $link); undef %THISLIST; %THISLIST = (); foreach $link (@output){ if($link=~/^mailto:/i || $link=~/^file:/i ){ # do nothing -- skip it }elsif (defined($THISLIST{$link}) && ($THISLIST{$link} eq "1")){ # it's a dup! splice(@output, $n, 1); # $size--; # what was this for? Not used anywhere else. }else{ # not a dup or mailto -- add to list and go on $THISLIST{$link} = "1"; $n++; } } # join and store in the lookup table $links = join(",", keys %THISLIST ); $LINKS{$file} = $links; return $links; } ########################################################################## sub ungetnewname{ $globalfilenum--; } ########################################################################## sub getnewname{ my($file) = @_; # if it ends in a /, just call it '.html' if($file=~/\/$/){ $ext=".html"; }else{ # put the extension onto the filename returned $file =~ /\.([^\/\.]+)$/; $ext = $1 || ''; if($ext ne ''){ $ext = ".$ext"; } } $globalfilenum++; return "$REMOTEDIR/$globalfilenum$ext"; } ########################################################################## sub robotsokay{ my($url)=@_; my($prot, $host, $port, $path) = &url::parse_url($url); # if the protocol isn't http, assume it's good if($prot!~/http/i){ return 1; } # check for the host in the robots stuff $paths = $ROBOTDATA{$host}; if (!defined($paths) || ($paths eq "")){ # we don't have it -- go get it $paths = &getrobotfile($host, $port); } # compare the paths and the urls return &pathokay($path, $paths); } ########################################################################## sub pathokay{ my($path, $paths) = @_; my(@patharray,$test); # make sure the path isn't empty -- if it is, it's a / if($path eq ""){ $path="/"; } # split the string @patharray = split(" ", $paths); # look at the paths -- if the url contains them, return 0 foreach $test(@patharray){ # Need to escape special chars $test =~ s/\*/\\\*/g; $test =~ s/\+/\\\+/g; if($path=~m#$test#){ return 0; } } return 1; } ########################################################################## sub getrobotfile{ my($host, $port)=@_; my(@aliases); my($output); my($olddata, $newdata); my($newprot, $newhost, $newport, $newpath, $url); # make the $url $url = "http://$host:$port/robots.txt"; # clear the aliases @aliases=($host); print LOGFILE "Getting robots file from $host:$port...\n "; # it's an http process -- call httpget $output = `$GETHTTP_CMD $url -o $TEMPROBOTFILE`; while($output ne ""){ # more for error? if($output=~/^error/i){ print ERRFILE "Error with getting $url\n"; # print LOGFILE "Error with getting $url\n"; last; } # look at output for redirect -- store redirects in file, too if($output=~/^Redirect: (.*)$/){ print LOGFILE "Redirected to: $1..."; # see if we have the redirected server ($newprot, $newhost, $newport, $newpath) = &url::parse_url($1); # add this name to the aliases list push(@aliases, $newhost); $olddata = $ROBOTDATA{$newhost}; if(defined($olddata) && ($olddata ne "")){ # set all the values foreach $newhost(@aliases){ $ROBOTDATA{$newhost}=$olddata; } return $olddata; # return 'bad' }else{ # try again $output = `$GETHTTP_CMD $1 -o $TEMPROBOTFILE`; } }else{ # we've got it, or there's an error... last; } } print LOGFILE "Done.\n"; $newdata = &getrobotpaths(); foreach $newhost(@aliases){ $ROBOTDATA{$newhost}=$newdata; } return $newdata; # return 'none' } ########################################################################## sub getrobotpaths{ my(@paths, $newdata); # now we have the robots.txt file in the TEMPROBOTFILE # check it! open(ROBOTFILE, $TEMPROBOTFILE); # assume it'll work while(){ s/\#.*$//; # remove comments if(/^User-agent:.*\W$ROBOTNAME\W/io || /^User-agent:\s*[*]/io){ # check for paths print LOGFILE " Found reference to this robot in robot file\n"; while(){ if(/^Disallow:\s*(\S+)\s*(\#.*)?/){ print LOGFILE " Robot disallowed for $1\n"; push(@paths, $1); }else{ last; # we're done with the record } } } } # print LOGFILE " Done parsing robot file\n"; close(ROBOTFILE); $pathstring = join(" ", @paths); if($pathstring eq ""){ $pathstring = " " ; } return $pathstring; } ########################################################################## sub geturl2file{ my($url) = @_; my($output, $link, $file, $oldfile, @aliases); # check if we have that in stock (we know it's not local) if (defined($URL2FILE{$url})) { $file = $URL2FILE{$url}; if($file ne ""){ return $file; } } # if we don't already have it, check if we can get it # check for robots.txt print LOGFILE "Checking the robot file for $url...\n"; if(&robotsokay($url)==0){ # it's not okay to get this. skip it. # print LOGFILE "Robot excluded from $url.\n"; print ERRFILE "Robot excluded from $url.\n"; $file=""; return $file; } # clear the aliases @aliases=($url); # order it $file = &getnewname($url); print LOGFILE "Getting $url into $file...\n "; # print "Getting $url ...\n "; if($url=~/^http:/i){ # it's an http process -- call httpget $output = `$GETHTTP_CMD $url -o $file`; while($output ne ""){ # more for error? if($output=~/^error/i){ print ERRFILE "Error with getting $url: $output\n"; # print LOGFILE "Error with getting $url\n"; last; } # look at output for redirect -- store redirects in file, too if($output=~/^Redirect: (.*)$/){ &ungetnewname(); # rewind the name counter # The next get will overwrite the unnecessary file # print LOGFILE "Redirected to: $1..."; # add this name to the aliases list push(@aliases, $1); # see if we have the redirected name already $oldfile = $URL2FILE{$1} || ""; if($oldfile ne ""){ # we have it already! $file = $oldfile; last; }else{ # try again $url = $1; # check robots.txt for new url if(&robotsokay($url)==0){ # it's not okay to get this. skip it. # print LOGFILE "Robot excluded from $url.\n"; $file=""; return $file; } $file = &getnewname($url); # get a new name (extensions matter) $output = `$GETHTTP_CMD $1 -o $file`; } }else{ # we've got it, or there's an error... last; } } }else{ $output = `$GETURL_CMD -o $file $url`; print LOGFILE "output from urlget: $output\n"; # can't tell if it worked or not } print LOGFILE "Done.\n"; # store $url and all redirects to map foreach $url(@aliases){ $URL2FILE{$url} = $file; } # change the permissions chmod(0644, $file); $NumRemoteCollected += 1; # Print message if we really retrieved the file. print "Got url $url into file $file.\n"; return $file; } ########################################################################## ########################################################################## ### TO DO -- make more robust -- check ip addrs, multiple paths sub local_file{ my($url) = @_; my($file); my($prot, $host, $port, $path) = &url::parse_url($url); $file=""; # convert $url to local file name (if we can) if($host=~/^$archivehost/i && $prot =~ /^$archiveprot$/i && $port =~ /^$archiveport$/ && $path =~/^$archivepath/){ $file=$path; # chop off archive path, prepend path $file =~ s/$archivepath/$archivepwd/; } return $file; } ##################################################################### # Following procs were added on June 2, 1996. # Dachuan Zhang ##################################################################### sub IndexDir { my($url, $dir) = @_; my($link, $file, $i, $cwd, $pattern, $allowdeny, $noindex); if (!$quiet) { print "IndexDir $dir as $url\n"; } # Find command cannot handle sym-link properly, so we first chdir. $cwd = `pwd`; chdir($dir); # Added -follow switch 2/11/98 to follow symbolic links. --Gb # Added -type f to just include files, not directories, sockets or other critters. --GB 7/1/98 open (FileList, "find . -type f -follow -print |"); # pipe in the file list. while () { chomp; (/\/\.nh\./) && next; (-d $_) && next; $file=$_; $link=$_; $file =~ s/^\.\//$dir/; $link =~ s/^\.\//$url/; $noindex = 1; # Default: index it. # print "$link:$file"; foreach $i (0 .. $#IndexPAT) { $pattern = $IndexPAT[$i]; $allowdeny = $IndexAD[$i]; if ($link =~ /$pattern/) { $noindex=$allowdeny; last; } } if ($noindex==0) { # print " Denied: $link\n"; print LOGFILE "Not indexing $link; excluded.\n"; } else { print LOGFILE " Accepted: $link\n"; ### MDSMITH -- added check for local_limit if($NumLocalCollected >= $local_limit){ print ERRFILE "Cannot collect $link; already collected local maximum.\n"; } else { $URL2FILE{$link}=$file; $NumLocalCollected += 1; $TOINDEX{$file}=1; } } } chdir $cwd; } ################################################################################ # 'in-lined' libraries for efficiency # 'require' is *very* poor for performance ################################################################################ # NORMALIZE ################################################################################ # Modified by Dachuan Zhang, May 23, 1996. # Take baseport into account! #--------------------------------------------------------------------------- sub normalize{ my($baseurl,@urllist)=@_; my($basefile, $url); my($baseprot, $basehost, $baseport, $basepath) = &url::parse_url($baseurl); # get the name for the $basehost # ($name, $aliases, $addrtype,$length,@addrs) = gethostbyname($basehost); # ($a,$b,$c,$d) = unpack('C4', $addrs[0]); # separate basepath into basepath and basefile # find the LAST / $basefile = $basepath; # Only chop off end if matches HTML_RE --GB 7/27/98 # $basepath =~ s/\/[^\/]*$//; # if ($basepath =~ /$HTMLFILE_RE/) { $basepath =~ s/(\/)[^\/]+$/\//; } if ($basepath !~ /\/$/) { $basepath .= "/"; # add the last / for the directory if not there already } # output # print "baseprot = $baseprot, "; # print "basehost = $basehost\n "; # print "baseport = $baseport, "; # print "basepath = $basepath, "; # print "basefile = $basefile\n"; foreach $url(@urllist){ next if($url =~ /^\s*$/); # print "Original url: $url\n"; # punt on the mailtos... if($url=~/^mailto:/i) { next; } # add things that might be missing. # if it starts with // if($url=~/^\/\//){ # tack on http: $url = "http:".$url; } # if it has no :// it has no protocol if ($url=~/^:\/\//){ # tack on http $url = "http".$url; } # Added https as valid protocol 5/2/98 --GB # if no protocol, if($url!~/^http:/i && $url!~/^https:/i && $url!~/^ftp:/i && $url!~/^gopher:/i && $url!~/^news:/i){ # if no / at beginning, it's relative, on same machine, same path if($url!~/^\//){ $url = $baseprot."://".$basehost.":".$baseport.$basepath.$url; }else{ # there is a / at the beginning # it's a new path, same machine $url = $baseprot."://".$basehost.":".$baseport.$url; } } #print "URL before parsing: $url\n"; my($prot, $host, $port, $path) = ('','','',''); ($prot, $host, $port, $path) = &url::parse_url($url); #print "URL after parsing: $prot://$host:$port$path\n"; # make sure the path has a preceding / $path = "/$path" if $path!~/^\//; # remove "/A/.." from "/A/../dir" $path =~ s/\/[^\/]+\/\.\.//g; # Removed IP address conversion because it causes problems with web servers # that alias by name only. --GB 10/2/97 # # # Uncomment for numbers # if($host!~/\d+\.\d+\.\d+\.\d+/){ # ($name, $aliases, $addrtype,$length,@addrs) = gethostbyname($host); # ($a,$b,$c,$d) = unpack('C4', $addrs[0]); # # # set host to the IP addr to prevent name aliasing # $host = "$a.$b.$c.$d"; # } $url = "$prot://$host:$port$path"; #print "URL after normalization: $url\n"; # strip off any #text $url =~ s/\#.+$//; # also, for consistency in our database, NO trailing /'s # NO! This causes a problem with the ROOT # $url =~ s/\/$//; } return @urllist; } ############################################################################### # Library- GET_HREF ############################################################################### sub get_href { my($file) = @_; my ($i, $link, $url, $page); my(@links) ; my(@lnks); $page = &readFile($file); # Changed 5/1/98 --GB # Links could be of the form # , or , or , # or # @links = split(/(HREF)]+HREF[\s]*=[\s]*|]*SRC[\s]*=[\s]*/i, $page); @links = split(/]*HREF[\s]*=[\s]*|]*SRC[\s]*=[\s]*/i, $page); foreach $i (1..$#links) { $link = $links[$i]; if ($link =~ /^\"?([^>\"\s]*)\"?/) { push(@lnks, $1); } } return @lnks; } sub readFile { my($file) = @_; local(*FH); my(@page); my($string); if (open (FH, $file)) { @page = ; close FH; } else { warn "Cannot open file $file: $@"; @page = (); } $string = join("",@page); return $string; } ######################################################################## ## new_traverse ## ## Recursively follows $numhops levels of links from $url (locally $file) ######################################################################## sub new_traverse { my ($url, $file, $numhops) = @_; my (@thelist); push(@thelist, $url); ### MDSMITH -- added check for local_limit # actually, no check needed here because this is only the first one... $URL2FILE{$url} = $file; $TOINDEX{$file}=1; # Don't assume 1st link is local. Could start with remote URL. # $NumLocalCollected+=1; if (!$quiet) { print "Traversing $numhops hops...\n"; } for($i=0; $i<$numhops; $i++){ # print "The urls after $i hops:\n"; # print " @thelist\n"; # visit the nodes in the list @thelist = visit(@thelist); # print "thelist: @thelist\n"; # if there's nothing more to collect, stop there my($numlinks); $numlinks = @thelist; if($numlinks==0) { if (!$quiet) { print "No more links to traverse.\n"; } last; } } } sub visit{ my(@urllist) = @_; my($file); my(%ToTraverse); my($url, $urlstat, $at_remote, @links, $link); my($noindex, $found, $i, $pattern, $allowdeny); # my($filname,$link_site, $url_site, @linksasfiles); my($filname,$link_site, $url_site); # linksasfiles was unused --GB 7/5/98 foreach $url (@urllist) { $file = $URL2FILE{$url}; # print "Looking at url: $url, file: $file\n"; # figure out whether this page is local or remote # Now we use archive-specific CheckServer instead # $urlstat = &siteconf::CheckUrl($url); $urlstat = &CheckServer($url); if($urlstat==$URL_REMOTE){ $at_remote=1; #print "$url is remote\n"; }else{ $at_remote=0; if ($urlstat == $URL_ERROR) { print LOGFILE "Error: Unable to determine status of $url\n"; } # print "$url is local\n"; } @links = split(",",getlinks($file,$url)); # for each link, foreach $link(@links){ #Added by bgopal for testing purposes: Nov 22/1996: 3.15pm if(($link eq "1") || ($link eq " ")) { next; } # first, check if it's excluded $noindex=1; # by default, it's accepted $found =0; # this loop is hacked because while/each doesn't re-enter correctly # print "link: $link\n"; foreach $i (0 .. $#IndexPAT) { $pattern = $IndexPAT[$i]; $allowdeny = $IndexAD[$i]; # "$pattern $allowdeny\n"; if($link=~/$pattern/){ $noindex=$allowdeny; last; } } # skip if denied if ($noindex==0){ # print "Denied\n"; print LOGFILE "Not indexing $link; excluded.\n"; next; } # convert to local or remote #disabled for now print "Checking url $link for remote or local..\n"; # $urlstat = &siteconf::CheckUrl($link); $urlstat = CheckServer($link); #print "urlstat for $link: $urlstat\n"; # $urlstat = $siteconf::URL_REMOTE; $filename=""; if(($urlstat==$URL_REMOTE)||($urlstat==$URL_TRAVERSE)||($urlstat==$URL_SCRIPT)){ print "Url $link is remote...\n"; if(($urlstat==$URL_REMOTE) && ($traverse_type!=1)){ # only do if we're allowing remote print "Skipping non-local url: $link.\n"; next; } # check that we haven't already gotten max if(($NumRemoteCollected >= $remote_limit)&&($urlstat==$URL_REMOTE)){ print ERRFILE "Cannot collect $link; already got maximum number of remote links.\n"; next; } # print LOGFILE "File $link is remote.\n"; print "Getting remote url: $link\n"; # if we're at a remote site and we're not allowed to go out of it, # limit the traversal if($LIMIT_TRAVERSAL && $at_remote){ # if the *current* page is remote, and this link is remote, # check that they're the same site! # print "Examining $link on page $url\n"; # get the sites $link =~ /$SITE_RE/o; $link_site = $1; $url =~ /$SITE_RE/o; $url_site = $1; # if not the same site, go to the next link if($link_site ne $url_site){ print ERRFILE " Cannot go from remote site $url_site to remote site $link_site... skipping $link.\n"; next; } else { #disabled for now print "Same site... okay.\n"; } } # PROBLEM - sometimes we get the same remote file many times! # if remote file, go get it! $filename = &geturl2file($link); # geturl2file puts it into URL2FILE map } if($urlstat==$URL_LOCAL){ # just get the local file name # print "Url $link is local...\n"; $filename = &siteconf::LocalUrl2File($link); # LOGFILE "File $link is local: $filename\n"; # print "Local url: $link, file: $filename\n"; if(!(-e $filename)){ print ERRFILE "Cannot find $link as $filename. Not traversing.\n"; next; } # if ($TOINDEX{$filename} ne "") { # add it if it hasn't already been visited # } # add this mapping to the list ### MDSMITH -- added check for local_limit if($NumLocalCollected >= $local_limit){ print LOGFILE "Cannot collect $link; already collected local maximum.\n"; } else { $URL2FILE{$link}=$filename; $NumLocalCollected +=1; } } if($filename ne ""){ # if we haven't already seen this file, add it to the list # to index, and add it to traversal list if(!defined($TOINDEX{$filename}) || ($TOINDEX{$filename}!=1)){ # add the file name to the list of files to index $TOINDEX{$filename}=1; # use an assoc array to remove dups # push onto the list to traverse # only put on the list if it's not remote, or explicit_only # is turned off ### MDSMITH EXPLICIT_ONLY change if($urlstat!=$URL_REMOTE || $explicit_only==0 ){ $ToTraverse{$link}=1; # hash to remove dups # print "Putting $link on the list to traverse.\n"; # push (@TraverseQ, $link); } } # linksasfiles was unused; replaced with hash FILELINKS --GB 7/5/98 # push onto a list of links #push(@linksasfiles, $filename); $FILELINKS{$file} = $filename; }else{ # filename=""... there was an error print LOGFILE "Error with link: $link. Cannot recognize as local *or* remote.\n"; } } # Added by bgopal, Nov 14 1996 undef @links; @links = (); # # UNUSED --GB 7/5/98 # undef @linksasfiles; # @linksasfiles=(); } my(@TraverseQ) = keys(%ToTraverse); # print "Returning TraverseQ of @TraverseQ\n"; # Added by bgopal, Nov 14 1996 # undef @urllist; return @TraverseQ; }