CGI and Perl

Listing 9.10. Modified crawlIt() function for mirroring a site.

sub crawlIt {
    # Recursively fetch a page, mirror it locally, and follow its links.
    #   $ua             - LWP::UserAgent instance used for all requests
    #   $urlStr         - absolute URL of the page to crawl
    #   $urlLog         - log destination, passed through to recursive calls
    #   $visitedAlready - array ref of URLs already crawled (updated in place)
    # Returns nothing useful; works entirely by side effect (files written
    # via mirrorFile(), $visitedAlready grown, global $title set).
    my ($ua, $urlStr, $urlLog, $visitedAlready) = @_;

    my $request  = HTTP::Request->new('GET', $urlStr);
    my $response = $ua->request($request);
    return unless $response->is_success;

    my $urlData = $response->content();
    my $html    = parse_html($urlData);

    # $title is a package global filled in by searchForTitle() as a side
    # effect of the traversal.
    $title = "";
    $html->traverse(\&searchForTitle, 1);

    mirrorFile($ua, $urlStr);

    foreach (@{ $html->extract_links(qw(a img)) }) {
        # extract_links() yields [link_text, element] pairs.
        my ($link, $linkelement) = @$_;
        my $url = getAbsoluteURL($link, $urlStr);
        next if $url eq "";

        if ($linkelement->tag() eq 'a') {
            # Exact-match membership test. The original escaped regex
            # metacharacters by hand and ran grep through a string eval,
            # which was both fragile and a regex-injection hazard for
            # URLs containing metacharacters; a plain string comparison
            # is what was actually intended.
            unless (grep { $_ eq $url } @$visitedAlready) {
                push @$visitedAlready, $url;
                crawlIt($ua, $url, $urlLog, $visitedAlready);
            }
        }
        elsif ($linkelement->tag() eq 'img') {
            # The original called mirrorFile($url) with one argument,
            # but mirrorFile() expects ($ua, $urlStr) — pass the user
            # agent as well so the mirror actually happens.
            mirrorFile($ua, $url);
        }
    }
}
 sub searchForTitle {
    # Traversal callback for HTML tree traverse(). When the current node
    # is a <title> element, appends the element's text content to the
    # package global $title and returns 0 to stop descending into it;
    # otherwise returns 1 so traversal continues.
    #   $node      - current HTML element (responds to tag() and content())
    #   $startflag - true when entering the element (unused here)
    #   $depth     - traversal depth (unused here)
    my ($node, $startflag, $depth) = @_;

    # lc() replaces the original's global $lwr_tag plus tr/A-Z/a-z/.
    if (lc($node->tag) eq 'title') {
        foreach my $piece (@{ $node->content() }) {
            $title .= $piece;
        }
        return 0;
    }
    return 1;
 }
 sub mirrorFile {
    # Mirror one URL into the local tree, preserving its path component.
    #   $ua     - LWP::UserAgent instance
    #   $urlStr - absolute URL of the resource to mirror
    # Relies on the package global $MIRROR_ROOT (set elsewhere in the
    # file) as the root of the local mirror directory.
    my ($ua, $urlStr) = @_;

    # Direct method-call syntax instead of the indirect "new URI::URL".
    my $url       = URI::URL->new($urlStr);
    my $localpath = $MIRROR_ROOT . $url->path();

    # LWP's mirror() performs a conditional GET and only rewrites the
    # local file when the remote copy is newer.
    $ua->mirror($urlStr, $localpath);
 }

This example of mirroring remote sites might be useful for simple sites consisting only of HTML files. If you need a more sophisticated remote mirroring system, it would be best to use a UNIX-based replication tool such as rdist for your site. If you are running a Windows NT server, comparable replication tools are available for that platform as well.