use HTTP::Request;
use HTML::Parse;     # provides parse_html()
use URI::URL;

sub crawlIt {
    my ($ua, $urlStr, $urlLog, $visitedAlready) = @_;

    # Fetch the page
    my $request  = new HTTP::Request 'GET', $urlStr;
    my $response = $ua->request($request);
    if ($response->is_success) {
        my ($urlData) = $response->content();
        my ($html)    = parse_html($urlData);

        # Record the document title and save a local copy of this page
        $title = "";
        $html->traverse(\&searchForTitle, 1);
        &mirrorFile($ua, $urlStr);

        # Follow anchor links recursively; mirror images where they appear
        foreach (@{ $html->extract_links(qw(a img)) }) {
            my ($link, $linkelement) = @$_;
            if ($linkelement->tag() eq 'a') {
                my ($url) = &getAbsoluteURL($link, $urlStr);
                if ($url ne "") {
                    # Escape regex metacharacters before matching against
                    # the list of URLs already visited
                    my $escapedURL = $url;
                    $escapedURL =~ s/\//\\\//g;
                    $escapedURL =~ s/\?/\\\?/g;
                    $escapedURL =~ s/\+/\\\+/g;
                    if (eval "grep(/$escapedURL/,\@\$visitedAlready)" == 0) {
                        push(@$visitedAlready, $url);
                        &crawlIt($ua, $url, $urlLog, $visitedAlready);
                    }
                }
            }
            elsif ($linkelement->tag() eq 'img') {
                my ($url) = &getAbsoluteURL($link, $urlStr);
                if ($url ne "") {
                    &mirrorFile($ua, $url);
                }
            }
        }
    }
}

sub searchForTitle {
    my ($node, $startflag, $depth) = @_;
    my $lwr_tag = $node->tag;
    $lwr_tag =~ tr/A-Z/a-z/;
    if ($lwr_tag eq 'title') {
        foreach (@{ $node->content() }) {
            $title .= $_;
        }
        return 0;    # stop traversing once the title has been collected
    }
    return 1;
}

sub mirrorFile {
    my ($ua, $urlStr) = @_;
    my ($url)       = new URI::URL $urlStr;
    my ($localpath) = $MIRROR_ROOT;
    $localpath .= $url->path();
    $ua->mirror($urlStr, $localpath);
}
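The listing above defines only the subroutines; a short driver is still needed to create the user agent, set $MIRROR_ROOT, and seed the visited-URL list before calling crawlIt(). The following is a minimal sketch of such a driver; the mirror directory, log file name, and starting URL are placeholders chosen for illustration, not values from the original program:

    use LWP::UserAgent;

    $MIRROR_ROOT = "/tmp/mirror";                 # assumed local root for mirrored files
    my $ua       = new LWP::UserAgent;
    my @visited  = ("http://www.example.com/");   # seed so the start page is not revisited

    &crawlIt($ua, "http://www.example.com/", "crawl.log", \@visited);

Seeding @visited with the starting URL prevents the crawler from descending into the start page again if some page links back to it.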
This mirroring example is useful for simple sites that consist only of HTML files. If you need a more sophisticated remote mirroring system, a UNIX-based replication tool such as rdist is a better choice. Similar replication tools are also available for Windows NT servers.