#!/usr/bin/perl -w

=head1 NAME

ygspider - mirror the "Files" section of a Yahoo! Group

=head1 SYNOPSIS

    ygspider [options] [group] [username] [password]

Where C<username> is your Yahoo! username and C<password> is your
password.

=head1 OPTIONS

=over

=item B<--help>

Generate a help message and exit.

=item B<--root> dir

Save mirrored files in this directory.  Default is a subdirectory of
the current directory named C<groups.yahoo.com>/C<group> where the
final C<group> is the name of the group.

=item B<--group> groupname

Spider this group.  This is an alternative to the positional argument.

=item B<--user> username

Specify a Yahoo! username to use for authentication.  This is an
alternative to the positional argument.

=item B<--password> password

Specify the password for the Yahoo! account.  This is an alternative
to the positional argument.

=back

=head1 AUTHOR

Caleb Epstein E<lt>caleb dot epstein at gmail dot comE<gt>

=head1 VERSION

$Id$

=cut

use strict;
use warnings;

use WWW::Mechanize;
use Getopt::Long;
use File::Basename;
use File::Path qw(make_path);
use File::Spec::Functions;
use Pod::Usage;

my ($group, $login, $passwd);
my $root;
my $progname = basename $0;

GetOptions(
    "group=s"    => \$group,
    "user=s"     => \$login,
    "password=s" => \$passwd,
    "root=s"     => \$root,
    # pod2usage exits on its own, so request the success status directly
    # instead of relying on a trailing (unreachable) "exit 0".
    "help!"      => sub { pod2usage(-exitval => 0, -verbose => 1) },
) or pod2usage(2);

# Anything not supplied via an option is taken from the positional
# arguments, in the order: group, username, password.
if (not defined $group) {
    pod2usage(2) unless scalar @ARGV;
    $group = shift @ARGV;
}

if (not defined $login) {
    pod2usage(2) unless scalar @ARGV;
    $login = shift @ARGV;
}

if (not defined $passwd) {
    pod2usage(2) unless scalar @ARGV;
    $passwd = shift @ARGV;
}

$root = File::Spec->rel2abs("groups.yahoo.com/$group")
    unless defined $root;

print "$progname: mirroring $group files using username $login:\n";

# Unbuffer STDOUT so progress lines appear as they happen.
$| = 1;

# URLs already handled, so that no link is fetched twice.
my %VISITED;

# Links worth following: pages under this group's "files" area, or the
# file-storage host.  The group name comes from the command line, so it
# is quoted to keep regex metacharacters literal.
my $wanted_link_re = qr{
    (?: /group/\Q$group\E/files/
      | grp\.yahoofs\.com
    )
}x;

# mkdir_p ($dir)
#
# Create $dir together with any missing parent directories (mode 0777,
# subject to the umask).  An empty or undefined $dir is a no-op.  Dies
# with File::Path's diagnostic if a directory cannot be created.
sub mkdir_p {
    my ($dir) = @_;

    return unless defined $dir and length $dir;

    make_path($dir, { mode => 0777 });
    return;
}

# mirror ($mech)
#
# Walk every link on $mech's current page.  Links matching neither the
# group's files area nor the storage host, and links already visited,
# are skipped.  A link that looks like a stored file is saved beneath
# $root unless a file of that name already exists; any other matching
# link is treated as a directory listing and mirrored recursively.
# $mech is returned to the page it started on after each fetch.
sub mirror {
    my ($mech) = @_;

    # Take a copy of the link list: the current page of $mech changes
    # while we navigate away and back.
    my @links = @{ $mech->links };

    LINK:
    for my $link (@links) {
        my $url = $link->url_abs();

        next LINK if $url !~ $wanted_link_re;
        next LINK if exists $VISITED{$url};
        $VISITED{$url} = 1;

        if ($url =~ m{http://.+\.grp\.yahoofs\.com/v1/[\w_-]{94}/(.+)$}) {
            my $relative = $1;

            # The path is dictated by the remote page; never let it
            # climb out of the mirror root.
            if (grep { $_ eq '..' } split m{/}, $relative) {
                warn "$progname: skipping suspicious path in $url\n";
                next LINK;
            }

            my $filename = catfile($root, $relative);
            next LINK if -e $filename;

            my $dir = dirname $filename;
            mkdir_p($dir) unless -d $dir;

            print "FILE: $url -> $filename\n";

            # Fetch by absolute URL rather than by link index; the
            # index-based follow() method is deprecated.
            $mech->get($url) or die "$progname: cannot fetch $url\n";

            open my $out, '>', $filename
                or die "open $filename: $!\n";
            # Mirrored files may be binary; avoid newline translation.
            binmode $out;
            print {$out} $mech->content()
                or die "write $filename: $!\n";
            close $out
                or die "close $filename: $!\n";

            $mech->back();
        }
        else {
            print "DIR: $url\n";
            $mech->get($url) or die "$progname: cannot fetch $url\n";
            mirror($mech);
            $mech->back();
        }
    }

    return;
}

my $mech = WWW::Mechanize->new(
    agent   => "Mozilla",
    onerror => sub { die @_ },
) or die "$progname: cannot create WWW::Mechanize object\n";

my $files_url = "http://groups.yahoo.com/group/$group/files/";

print "ROOT: $files_url\n";

# Fetch main page, but we will need to authenticate
$mech->get($files_url)
    or die "$progname: cannot fetch $files_url\n";

# Follow login link
$mech->follow_link(url_regex => qr/login\.yahoo\.com/)
    or die "$progname: no login link found on $files_url\n";

# Submit login form
$mech->submit_form(
    form_name => 'login_form',
    fields    => {
        login  => $login,
        passwd => $passwd,
    },
) or die "$progname: login form submission failed\n";

# Re-fetch main page and start mirroring
$mech->get($files_url)
    or die "$progname: cannot fetch $files_url\n";

mirror($mech);