2009年8月1日星期六

[PerlChina] 贡献一个邪恶的脚本,呵呵

#!/usr/local/ActivePerl-5.10/bin/perl
#auto download pictures from http://e.psbbs.info/
#by
shanleiguang@gmail.com
use strict;
use warnings;

use threads;
use WWW::Mechanize;
use POSIX qw(strftime);

$| = 1;

my $today = strftime "%Y%m%d", localtime;

mkdir $today if(not -d $today);

my $pager = $ARGV[0] ? $ARGV[0] : 1; #页码,默认为1
my $url = "http://rr.pstang.com/thread.php?fid=39&search=&page=$pager";
my $mech = WWW::Mechanize->new(quite => 1, onerror => undef);
my @mechs;

#创建4个实例用于多线程下载图片
foreach (0..3) {
my $m = WWW::Mechanize->new(quite => 1, onerror => undef, timeout => 180);
$m->agent_alias('Windows IE 6');
$m->stack_depth(2);
push @mechs, $m;
}

$mech->agent_alias('Windows IE 6');
$mech->stack_depth(2);
print getts(), ", get baseURL '$url' ... ";
$mech->get($url);
print "done\n";

my $content = $mech->content();

$content =~ s/\n/ /g;
$content =~ s/\s{2,}/ /g;
$content = $1 if($content =~ m{普通主题(.*)$}); #取出'普通主题'关键字之后的内容

my @imgs; #一个帖子需下载的图片组
my %tids_downloaded; #记录已下载过的帖子的ID

open DOWNLOADED_R, "< downloaded.txt";
while(<DOWNLOADED_R>) {
chomp;
$tids_downloaded{$_} = 1;
}
close DOWNLOADED_R;
open DOWNLOADED_W, ">> downloaded.txt";
while($content =~ m{<tr align=center class=t_two>.*?<a href='read.php\?tid=(\d+)' id=''>}ig) {
my $thread_id = $1;
my $thread_url = "http://rr.pstang.com/read.php?tid=$thread_id";

print "\t", getts(), ", get threadURL '$thread_url' ... ";
$mech->get($thread_url);
print "done\n";

my $t_content = $mech->content();
my $uid;

$t_content =~ s/\n/ /g;
$t_content =~ s/\s{2,}/ /g;
if($t_content =~ m{本主题帖为未审核新贴}) { #忽略未审核新帖
print "\tskipped\n";
next;
}
if(-d "$thread_id") { #忽略先前下载过的帖子
print "\tdownloaded\n";
next;
}
if($t_content =~ m{<font face=Gulim><b>(.*?)</b></font>}) {
$uid = $1;
$uid =~ s/\s//g;
}
my $thread_dir = "$uid - $thread_id";
mkdir("$today/$thread_dir"); #为新帖创建单独目录

#@task_groups:按10个实例进行分组,为每个实例分配任务(要下载的图片)
#@threads:线程数组
my (@task_groups, @threads);

foreach my $img ($mech->find_all_images()) {
my $img_url = $img->url();
my @fs = split /\//, $img_url;
my $img_name = $fs[-1];

if($img_name =~ m{\.jpg$}i) { #只需要JPG图片
next if($img_name =~ m{luo.*?\.jpg$}i); #过滤不需要的论坛图片
push @imgs, [$img_url, "$today/$thread_dir/$img_name"]; #图片URL地址及本地保存路径
}
}
foreach my $i (0..$#imgs) {
my $gid = $i - int($i/4)*4; #$i mod 4
push @{$task_groups[$gid]}, $i;
}
foreach my $gid (0..$#task_groups) {
my $t = threads->create('get_image', $mechs[$gid], $task_groups[$gid]);
push @threads, $t;
}
while(my $t = shift @threads) { $t->join; } #join每个线程
@imgs = (); #处理下一个帖子前清空@imgs
print DOWNLOADED_W "$thread_id\n";
$tids_downloaded{$thread_id} = 1;
sleep(2); #be polite
}
close(DOWNLOADED_W);

sub getts { return strftime "%H:%M:%S", localtime; }

sub get_image {
my ($mech, $task_group) = @_;

foreach my $img_id (@{$task_group}) {
my ($img_url, $save_path) = @{$imgs[$img_id]};

$mech->get($img_url, ':content_file' => $save_path);
print "\t\t", getts(), ", got img '$img_url'\n";
}
}
你为浏览论坛看图不便而郁闷吗?贡献一个邪恶的脚本,呵呵

没有评论: