root/trunk/plagger/lib/Plagger/Plugin/Filter/FetchEnclosure/Wget.pm

Revision 1734 (checked in by miyagawa, 5 years ago)

added Test::Spelling and fixed typos

Line 
1 package Plagger::Plugin::Filter::FetchEnclosure::Wget;
2 use strict;
3 use base qw(Plagger::Plugin::Filter::FetchEnclosure);
4
5 use POE;
6 use POE::Session;
7 use POE::Wheel::Run;
8
# Subscribe this plugin's callbacks to the update hooks: enqueue() is
# registered for 'update.entry.fixup' and fetch() for 'update.fixup'.
sub register {
    my $self    = shift;
    my $context = shift;

    my @hooks = (
        'update.entry.fixup' => \&enqueue,
        'update.fixup'       => \&fetch,
    );

    $context->register_hook($self, @hooks);
}
17
# Hook for 'update.entry.fixup': queue one wget download per enclosure.
#
# For each enclosure of $args->{entry} this makes sure the per-feed download
# directory exists, skips files that are already stored with the advertised
# length, and otherwise creates a POE session that spawns wget.  No download
# happens here; the sessions only make progress once POE::Kernel->run is
# called.
#
# Arguments:
#   $self    - this plugin; reads conf keys 'dir' and 'fake_referer'
#   $context - object providing ->log and ->conf ('user_agent' => 'cookies')
#   $args    - hashref carrying {entry} and {feed}
sub enqueue {
    my($self, $context, $args) = @_;

    for my $enclosure ($args->{entry}->enclosures) {
        # TODO: do all of this in the base class ::Command
        my $feed_dir = File::Spec->catfile($self->conf->{dir}, $args->{feed}->id_safe);
        unless (-d $feed_dir) {
            $context->log(info => "mkdir $feed_dir");
            unless (mkdir $feed_dir, 0777) {
                # Capture errno before the -d test below can overwrite it.
                my $mkdir_error = "$!";
                # A directory created concurrently is not an error.
                unless (-d $feed_dir) {
                    $context->log(error => "mkdir $feed_dir failed: $mkdir_error");
                    next;
                }
            }
        }

        my $path = File::Spec->catfile($feed_dir, $enclosure->filename);

        # Skip the download when a file of the advertised size is already there.
        if ($enclosure->length && -e $path && -s _ == $enclosure->length) {
            # TODO: if-none-match
            $context->log(debug => $enclosure->url . " is already stored in $path");
            next;
        }

        $context->log(info => "fetch " . $enclosure->url . " to " . $path);

        my $referer;
        if ($self->conf->{fake_referer}) {
            $context->log(debug => "Sending Referer: " . $args->{entry}->permalink);
            $referer = $args->{entry}->permalink;
        }

        # Only a Mozilla-format cookie jar is handed to wget, by file name.
        my $cookies;
        my $conf = $context->conf->{user_agent} || {};
        if ($conf->{cookies}) {
            my $cookie_jar = Plagger::Cookies->create($conf->{cookies});
            if ($cookie_jar->isa('HTTP::Cookies::Mozilla')) {
                $cookies = $cookie_jar->{file};
                $context->log(debug => "Using cookie file $cookies");
            }
        }

        # TODO: max connections per domain to respect RFC
        POE::Session->create(
            inline_states => {
                _start => sub {
                    # List-form Program: the arguments are passed as a list,
                    # not as one interpolated command string.
                    $_[HEAP]->{wheel} = POE::Wheel::Run->new(
                        Program => [
                            'wget',
                            $enclosure->url,
                            '-O', $path,
                            '--verbose',
                            '--continue',
                            '--timestamping',
                            '--tries', 5,
                            ($referer ? ('--referer', $referer) : ()),
                            ($cookies ? ('--load-cookies', $cookies) : ())
                        ],
                        StderrEvent => 'stderr',
                        ErrorEvent => 'wheel_close',
                        CloseEvent => 'wheel_close',
                    );
                },
                stderr => sub {
                    my $line = $_[ARG0];

                    if ($line =~ /The file is already fully retrieved/) {
                        # ok
                    }
                    elsif ($line =~ /^Length: ([\d,]+)[^\[]*\[(.*?)\]/) {
                        # $1 is the byte count (possibly with thousands
                        # separators), $2 the MIME type.  Anything between
                        # them, e.g. a human-readable size, is ignored.
                        my($length, $mime_type) = ($1, $2);
                        $length =~ tr/,//d;
                        $enclosure->length($length);
                        $enclosure->type($mime_type);
                    }
                    elsif ($line =~ m!\Q$path\E\S* saved \[(\d+)(?:/\d+)?\]!) {
                        # \S* absorbs whatever closing quote wget printed
                        # after the path; the "/total" part is optional.
                        my $length = $1;
                        $enclosure->local_path($path);
                        $context->log(info => "Download to $path is done [$length]");
                    }

                    # Every wget line is also forwarded to the debug log.
                    $context->log(debug => $line);
                },
                wheel_close => sub {
                    # Drop the wheel stored in _start.
                    delete $_[HEAP]->{wheel};
                },
            },
        );
    }
}
101
# Hook for 'update.fixup': run the POE kernel, logging an info message
# before it starts and another after it returns.
sub fetch {
    my $app = 'Plagger';

    $app->context->log(info => "Start downloading files using wget.");

    POE::Kernel->run();

    return $app->context->log(info => "w00t! Downloading finished.");
}
107
108 1;
109
110 __END__
111
112 =head1 NAME
113
114 Plagger::Plugin::Filter::FetchEnclosure::Wget - Fetch enclosures using wget
115
116 =head1 SYNOPSIS
117
118   - module: Filter::FetchEnclosure::Wget
119     config:
120       dir: /path/to/download
121       concurrency: 5
122       max_requests_per_host: 2
123
124 =head1 DESCRIPTION
125
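126 This plugin uses the C<wget> command to download enclosure files into a per-feed subdirectory of C<dir>. Set C<fake_referer> to send the entry's permalink as the Referer header.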
127
128 =head1 AUTHOR
129
130 Tatsuhiko Miyagawa
131
132 =head1 SEE ALSO
133
134 L<Plagger>
135
136 =cut
137
Note: See TracBrowser for help on using the browser.