root/trunk/plagger/lib/Plagger/Plugin/Filter/FetchEnclosure/Xango.pm

Revision 873 (checked in by miyagawa, 2 years ago)

FetchEnclosure::Xango: keep enclosure and local_path when the response is redirected. Refs #285

  • Property svn:keywords set to Id
Line 
1 # $Id$
2 #
3 # Copyright (c) 2006 Daisuke Maki <dmaki@cpan.org>
4 # All rights reserved.
5
6 package Plagger::Plugin::Filter::FetchEnclosure::Xango;
7 use strict;
8 use base qw(Plagger::Plugin::Filter::FetchEnclosure);
9 BEGIN { sub Xango::DEBUG{ 1 } }
10 use Xango::Broker::Push;
11
# Plugin entry point: spawns the crawler (handler) session and the Xango push
# broker, then hooks this plugin into the update cycle.
#
#   $self    - this plugin instance; settings are read through $self->conf
#   $context - object providing register_hook()
#
# Configuration keys read here:
#   timeout      - HTTP timeout handed to the broker (default 10)
#   xango_args   - hashref merged over the broker defaults (may override Alias)
#   use_cache    - passed to the crawler as UseCache (default 1 when absent)
#   max_redirect - passed to the crawler as MaxRedirect (default 3)
#
# Returns whatever register_hook() returns.
sub register {
    my($self, $context) = @_;

    my @http_args = (
        Agent   => "Plagger/$Plagger::VERSION (http://plagger.org/)",
        Timeout => $self->conf->{timeout} || 10,
    );
    my $overrides = $self->conf->{xango_args} || {};

    # User supplied xango_args come last so they win over the defaults.
    my %broker_args = (
        Alias        => 'xgbroker',
        HandlerAlias => 'xghandler',
        HttpCompArgs => \@http_args,
        %$overrides,
    );

    # Remembered so enqueue() knows where to post its jobs.
    $self->{xango_alias} = $broker_args{Alias};

    # use_cache defaults to on; an explicit false value in the config turns it off.
    my $use_cache = 1;
    $use_cache = $self->conf->{use_cache} if exists $self->conf->{use_cache};

    Plagger::Plugin::Filter::FetchEnclosure::Xango::Crawler->spawn(
        Plugin      => $self,
        BrokerAlias => $broker_args{Alias},
        UseCache    => $use_cache,
        MaxRedirect => $self->conf->{max_redirect} || 3,
    );
    Xango::Broker::Push->spawn(%broker_args);

    return $context->register_hook(
        $self,
        'update.entry.fixup' => \&enqueue,
        'update.fixup'       => \&fetch,
    );
}
37
# Hook for 'update.entry.fixup': queues one Xango download job per enclosure
# of the entry.
#
#   $self    - plugin instance; reads conf->{dir} and conf->{fake_referer}
#   $context - used for logging
#   $args    - hashref holding 'entry' and 'feed'
#
# Every job carries these notes for the crawler session:
#   path      - local file the enclosure is saved to
#   enclosure - the enclosure object, updated once the download succeeds
#   redirect  - number of redirects followed so far (starts at 0)
#   referer   - entry permalink, only when conf->{fake_referer} is set
sub enqueue
{
    my($self, $context, $args) = @_;

    my @enclosures = $args->{entry}->enclosures;
    return unless @enclosures;

    # The target directory depends only on the feed, so resolve it once.
    my $feed_dir = File::Spec->catfile($self->conf->{dir}, $args->{feed}->id_safe);
    unless (-e $feed_dir && -d _) {
        $context->log(info => "mkdir $feed_dir");
        # Not fatal: the crawler tries to create the directory again right
        # before writing the file.
        mkdir $feed_dir, 0777
            or $context->log(warn => "failed to mkdir $feed_dir: $!");
    }

    # Optional notes shared by every job of this entry.
    my %job_args;
    if ($self->conf->{fake_referer}) {
        $context->log(debug => "Sending Referer: " . $args->{entry}->permalink);
        $job_args{referer} = $args->{entry}->permalink;
    }

    for my $enclosure (@enclosures) {
        my $path = File::Spec->catfile($feed_dir, $enclosure->filename);
        $context->log(info => "fetch " . $enclosure->url . " to " . $path);

        my $job = Xango::Job->new(
            uri       => URI->new($enclosure->url),
            redirect  => 0,
            path      => $path,
            enclosure => $enclosure,
            %job_args,   # was built but never handed to the job before
        );

        POE::Kernel->post($self->{xango_alias}, 'enqueue_job', $job);
    }

    return;
}
67
# Hook for 'update.fixup': starts the POE kernel so the jobs posted by
# enqueue() get processed. Returns whatever POE::Kernel->run returns.
# NOTE(review): presumably this blocks until all sessions have finished --
# confirm against the POE documentation.
sub fetch {
    return POE::Kernel->run;
}
69
70 package Plagger::Plugin::Filter::FetchEnclosure::Xango::Crawler;
71 use strict;
72 use POE;
73 use File::Path qw(mkpath);
74 use File::Basename qw(dirname);
75
# Session state 'apply_policy': unconditionally answers with a true value.
# NOTE(review): presumably this tells the broker that every job may be
# fetched -- confirm against the Xango handler contract.
sub apply_policy {
    return 1;
}
# Creates the POE session whose states are implemented by this package.
#
# Named arguments:
#   Plugin      - the FetchEnclosure::Xango plugin instance (heap: PLUGIN)
#   UseCache    - whether conditional-GET caching is used (heap: USE_CACHE)
#   BrokerAlias - alias follow-up jobs are posted to (heap: BROKER_ALIAS)
#   MaxRedirect - redirect limit (heap: MaxRedirect)
#
# Returns the result of POE::Session->create.
sub spawn {
    my ($class, %args) = @_;

    my %heap = (
        PLUGIN       => $args{Plugin},
        USE_CACHE    => $args{UseCache},
        BROKER_ALIAS => $args{BrokerAlias},
        MaxRedirect  => $args{MaxRedirect},
    );
    my @states = qw(_start _stop apply_policy prep_request handle_response);

    return POE::Session->create(
        heap           => \%heap,
        package_states => [ $class => \@states ],
    );
}
93
# Session start-up: registers this session under the alias 'xghandler'.
# The alias is hard-coded here; it matches the default HandlerAlias that the
# plugin's register() hands to the broker.
sub _start {
    my $kernel = $_[KERNEL];
    return $kernel->alias_set('xghandler');
}
# Session shutdown: nothing to clean up.
sub _stop {
    return;
}
# Session state 'prep_request': decorates the outgoing request of a job.
#
#   ARG0 - the job
#   ARG1 - the request object about to be sent
#
# Adds a Referer header when the job carries a 'referer' note and, when
# caching is enabled, the conditional-GET headers (If-Modified-Since /
# If-None-Match) taken from the plugin cache entry for the job URI.
sub prep_request {
    my $job = $_[ARG0];
    my $req = $_[ARG1];

    # The Referer has nothing to do with caching, so it is set before the
    # cache check; previously disabling the cache also dropped the Referer.
    my $referer = $job->notes('referer');
    $req->header(Referer => $referer) if $referer;

    return unless $_[HEAP]->{USE_CACHE};

    my $plugin = $_[HEAP]->{PLUGIN};
    my $ref = $plugin->cache->get($job->uri);
    if ($ref) {
        $req->if_modified_since($ref->{LastModified})
            if $ref->{LastModified};
        $req->header('If-None-Match', $ref->{ETag})
            if $ref->{ETag};
    }

    return;
}
114
# Session state 'handle_response': consumes the HTTP response of a job.
#
#   ARG0 - the job; its notes 'http_response', 'redirect', 'path',
#          'enclosure' and (optionally) 'referer' are read
#
# Redirect responses are followed by posting a new job to the broker, up to
# the MaxRedirect limit stored in the heap. Successful responses are written
# to the job's local path, after which the enclosure gets its local_path (and
# length, when Content-Length is present) and, if caching is enabled, the
# ETag / Last-Modified validators are stored in the plugin cache. Every other
# response is ignored.
sub handle_response {
    my $job    = $_[ARG0];
    my $plugin = $_[HEAP]->{PLUGIN};

    my $r = $job->notes('http_response');

    # 301/302/303/307/308 all mean "repeat the GET at Location".
    if ($r->code =~ /^30[12378]$/) {
        # The limit only applies when another redirect is about to be
        # followed, so the final response of a chain of exactly MaxRedirect
        # redirects is still processed.
        my $followed = $job->notes('redirect') || 0;
        return if $followed >= $_[HEAP]->{MaxRedirect};

        my $location = $r->header('location');
        return unless defined $location && length $location;

        # Location may be relative; resolve it against the current URI.
        my $url = URI->new_abs($location, $job->uri);
        return unless ($url->scheme || '') =~ /^https?$/i;

        my %notes = (
            redirect  => $followed + 1,
            path      => $job->notes('path'), # TODO: rewrite path with the new URL? respect Content-Disposition?
            enclosure => $job->notes('enclosure'),
        );
        # Keep sending the same Referer on the redirected request.
        my $referer = $job->notes('referer');
        $notes{referer} = $referer if $referer;

        my $new_job = Xango::Job->new(uri => $url, %notes);
        $_[KERNEL]->post($_[HEAP]->{BROKER_ALIAS}, 'enqueue_job', $new_job);
        return;
    }

    return unless $r->is_success;

    my $local_path = $job->notes('path');

    my $dir = dirname($local_path);
    if (!-d $dir) {
        # mkpath croaks on failure; trap that so one bad directory does not
        # abort the whole POE run.
        eval { mkpath([$dir], 0, 0777) };
        if (!-d $dir || !-w _) {
            my $err = $@ || $!;
            $plugin->log(warn => "failed to create directory $dir: $err");
            return;
        }
    }

    open(my $fh, ">", $local_path);
    if (! $fh) {
        $plugin->log(warn => "failed to open $local_path for writing: $!");
        return;
    }
    # Enclosures are typically binary (audio/video); avoid newline translation.
    binmode($fh);

    my $write_error;
    print {$fh} $r->content or $write_error = $!;
    close($fh)              or $write_error ||= $!;
    if ($write_error) {
        # Do not leave a truncated file behind or advertise it as local_path.
        $plugin->log(warn => "failed to write $local_path: $write_error");
        unlink $local_path;
        return;
    }

    my $enclosure = $job->notes('enclosure');
    $enclosure->local_path($local_path);
    # Fix length if it's broken
    if ($r->header('Content-Length')) {
        $enclosure->length($r->header('Content-Length'));
    }

    if ($_[HEAP]->{USE_CACHE}) {
        $plugin->cache->set(
            $job->uri,
            {
                ETag         => $r->header('ETag'),
                LastModified => $r->header('Last-Modified'),
            }
        );
    }

    return;
}
173
174 1;
175
176 1;
Note: See TracBrowser for help on using the browser.