root/trunk/plagger/lib/Plagger/Plugin/Filter/FindEnclosures.pm

Revision 2063 (checked in by jesse, 1 year ago)

Added support for a URL whitelist for the enclosures that FindEnclosures should find

Line 
1 package Plagger::Plugin::Filter::FindEnclosures;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use HTML::TokeParser;
6 use Plagger::Util qw( decode_content );
7 use List::Util qw(first);
8 use URI;
9 use DirHandle;
10 use Plagger::Enclosure;
11 use Plagger::UserAgent;
12
# Plugin registration: make sure Filter::ResolveRelativeLink is loaded and
# hook filter() into the 'update.entry.fixup' phase.
sub register {
    my ($self, $context) = @_;

    my %dependency = ( module => 'Filter::ResolveRelativeLink' );
    $context->autoload_plugin(\%dependency);

    $context->register_hook($self, 'update.entry.fixup' => \&filter);
}
22
# Per-instance setup: run the parent class initialisation with the same
# arguments, load the site plugins from the assets directory and keep a
# user agent on the instance.
sub init {
    my ($self, @args) = @_;

    $self->SUPER::init(@args);
    $self->load_plugins();

    $self->{ua} = Plagger::UserAgent->new;
}
30
# Load every site plugin found in the assets directory.  Only regular files
# whose name ends in .pl or .yaml are considered, in sorted name order; each
# one is handed to load_plugin() as (full path, bare file name).
sub load_plugins {
    my $self = shift;
    my $context = Plagger->context;

    my $assets = $self->assets_dir;
    my $handle = DirHandle->new($assets);
    $context->error("$assets: $!") unless $handle;

    # Select the candidate files first, then load them.
    my @candidates;
    for my $name (sort $handle->read) {
        my $path = File::Spec->catfile($assets, $name);
        next unless -f $path;
        next unless $path =~ /\.(?:pl|yaml)$/;
        push @candidates, [ $path, $name ];
    }

    for my $candidate (@candidates) {
        $self->load_plugin(@$candidate);
    }
}
42
# Load one plugin file and append whatever the loader returns to
# $self->{plugins}.  Files ending in .pl go to load_plugin_perl(), anything
# else to load_plugin_yaml().
sub load_plugin {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my $loader = 'load_plugin_yaml';
    $loader = 'load_plugin_perl' if $file =~ /\.pl$/;

    my @loaded = $self->$loader($file, $base);
    push @{ $self->{plugins} }, @loaded;
}
51
# Load a site plugin written in Perl.
#
# $file is the path to read, $base the bare file name; the plugin class name
# is derived from $base with its .pl suffix removed.  If the file does not
# start with its own "package" statement, it is wrapped in a generated
# package that inherits from ...::FindEnclosures::Site and defines site_name.
# Returns a new instance of the plugin class.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    # NOTE: the lexical variable names in this sub are kept unchanged on
    # purpose -- the string eval below compiles the asset inside this scope,
    # so the evaluated code is able to see them.
    my $fh;
    open $fh, '<', $file or Plagger->context->error("$file: $!");

    my $pkg = $base;
    $pkg =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::FindEnclosures::Site::$pkg";

    my $code = join '', <$fh>;
    if ($code !~ /^\s*package/s) {
        my @wrapped = (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::FindEnclosures::Site );",
            "sub site_name { '$pkg' }",
            $code,
            "1;",
        );
        $code = join "\n", @wrapped;
    }

    eval $code;
    if ($@) {
        Plagger->context->error($@);
    }

    return $plugin_class->new;
}
75
# YAML site plugins are not supported yet: report an error through the
# Plagger context.
sub load_plugin_yaml {
    my $context = Plagger->context;
    $context->error("NOT IMPLEMENTED YET");
}
77
# Hook body for 'update.entry.fixup': look for enclosures in the entry's
# permalink and in the <a>, <embed>, <img> and <object> tags of its body.
sub filter {
    my($self, $context, $args) = @_;

    my $entry = $args->{entry};

    # the permalink itself may point straight at a media file, so treat it
    # like an <a href="..."> link
    $self->add_enclosure($entry, [ 'a', { href => $entry->permalink } ], 'href');

    return unless $entry->body;

    my $html   = $entry->body->data;
    my $parser = HTML::TokeParser->new(\$html);

    # one handler per tag name; each receives the token from get_tag
    my %handler_for = (
        a      => sub { $self->add_enclosure($entry, $_[0], 'href') },
        embed  => sub { $self->add_enclosure($entry, $_[0], 'src', { type => $_[0]->[1]->{type} }) },
        img    => sub { $self->add_enclosure($entry, $_[0], 'src', { inline => 1 }) },
        object => sub { $self->add_enclosure_from_object($entry, $parser) },
    );

    while (my $tag = $parser->get_tag(qw( a embed img object ))) {
        my $handler = $handler_for{ $tag->[0] } or next;
        $handler->($tag);
    }
}
99
# Find a movie URL among the <param> tags of an <object> element and add it
# to $entry as an enclosure.
#
# $parser is the token parser positioned just after the opening <object>
# tag; <param> tags are consumed up to the matching </object>.  The URL is
# taken from the "flashvars" parameter when it contains one (a value that
# looks like an .flv URL wins over any other http/https URL), otherwise from
# the "movie" parameter.  Nothing is added when no URL is found or when the
# URL is rejected by url_whitelisted().
sub add_enclosure_from_object {
    my($self, $entry, $parser) = @_;

    # get param tags and find appropriate FLV movies
    my @params;
    while (my $tag = $parser->get_tag('param', '/object')) {
        last if $tag->[0] eq '/object';
        push @params, $tag;
    }

    # find URL inside flashvars parameter
    my $url;
    if (my $flashvars = first { lc($_->[1]->{name} || '') eq 'flashvars' } @params) {
        # flashvars is a query-string style list ("key=value&key=value").
        # Split the pairs on "&" and each pair on its first "=" only, so a
        # value that itself contains "=" stays in one piece, and keep the
        # source order so the pick below does not depend on hash ordering.
        my @values = grep { defined }
                     map  { (split /=/, $_, 2)[1] }
                     split /&/, $flashvars->[1]->{value} || '';

        # "\.flv", not "\flv": "\f" is the form-feed escape and never matched
        $url   = first { m!^https?://.*\.flv! } @values;
        $url ||= first { m!^https?://.*! } @values;
    }

    # if URL isn't found in flash vars, then fallback to <param name="movie" />
    if (!$url) {
        my $movie = first { lc($_->[1]->{name} || '') eq 'movie' } @params;
        $url = $movie->[1]->{value} if $movie;
    }

    # nothing usable in this <object>; do not run the whitelist on undef
    return unless $url;
    return unless $self->url_whitelisted($url);

    Plagger->context->log(info => "Found enclosure $url");
    my $enclosure = Plagger::Enclosure->new;
    $enclosure->url( URI->new($url) );
    $enclosure->auto_set_type;
    $entry->add_enclosure($enclosure); # XXX inline?
}
134
135
# Add the URL held in attribute $attr of the parsed tag $tag to $entry as an
# enclosure, if it qualifies.
#
#   $tag  - token as [ tag name, { attribute => value } ]
#   $attr - name of the attribute that carries the URL ('href' or 'src')
#   $opt  - optional hash ref: 'type' (MIME type hint) and 'inline' (flag
#           the enclosure as inline)
#
# Tags without a URL, and URLs rejected by url_whitelisted(), are ignored.
# A URL that is_enclosure() accepts is added directly.  Otherwise the loaded
# site plugins are tried in order and the first one that finds an enclosure
# wins.
sub add_enclosure {
    my($self, $entry, $tag, $attr, $opt) = @_;
    $opt ||= {};

    # Tags such as <a name="..."> (or an entry without a permalink) carry no
    # URL at all: there is nothing to whitelist, classify or hand to plugins.
    my $url = $tag->[1]{$attr};
    return unless defined $url && length $url;

    return unless $self->url_whitelisted($url);

    if ($self->is_enclosure($tag, $attr, $opt->{type})) {
        Plagger->context->log(info => "Found enclosure $url");
        my $enclosure = Plagger::Enclosure->new;
        $enclosure->url($url);
        $enclosure->auto_set_type($opt->{type});
        $enclosure->is_inline(1) if $opt->{inline};
        $entry->add_enclosure($enclosure);
        return;
    }

    # fetched at most once, and only if some plugin asks for it
    my $content;
    for my $plugin (@{$self->{plugins}}) {
        next unless $plugin->handle($url);

        Plagger->context->log(debug => "Try $url with " . $plugin->site_name);
        if ($plugin->needs_content) {
            $content ||= $self->fetch_content($url) or return;
        }

        if (my $enclosure = $plugin->find({ content => $content, url => $url })) {
            Plagger->context->log(info => "Found enclosure " . $enclosure->url ." with " . $plugin->site_name);
            $entry->add_enclosure($enclosure);
            return;
        }
    }

    return;
}
169
# Check $url against the optional 'enclosure_whitelist' config entry, which
# is used as a regular expression.  Returns 1 when no whitelist is configured
# or when the URL matches it; returns nothing otherwise.
sub url_whitelisted {
    my ($self, $url) = @_;

    my $regex = $self->conf->{enclosure_whitelist};
    return 1 unless $regex;

    return 1 if $url =~ qr/$regex/;
    return;
}
179
# Fetch $url and return its decoded content, or nothing when the response
# has no status and reports an error.
sub fetch_content {
    my($self, $url) = @_;

    # Reuse the user agent stored on the instance (init() creates one)
    # instead of building a new one per fetch; create it on demand if absent.
    my $ua  = $self->{ua} ||= Plagger::UserAgent->new;
    my $res = $ua->fetch($url, $self, { NoNetwork => 3 * 60 * 60 });
    return if !$res->status && $res->is_error;

    return decode_content($res);
}
189
# Decide whether the URL in attribute $attr of $tag is an enclosure: true
# when the tag has rel="enclosure", or when has_enclosure_mime_type() accepts
# the URL (with the optional MIME $type hint).  Returns 1 or nothing.
sub is_enclosure {
    my($self, $tag, $attr, $type) = @_;

    my $rel    = $tag->[1]{rel};
    my $marked = $rel && $rel eq 'enclosure';

    return 1 if $marked || $self->has_enclosure_mime_type($tag->[1]{$attr}, $type);
    return;
}
198
# Work out a MIME type -- from the explicit $type when given, otherwise from
# $url via Plagger::Util::mime_type_of -- and return what
# Plagger::Util::mime_is_enclosure says about it.
sub has_enclosure_mime_type {
    my($self, $url, $type) = @_;

    my $mime;
    if ($type) {
        $mime = MIME::Type->new(type => $type);
    }
    else {
        $mime = Plagger::Util::mime_type_of( URI->new($url) );
    }

    return Plagger::Util::mime_is_enclosure($mime);
}
205
# Base class for site plugins.  The defaults handle no URL, find nothing and
# ask for the page content to be fetched.
package Plagger::Plugin::Filter::FindEnclosures::Site;

sub new {
    my $class = shift;
    return bless {}, $class;
}

# whether this plugin wants to look at the given URL
sub handle { return 0 }

# return an enclosure for the given { content, url }, or nothing
sub find { return }

# whether find() needs the fetched content of the URL
sub needs_content { return 1 }
211
212 1;
213
214 __END__
215
216 =head1 NAME
217
218 Plagger::Plugin::Filter::FindEnclosures - Auto-find enclosures from entry content using B<< <a> >> / B<< <embed> >> tags
219
220 =head1 SYNOPSIS
221
222   - module: Filter::FindEnclosures
223
224 =head1 DESCRIPTION
225
This plugin finds enclosures in C<< $entry->body >> by looking for 1)
B<< <a> >> links with a I<rel="enclosure"> attribute, 2) B<< <a> >>
links to any URL whose filename extension matches a known
audio/video format, 3) I<src> attributes in B<< <img> >> and B<< <embed> >> tags
and 4) movie URLs in the B<< <param> >> tags of B<< <object> >> elements.
The entry's permalink is checked as well, in case it links directly to a
media file.
230
231 For example:
232
233   Listen to the <a href="http://example.com/foobar.mp3">Podcast</a> now, or <a rel="enclosure"
234   href="http://example.com/foobar.m4a">download AAC version</a>. <img src="/img/logo.gif" />
235
Those 3 links (I<foobar.mp3>, I<foobar.m4a> and I<logo.gif>) are
extracted as enclosures, but I<logo.gif> is marked as "inline", so
that it won't appear as an enclosure in Publish::Feed.
239
240 You might want to also use Filter::HEADEnclosureMetadata plugin to
241 know the actual length (bytes-length) of enclosures by sending HEAD
242 requests.
243
244 =head1 USAGE
245
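  - module: Filter::FindEnclosures
    config:
      enclosure_whitelist: (?:jpg|png|gif)

C<enclosure_whitelist> is optional. When set, it is used as a regular
expression and only URLs that match it are considered as enclosures; the
example above restricts the plugin to JPEG, PNG and GIF images.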
249
250 =head1 AUTHOR
251
252 Tatsuhiko Miyagawa
253
254 Masahiro Nagano
255
256 =head1 SEE ALSO
257
258 L<Plagger>, L<Plagger::Plugin::Filter::HEADEnclosureMetadata>, L<http://www.msgilligan.com/rss-enclosure-bp.html>, L<http://forums.feedburner.com/viewtopic.php?t=20>
259
260 =cut
261
Note: See TracBrowser for help on using the browser.