root/trunk/plagger/lib/Plagger/Plugin/Filter/EntryFullText.pm

Revision 1881 (checked in by miyagawa, 2 years ago)

Filter::EntryFullText: grab the title from the TITLE tag if the entry has no title yet ... this is a HACK. I would like to make it a separate plugin.

Line 
1 package Plagger::Plugin::Filter::EntryFullText;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use DirHandle;
6 use Encode;
7 use File::Spec;
8 use List::Util qw(first);
9 use HTML::ResolveLink;
10 use Plagger::Date; # for metadata in plugins
11 use Plagger::Util qw( decode_content );
12 use Plagger::Plugin::CustomFeed::Simple;
13 use Plagger::UserAgent;
14
# Returns the hook name 'update.entry.fixup'.
# NOTE(review): presumably consulted by Plagger::Plugin to decide on which
# hook this plugin's rules are evaluated -- confirm against the base class.
sub rule_hook {
    return 'update.entry.fixup';
}
16
# Registers this plugin's callbacks with the given context:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
# Returns whatever $context->register_hook returns.
sub register {
    my ($self, $context) = @_;

    # Kept as an ordered list so the hooks are registered in a fixed order.
    my @hooks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @hooks);
}
25
# Initializes the plugin: runs the parent class's init with the same
# arguments, loads the site handlers from the assets, and creates the
# Plagger::UserAgent used by filter() to fetch pages.
# Returns the newly created user agent (the value of the last assignment).
sub init {
    my $self = shift;

    # Parent initialization first; the remaining arguments are passed as-is.
    $self->SUPER::init(@_);

    # Populates $self->{plugins} with the YAML and Perl site handlers.
    $self->load_plugins;

    my $ua = Plagger::UserAgent->new;
    $self->{ua} = $ua;
}
33
# Loads every site handler found in the plugin's assets.
#
# YAML handlers (*.yaml) are loaded first, then Perl handlers (*.pl); each
# matching file is handed to load_plugin_yaml / load_plugin_perl with the
# arguments supplied by load_assets.
# Returns whatever the last load_assets call returns.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}
41
# Loads one Perl site handler and appends an instance to $self->{plugins}.
#
# Arguments:
#   $file - path of the .pl asset
#   $base - its base name; with the ".pl" suffix removed this becomes the
#           last component of the handler's package name
#
# If the file does not declare its own package, the code is wrapped in
# Plagger::Plugin::Filter::EntryFullText::Site::<name>, which inherits from
# ...::Site and gets a site_name method returning <name>.
#
# When the handler class is already defined the source is not compiled
# again, but a fresh instance is still registered so that every plugin
# instance gets its handlers.  That path returns the new handler object;
# the normal path returns the result of the final push.
#
# NOTE(review): the file content is compiled with string eval; assets are
# trusted code by design.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        my $plugin = $plugin_class->new;
        push @{ $self->{plugins} }, $plugin;
        return $plugin;
    }

    # Only touch the file once we know it has to be compiled.
    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    my $code = do { local $/; <$fh> };
    close $fh;
    $code = '' unless defined $code;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n",
            ( "package $plugin_class;",
              "use strict;",
              "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
              "sub site_name { '$pkg' }",
              $code,
              "1;" );
    }

    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}
72
# Loads one YAML asset and appends a ...::EntryFullText::YAML handler to
# $self->{plugins} for every document YAML::LoadFile returns for the file.
#
# Arguments:
#   $file - path of the .yaml asset
#   $base - its base name, passed on to each handler's constructor
#
# Returns the result of the final push onto $self->{plugins}.
sub load_plugin_yaml {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    my @handlers;
    for my $document (YAML::LoadFile($file)) {
        push @handlers,
            Plagger::Plugin::Filter::EntryFullText::YAML->new($document, $base);
    }

    push @{ $self->{plugins} }, @handlers;
}
82
# Hook callback for 'customfeed.handle'.
#
# Picks the first loaded site handler whose custom_feed_handle($args) is
# true.  When one is found, its follow-link and follow-xpath settings are
# stored in $args->{match} and $args->{xpath}, and the aggregation is
# delegated to Plagger::Plugin::CustomFeed::Simple::aggregate, whose result
# is returned.  Without a matching handler nothing is done and a false
# value is returned.
sub handle {
    my ($self, $context, $args) = @_;

    my $site = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };

    if ($site) {
        my $follow_link  = $site->custom_feed_follow_link;
        my $follow_xpath = $site->custom_feed_follow_xpath;

        $args->{match} = $follow_link;
        $args->{xpath} = $follow_xpath;

        # Call CustomFeed::Simple's aggregate as a method on this plugin.
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}
93
# Hook callback for 'update.entry.fixup': fetches the entry's permalink and
# upgrades the entry with content extracted by a site handler.
#
# Arguments: ($self, $context, $args) where $args->{entry} is the entry to
# fix up.  $args->{content} is set to the decoded page so that the site
# handlers can work on it.
#
# Flow:
#   1. A handler whose handle_force($args) is true is used exclusively and
#      bypasses the "entry already has an HTML body" shortcut.
#   2. The permalink is fetched (NoNetwork for 3 hours); a redirected
#      request URI replaces the permalink, and Last-Modified fills in a
#      missing entry date.
#   3. The first handler that handles the entry and extracts data wins.
#      The extracted body (links resolved against the permalink), title,
#      author, icon, summary and date are copied to the entry.  A previous
#      body is moved to the summary only when a new body was extracted, so
#      a handler returning e.g. just a title does not wipe the body.
#   4. If the entry still has no title, the page's <title> is used.
#
# Returns 1 when the entry was upgraded or, with 'store_html_on_failure',
# when the whole page was stored as body.  Skip paths return nothing; the
# failure path returns whatever $context->log returns.
sub filter {
    my($self, $context, $args) = @_;

    my $entry = $args->{entry};

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $entry->body && $entry->body->is_html && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $entry->link . " already contains body. Skipped");
        return;
    }

    if (! $entry->permalink) {
        $self->log(debug => "Entry " . $entry->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $entry->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    # NOTE(review): the fetch counts as failed only when status is false AND
    # is_error is true -- confirm this matches the response class's contract.
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $entry->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $entry->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $entry->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    # TODO: make this a separate plugin
    if ($res->last_modified && !$entry->date) {
        $entry->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # A forcing handler is tried alone; otherwise every handler gets a chance.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    my $upgraded;
    for my $plugin (@plugins) {
        next unless $handler || $plugin->handle($args);

        $context->log(debug => $entry->permalink . " handled by " . $plugin->site_name);

        # A handler may return a plain string, which is taken as the body.
        my $data = $plugin->extract($args);
           $data = { body => $data } if $data && !ref $data;
        next unless $data;

        $context->log(info => "Extract content succeeded on " . $entry->permalink);

        # Only replace the body when the handler actually extracted one.
        if (defined $data->{body}) {
            # if body was already there, set that to summary
            if ($entry->body) {
                $entry->summary($entry->body);
            }

            my $resolver = HTML::ResolveLink->new( base => $entry->permalink );
            $data->{body} = $resolver->resolve( $data->{body} );
            $entry->body($data->{body});
        }

        $entry->title($data->{title}) if $data->{title};
        $entry->author($data->{author}) if $data->{author};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};
        $entry->summary($data->{summary}) if $data->{summary};

        # extract date using found one
        if ($data->{date}) {
            $entry->date($data->{date});
        }

        $upgraded++;
        last;
    }

    # extract TITLE tag if title is not set yet
    # TODO: make this a separate plugin
    if (!$entry->title
        and defined $args->{content}
        and $args->{content} =~ m!<title>\s*(.*?)\s*</title>!is ) {
        $entry->title( HTML::Entities::decode($1) );
    }

    return 1 if $upgraded;

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $entry->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $entry->permalink);
}
184
185
package Plagger::Plugin::Filter::EntryFullText::Site;

# Default implementations for site handlers: an empty object that never
# claims a feed or an entry and has no follow-link / follow-xpath setting.
# The Perl handlers generated by load_plugin_perl inherit from this class.

# Constructor: returns an empty hash blessed into the invoking class.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Custom feed support: not handled, nothing to follow.
sub custom_feed_handle       { return 0 }
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }

# Entry support: neither forced nor handled.
sub handle_force { return 0 }
sub handle       { return 0 }
193
194 package Plagger::Plugin::Filter::EntryFullText::YAML;
195 use Encode;
196 use List::Util qw(first);
197
# Builds a handler object from one YAML document.
#
# Arguments:
#   $class - class to bless into
#   $data  - hash reference with the handler settings; it is NOT modified,
#            the object is built from a shallow copy
#   $base  - base name of the YAML file, stored under the 'base' key
#            (overriding any 'base' key in $data)
#
# Normalizations applied to the copy:
#   * custom_feed_handle / handle / handle_force values that start with
#     http:// or https:// get a leading "^" so they only match from the
#     start of the URL;
#   * extract / extract_date_format (scalar or array reference of scalars)
#     are decoded from UTF-8.
sub new {
    my($class, $data, $base) = @_;

    my %config = %$data;

    # add ^ if handle method starts with http://
    for my $key ( qw(custom_feed_handle handle handle_force) ) {
        next unless defined $config{$key};
        $config{$key} = "^$config{$key}" if $config{$key} =~ m!^https?://!;
    }

    # decode as UTF-8
    for my $key ( qw(extract extract_date_format) ) {
        next unless defined $config{$key};
        if (ref $config{$key} eq 'ARRAY') {
            $config{$key} = [ map { decode("UTF-8", $_) } @{ $config{$key} } ];
        } else {
            $config{$key} = decode("UTF-8", $config{$key});
        }
    }

    return bless { %config, base => $base }, $class;
}
219
# Returns the handler's name: the 'base' value stored by the constructor.
sub site_name {
    my ($self) = @_;
    return $self->{base};
}
224
# Tells whether this handler claims the feed in $args->{feed}.
# Returns 0 when no 'custom_feed_handle' pattern is configured, otherwise
# the result of matching the feed URL against that pattern.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}
230
# Accessor for the configured 'custom_feed_follow_link' value.
sub custom_feed_follow_link {
    my ($self) = @_;
    return $self->{custom_feed_follow_link};
}
234
# Accessor for the configured 'custom_feed_follow_xpath' value.
sub custom_feed_follow_xpath {
    my ($self) = @_;
    return $self->{custom_feed_follow_xpath};
}
238
# Tells whether this handler must be used for the entry in $args->{entry}.
# Returns 0 when no 'handle_force' pattern is configured, otherwise the
# result of matching the entry's permalink against that pattern.
sub handle_force {
    my ($self, $args) = @_;

    my $pattern = $self->{handle_force};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
244
# Tells whether this handler applies to the entry in $args->{entry}.
# Returns 0 when no 'handle' pattern is configured, otherwise the result
# of matching the entry's permalink against that pattern.
sub handle {
    my ($self, $args) = @_;

    my $pattern = $self->{handle};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}
250
# Replaces every argument, in place, with the result of
# Plagger::Util::encode_xml.  extract() installs this temporarily as
# HTML::Element::_xml_escape while serializing matched nodes.
sub xml_escape {
    # Assigning through @_ changes the caller's variables.
    for my $index (0 .. $#_) {
        $_[$index] = Plagger::Util::encode_xml($_[$index]);
    }
}
256
# Extracts entry data from the page in $args->{content}.
#
# Sources, both optional but at least one required:
#   extract        - regexp matched against the content with /s; the
#                    captured groups are stored under the whitespace
#                    separated names in 'extract_capture' (default: 'body')
#   extract_xpath  - hash of capture name => XPath expression; the first
#                    matching node is stored as XML (element nodes) or as
#                    its value (other nodes).  Needs HTML::TreeBuilder::XPath.
#
# Post-processing when something was extracted:
#   extract_after_hook    - Perl code run with string eval.  It sees the
#                           lexicals $self, $args and $data, so those names
#                           must not be changed.  NOTE(review): this executes
#                           code taken from the YAML asset; assets must be
#                           trusted.
#   extract_date_format   - one format or a list of formats; the first one
#                           that parses $data->{date} successfully is used.
#                           Without it the date goes through
#                           Plagger::Date->parse_dwim.
#   extract_date_timezone - time zone set on a date parsed with a format.
#
# Returns the hash reference with the extracted data, or a false value
# when nothing was extracted or the handler is misconfigured.
sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = defined $self->{extract_capture}
                ? split(/\s+/, $self->{extract_capture})
                : ();
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # Serialize with Plagger's XML escaping instead of HTML::Element's.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->isElementNode
                    ? $children[0]->as_XML
                    : $children[0]->getValue;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # The extracted values are plain strings by now; release the tree
        # explicitly so it does not leak on HTML::Tree versions that keep
        # circular references between nodes.
        $tree->delete;
    }

    if ($data) {
        if ($self->{extract_after_hook}) {
            eval $self->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $self->{extract_date_format}) {
                $format = [ $format ] unless ref $format;

                # Take the first format that parses, not merely the first format.
                my ($parsed) = grep { defined }
                               map  { Plagger::Date->strptime($_, $data->{date}) } @$format;
                $data->{date} = $parsed;

                if ($data->{date} && $self->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($self->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        return $data;
    }
}
320
321 1;
322
323 __END__
324
325 =head1 NAME
326
327 Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class
328
329 =head1 SYNOPSIS
330
331   - module: Filter::EntryFullText
332
333 =head1 DESCRIPTION
334
335 This plugin allows you to fetch the full text of an entry by doing an
336 HTTP GET and applying a regexp to the HTML. It's just like upgrading your
337 flight ticket from economy class to business class!
338
339 You can write a custom fulltext handler by putting C<.pl> or C<.yaml>
340 files under the plugin's assets directory.
341
342 =head1 CONFIG
343
344 =over 4
345
346 =item store_html_on_failure
347
348 Even if the fulltext handlers fail to extract the content body from the
349 HTML, this option stores the whole HTML document as the entry body. This
350 is useful in combination with search engines like Gmail and the Search::
351 plugins. Defaults to 0.
352
353 =item force_upgrade
354
355 Even if entry body already contains HTML, this config forces the
356 plugin to upgrade the body. Defaults to 0.
357
358 =back
359
360 =head1 WRITING CUSTOM FULLTEXT HANDLER
361
362 (To be documented)
363
364 =head1 AUTHOR
365
366 Tatsuhiko Miyagawa
367
368 =head1 SEE ALSO
369
370 L<Plagger>
Note: See TracBrowser for help on using the browser.