package Plagger::Plugin::Filter::EntryFullText;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use Encode;
use File::Spec;
use List::Util qw(first);
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content );
use Plagger::Plugin::CustomFeed::Simple;
use Plagger::UserAgent;

# Name of the hook reported by this plugin as its rule hook.
sub rule_hook { 'update.entry.fixup' }

# Register the two callbacks of this plugin on the given context:
# handle() for 'customfeed.handle' and filter() for 'update.entry.fixup'.
sub register {
    my($self, $context) = @_;
    $context->register_hook(
        $self,
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );
}

# Run the parent initialisation, load the site plugins from the assets
# and create the user agent stored in $self->{ua} (used by filter()).
sub init {
    my $self = shift;
    $self->SUPER::init(@_);
    $self->load_plugins();
    $self->{ua} = Plagger::UserAgent->new;
}

# Load every '*.yaml' and '*.pl' asset through load_assets(), handing each
# one to load_plugin_yaml() or load_plugin_perl() respectively.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}

# Compile one Perl asset file into a site plugin class and add an instance
# of it to $self->{plugins}.
#
#   $file - path of the asset file to read
#   $base - file name of the asset; with the trailing '.pl' removed it
#           becomes the last component of the plugin class name
#
# If the file does not start with its own 'package' statement, the code is
# wrapped in a package that inherits from
# Plagger::Plugin::Filter::EntryFullText::Site and defines site_name().
# Open and compile failures are reported through Plagger->context->error.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    open my $fh, '<', $file or Plagger->context->error("$file: $!");
    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
        # NOTE(review): this path returns a new instance without adding it
        # to $self->{plugins} -- confirm that this is intended.
        return $plugin_class->new;
    }

    my $code = join '', <$fh>;
    close $fh;

    unless ($code =~ /^\s*package/s) {
        $code = join "\n", (
            "package $plugin_class;",
            "use strict;",
            "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
            "sub site_name { '$pkg' }",
            $code,
            "1;"
        );
    }

    # NOTE(review): string eval of the asset file's content; asset files
    # must come from a trusted location.
    eval $code;
    Plagger->context->error($@) if $@;

    push @{ $self->{plugins} }, $plugin_class->new;
}

# Load one YAML asset file and add one
# Plagger::Plugin::Filter::EntryFullText::YAML object per loaded document
# to $self->{plugins}.
#
#   $file - path of the YAML file
#   $base - passed through as second argument to the YAML plugin constructor
#
# NOTE(review): YAML is not loaded by this part of the file; presumably it
# is loaded elsewhere -- verify.
sub load_plugin_yaml {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");
    my @data = YAML::LoadFile($file);

    push @{ $self->{plugins} },
        map { Plagger::Plugin::Filter::EntryFullText::YAML->new($_, $base) } @data;
}

# 'customfeed.handle' callback. Picks the first loaded plugin whose
# custom_feed_handle($args) is true, copies its follow-link and
# follow-xpath settings into $args and delegates to
# Plagger::Plugin::CustomFeed::Simple::aggregate.
sub handle {
    my($self, $context, $args) = @_;

    my $handler = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    if ($handler) {
        $args->{match} = $handler->custom_feed_follow_link;
        $args->{xpath} = $handler->custom_feed_follow_xpath;
        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}

# 'update.entry.fixup' callback. Fetches the entry's permalink and lets a
# site plugin extract the full body (plus title, author, icon, summary and
# date when the plugin provides them).
sub filter {
    my($self, $context, $args) = @_;

    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $args->{entry}->body && $args->{entry}->body->is_html && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    if (! $args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    # TODO: make this a separate plugin
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };
    my $upgraded;

    for my $plugin (@plugins) {
        if ( $handler || $plugin->handle($args) ) {
            $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
            my $data = $plugin->extract($args);
            $data = { body => $data } if $data && !ref $data;
            if ($data) {
                $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
                my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

                # if body was already there, set that to summary
                if ($args->{entry}->body) {
                    $args->{entry}->summary($args->{entry}->body);
                }

                $data->{body} = $resolver->resolve( $data->{body} );
                $args->{entry}->body($data->{body});
                $args->{entry}->title($data->{title}) if $data->{title};
                $args->{entry}->author($data->{author}) if $data->{author};
                $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};
                $args->{entry}->summary($data->{summary}) if $data->{summary};

                # extract date using found one
                if ($data->{date}) {
                    $args->{entry}->date($data->{date});
                }

                $upgraded++;
                last;
            }
        }
    }

    # extract TITLE tag if title is not set yet
    # TODO: make this a separate plugin
    if (!$args->{entry}->title and $args->{content} =~ m!