| 1 |
package Plagger::Plugin::Aggregator::Simple; |
|---|
| 2 |
use strict; |
|---|
| 3 |
use base qw( Plagger::Plugin ); |
|---|
| 4 |
|
|---|
| 5 |
use Feed::Find; |
|---|
| 6 |
use Plagger::Enclosure; |
|---|
| 7 |
use Plagger::FeedParser; |
|---|
| 8 |
use Plagger::UserAgent; |
|---|
| 9 |
use Plagger::Text; |
|---|
| 10 |
use List::Util qw(first); |
|---|
| 11 |
use UNIVERSAL::require; |
|---|
| 12 |
use URI; |
|---|
| 13 |
|
|---|
| 14 |
# Plugin entry point: subscribe this plugin's aggregate() callback to the
# 'customfeed.handle' hook of the given context.
#
#   $self    - plugin instance, passed through to register_hook()
#   $context - object providing register_hook()
#
# Returns whatever register_hook() returns.
sub register {
    my $self    = shift;
    my $context = shift;

    my @hooks = ('customfeed.handle' => \&aggregate);

    return $context->register_hook($self, @hooks);
}
|---|
| 21 |
|
|---|
| 22 |
# Hook callback for 'customfeed.handle': fetch the subscription URL, work out
# which URL actually carries the feed, and pass that document to
# handle_feed().
#
#   $self    - plugin instance (provides fetch_content and handle_feed)
#   $context - hook context (not used here)
#   $args    - hashref; $args->{feed} is the feed object whose url() is
#              fetched and which is handed on to handle_feed()
#
# Returns 1 once handle_feed() has been called (its own result is not
# inspected). Returns nothing when a fetch fails or when no feed URL is
# discovered.
sub aggregate {
    my ($self, $context, $args) = @_;

    my $feed = $args->{feed};
    my $url  = $feed->url;
    my $res  = $self->fetch_content($url) or return;

    # Ask the feed parser which URL holds the feed for this response.
    # NOTE(review): presumably this is $url itself when the response already
    # is a feed, and an autodiscovered link otherwise - confirm against
    # Plagger::FeedParser.
    my $feed_url = Plagger::FeedParser->discover($res);

    # Check for "nothing discovered" before comparing, so an undefined
    # result is never used as a string.
    return unless $feed_url;

    # The response's content type is not needed here, so it is not queried.
    my $target = $url;
    if ($feed_url ne $url) {
        # The first response was not the feed itself; fetch the real one.
        $res    = $self->fetch_content($feed_url) or return;
        $target = $feed_url;
    }

    $self->handle_feed($target, \$res->content, $feed);

    return 1;
}
|---|
| 46 |
|
|---|
| 47 |
# Retrieve $url through a fresh Plagger::UserAgent and log the outcome on
# the current Plagger context.
#
#   $self - plugin instance; handed to the agent's fetch() as its second
#           argument
#   $url  - location to retrieve
#
# Returns the response object when the fetch did not report an error.
# Otherwise logs the HTTP status and message at 'error' level and returns
# nothing.
sub fetch_content {
    my $self = shift;
    my $url  = shift;

    my $ctx = Plagger->context;
    $ctx->log(info => "Fetch $url");

    my $ua  = Plagger::UserAgent->new;
    my $res = $ua->fetch($url, $self);

    unless ($res->is_error) {
        $ctx->log(debug => $res->status . ": $url");
        return $res;
    }

    my $status  = $res->http_status;
    my $message = $res->http_response->message;
    $ctx->log(error => "GET $url failed: $status $message");

    return;
}
|---|
| 68 |
|
|---|
| 69 |
# Parse a fetched feed document and publish it, together with all of its
# entries, to the Plagger context's update list.
#
#   $self    - plugin instance (not used directly)
#   $url     - URL the document came from; stored as the feed's url and
#              used in log messages
#   $xml_ref - reference to the raw document. NOTE: its UTF-8 flag is
#              switched on in place before it is stored as source_xml
#   $feed    - optional feed object to fill in; a new Plagger::Feed is
#              created when this is false
#
# Hooks run: 'aggregator.filter.feed' (receives { content => ... } and may
# rewrite it before parsing) and 'aggregator.entry.fixup' (once per entry,
# just before the entry is added to the feed).
#
# Returns nothing when parsing fails (the failure is logged at 'error'
# level); otherwise returns the result of adding the feed to the update
# list.
sub handle_feed {
    my ($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    # Filter plugins work on a copy; the parser sees the filtered content.
    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    my $remote = eval { Plagger::FeedParser->parse(\$args->{content}) };
    if ($@ || !$remote) {
        # Also treat a false return without an exception as failure, so
        # we never call methods on an undefined feed below.
        my $reason = $@ || 'parser returned no feed';
        $context->log(error => "Parser $url failed: $reason");
        return;
    }

    # The format string drives every Atom/RSS decision below; look it up
    # once instead of once per check and per entry.
    my $format  = $remote->format;
    my $is_atom = $format eq 'Atom';
    my $is_rss  = $format =~ /^RSS /;

    $feed ||= Plagger::Feed->new;
    $feed->title(_u($remote->title)) unless defined $feed->title;
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline));
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified) if defined $remote->modified;

    # The stored source is the original document, not the filtered copy.
    Encode::_utf8_on($$xml_ref);
    $feed->source_xml($$xml_ref);

    if ($is_atom) {
        $feed->id( $remote->{atom}->id );
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    } elsif ($format =~ /^RSS/) {
        # Shares the underlying image hash rather than copying it.
        $feed->image( \%{ $remote->{rss}->image } )
            if $remote->{rss}->image;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        # category() may give a single value or an array ref; normalise.
        my $category = $e->category;
        $category = [ $category ] if $category && ref($category) ne 'ARRAY';
        $entry->tags([ map _u($_), @$category ]) if $category;

        # Atom entries may additionally expose category objects.
        if ($is_atom && $e->{entry}->can('categories')) {
            for my $cat ($e->{entry}->categories) {
                $entry->add_tag( _u($cat->label || $cat->term) );
            }
        }

        # Either accessor may die; prefer issued, fall back to modified.
        my $date = eval { $e->issued } || eval { $e->modified };
        $entry->date( Plagger::Date->rebless($date) ) if $date;

        # Atom 1.0 fallback: use <published> when no date was found above.
        if (!$entry->date && $is_atom && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);

        # Body prefers the full content and falls back to the summary.
        my $content = feed_to_text($e, $e->content);
        my $summary = feed_to_text($e, $e->summary);
        $entry->body($content || $summary);
        $entry->summary($summary) if $summary;

        if ($is_atom && $e->{entry}->content && $e->{entry}->content->lang) {
            $entry->language($e->{entry}->content->lang);
        }

        # Enclosures: RSS keeps them under {enclosure} (one hash or an
        # array of hashes); Atom uses links with rel="enclosure".
        if ($is_rss and my $encls = $e->{entry}->{enclosure}) {
            $encls = [ $encls ] unless ref $encls eq 'ARRAY';

            for my $encl (@$encls) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($encl->{url}) );
                $enclosure->length($encl->{length});
                $enclosure->auto_set_type($encl->{type});
                $entry->add_enclosure($enclosure);
            }
        } elsif ($is_atom) {
            for my $link ( grep { defined $_->rel && $_->rel eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # Per-entry image of RSS feeds becomes the entry icon.
        if ($is_rss and my $img = $e->{entry}->{image}) {
            $entry->icon(\%$img);
        }

        # Named differently from the filter-hook $args above so the outer
        # variable is not shadowed.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    return $context->update->add($feed);
}
|---|
| 190 |
|
|---|
| 191 |
# Wrap an entry's content or summary object in a Plagger::Text.
#
#   $e       - feed entry; its class name (ref) selects Atom or RSS handling
#   $content - object providing body() and, for Atom entries, type()
#
# Returns nothing when the body is false (missing, empty or "0").
# Atom entries: a type of 'text/plain' or 'text' produces a 'text'
# Plagger::Text; any other type produces 'html'.
# RSS entries: delegated to Plagger::Text->new_from_text.
# Dies when the entry class name matches neither Atom nor RSS.
sub feed_to_text {
    my ($e, $content) = @_;

    $content->body or return;

    my $entry_class = ref $e;

    if ($entry_class =~ /Atom/) {
        my $is_plain = $content->type eq 'text/plain' || $content->type eq 'text';
        my $kind     = $is_plain ? 'text' : 'html';
        return Plagger::Text->new(type => $kind, data => $content->body);
    }

    return Plagger::Text->new_from_text($content->body)
        if $entry_class =~ /RSS/;

    die "Something is wrong: $e";
}
|---|
| 211 |
|
|---|
| 212 |
# Return a copy of the argument with Perl's internal UTF-8 flag switched on.
# Only the flag is changed; the bytes are neither validated nor transcoded.
# The caller's own variable is left untouched because the work is done on a
# copy.
sub _u {
    my ($text) = @_;

    Encode::_utf8_on($text);

    return $text;
}
|---|
| 217 |
|
|---|
| 218 |
1; |
|---|
| 219 |
|
|---|
| 220 |
__END__ |
|---|
| 221 |
|
|---|
| 222 |
=head1 NAME |
|---|
| 223 |
|
|---|
| 224 |
Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator |
|---|
| 225 |
|
|---|
| 226 |
=head1 SYNOPSIS |
|---|
| 227 |
|
|---|
| 228 |
- module: Aggregator::Simple |
|---|
| 229 |
|
|---|
| 230 |
=head1 DESCRIPTION |
|---|
| 231 |
|
|---|
| 232 |
This plugin implements a dumb Plagger aggregator. It crawls
subscriptions sequentially and parses XML feeds using the L<XML::Feed>
module.
|---|
| 235 |
|
|---|
| 236 |
It can also be used as a base class for custom aggregators. See
L<Plagger::Plugin::Aggregator::Xango> for an example.
|---|
| 238 |
|
|---|
| 239 |
=head1 AUTHOR |
|---|
| 240 |
|
|---|
| 241 |
Tatsuhiko Miyagawa |
|---|
| 242 |
|
|---|
| 243 |
=head1 SEE ALSO |
|---|
| 244 |
|
|---|
| 245 |
L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML> |
|---|
| 246 |
|
|---|
| 247 |
=cut |
|---|