| 1 |
package Plagger::Plugin::Aggregator::Simple; |
|---|
| 2 |
use strict; |
|---|
| 3 |
use base qw( Plagger::Plugin ); |
|---|
| 4 |
|
|---|
| 5 |
use Feed::Find; |
|---|
| 6 |
use Plagger::Enclosure; |
|---|
| 7 |
use Plagger::UserAgent; |
|---|
| 8 |
use List::Util qw(first); |
|---|
| 9 |
use UNIVERSAL::require; |
|---|
| 10 |
use URI; |
|---|
| 11 |
use XML::Feed; |
|---|
| 12 |
use XML::Feed::RSS; |
|---|
| 13 |
|
|---|
| 14 |
# Tell XML::Feed which RSS parser to use: the first of these candidate
# modules (listed in order of preference) that can actually be loaded.
$XML::Feed::RSS::PREFERRED_PARSER =
    first { $_->require } qw( XML::RSS::Liberal XML::RSS::LibXML XML::RSS );

# XML::Liberal is optional.  When it loads and is at least version 0.10,
# install its global override for LibXML.
eval { require XML::Liberal };
unless ($@) {
    XML::Liberal->globally_override('LibXML')
        if $XML::Liberal::VERSION >= 0.10;
}
|---|
| 20 |
|
|---|
| 21 |
# Plugin entry point: subscribe aggregate() to the 'customfeed.handle' hook
# of the given context.
#
#   $plugin - this plugin instance
#   $ctx    - the context object providing register_hook()
#
# Returns whatever register_hook() returns.
sub register {
    my ($plugin, $ctx) = @_;

    my @hooks = ( 'customfeed.handle' => \&aggregate );

    $ctx->register_hook($plugin, @hooks);
}
|---|
| 28 |
|
|---|
| 29 |
# Hook callback for 'customfeed.handle'.
#
# Fetches the URL of $args->{feed}.  If the response is a feed (judged by
# its Content-Type or by sniffing the body) it is handed to handle_feed().
# Otherwise the body is treated as HTML, searched for feed links with
# Feed::Find, and the first link found is fetched and handled instead.
#
#   $self    - this plugin
#   $context - the context passed by the hook dispatcher (not used here)
#   $args    - hash ref; $args->{feed} must provide url()
#
# Returns 1 once a feed was handed to handle_feed(); returns nothing when a
# fetch fails or no feed can be discovered.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    my $res = $self->fetch_content($url) or return;

    my $content_type = eval { $res->content_type } ||
                       $res->http_response->content_type ||
                       "text/xml";

    # %Feed::Find::IsFeed is looked up by bare media type, so drop any
    # parameters ("; charset=utf-8"), surrounding whitespace and case
    # differences before using the value as a key.
    $content_type =~ s/;.*\z//s;
    $content_type =~ s/\A\s+//;
    $content_type =~ s/\s+\z//;
    $content_type = lc $content_type;

    my $content = $res->content;
    if ( $Feed::Find::IsFeed{$content_type} || $self->looks_like_feed(\$content) ) {
        $self->handle_feed($url, \$content, $args->{feed});
    } else {
        # Not a feed itself: try feed auto-discovery on the document.
        my @feeds = Feed::Find->find_in_html(\$content, $url);
        if (@feeds) {
            $url = $feeds[0];
            $res = $self->fetch_content($url) or return;
            $self->handle_feed($url, \$res->content, $args->{feed});
        } else {
            return;
        }
    }

    return 1;
}
|---|
| 55 |
|
|---|
| 56 |
# Sniff a document body and report whether it looks like an RSS or Atom
# feed, for servers that send feeds with a non-feed Content-Type.
#
#   $self        - this plugin (not used)
#   $content_ref - reference to the document text
#
# Recognises an <rss> root carrying a version attribute, an <rdf:RDF> root
# whose default namespace starts with http://purl.org/rss, and a <feed>
# root with a default namespace.  The attribute may appear anywhere inside
# the start tag and may use single or double quotes.
#
# Returns the result of the pattern match (true when it looks like a feed).
sub looks_like_feed {
    my($self, $content_ref) = @_;

    return unless defined $content_ref && defined $$content_ref;

    return $$content_ref =~ m{
          <rss\b     [^>]*? \b version \s* = \s* ["']
        | <rdf:RDF\b [^>]*? \b xmlns   \s* = \s* ["'] http://purl\.org/rss
        | <feed\b    [^>]*? \b xmlns   \s* = \s* ["']
    }xs;
}
|---|
| 60 |
|
|---|
| 61 |
# Fetch $url with a fresh Plagger::UserAgent, logging the attempt and its
# outcome through the current Plagger context.
#
#   $self - this plugin; passed on to the user agent's fetch()
#   $url  - location to retrieve
#
# Returns the response object on success.  On an error response an error
# line is logged and nothing is returned.
sub fetch_content {
    my ($self, $url) = @_;

    my $ctx = Plagger->context;
    $ctx->log(info => "Fetch $url");

    my $res = Plagger::UserAgent->new->fetch($url, $self);

    unless ($res->is_error) {
        $ctx->log(debug => $res->status . ": $url");
        return $res;
    }

    # Error path: report the HTTP status code together with its message.
    my $status  = $res->http_status;
    my $message = $res->http_response->message;
    $ctx->log(error => "GET $url failed: " . $status . " " . $message);

    return;
}
|---|
| 82 |
|
|---|
| 83 |
# Parse a fetched feed document and publish it to the Plagger context.
#
#   $self    - this plugin
#   $url     - URL the document was fetched from (stored as the feed URL)
#   $xml_ref - reference to the raw feed XML
#   $feed    - optional feed object to populate; a new Plagger::Feed is
#              created when omitted
#
# Steps:
#   1. run the 'aggregator.filter.feed' hook so plugins can rewrite the XML,
#   2. parse the (possibly rewritten) XML with XML::Feed,
#   3. copy feed-level metadata onto $feed,
#   4. build one Plagger::Entry per remote entry, collecting enclosures and
#      icons from RSS <enclosure>, Atom rel="enclosure" links, Media RSS,
#      and the Hatena and Apple wallpaper namespaces,
#   5. run the 'aggregator.entry.fixup' hook for every entry,
#   6. add the populated feed to the context's update object.
#
# Returns nothing when parsing fails (an error is logged); otherwise the
# result of $context->update->add($feed).
sub handle_feed {
    my($self, $url, $xml_ref, $feed) = @_;

    my $context = Plagger->context;

    # Filters operate on a copy; the untouched XML is kept in source_xml.
    my $args = { content => $$xml_ref };
    $context->run_hook('aggregator.filter.feed', $args);

    my $remote = eval { XML::Feed->parse(\$args->{content}) };

    unless ($remote) {
        $context->log(error => "Parsing $url failed. " . ($@ || XML::Feed->errstr));
        return;
    }

    # The format does not change while we walk the entries, so ask once.
    my $format  = $remote->format;
    my $is_rss  = $format =~ /^RSS/;
    my $is_atom = $format eq 'Atom';

    $feed ||= Plagger::Feed->new;
    $feed->title(_u($remote->title));
    $feed->url($url);
    $feed->link($remote->link);
    $feed->description(_u($remote->tagline));
    $feed->language($remote->language);
    $feed->author(_u($remote->author));
    $feed->updated($remote->modified);
    $feed->source_xml($$xml_ref);

    if ($is_atom) {
        $feed->id( $remote->{atom}->id );
    }

    if ($is_rss) {
        $feed->image( $remote->{rss}->image )
            if $remote->{rss}->image;
    } elsif ($is_atom) {
        $feed->image({ url => $remote->{atom}->logo })
            if $remote->{atom}->logo;
    }

    for my $e ($remote->entries) {
        my $entry = Plagger::Entry->new;
        $entry->title(_u($e->title));
        $entry->author(_u($e->author));

        # category may be a single string or an array ref of strings.
        my $category = $e->category;
        $category = [ $category ] if $category && !ref($category);
        $entry->tags([ map _u($_), @$category ]) if $category;

        $entry->date( Plagger::Date->rebless($e->issued) )
            if eval { $e->issued };

        # Atom 1.0 entries without an issued date: fall back to <published>.
        if (!$entry->date && $is_atom && $e->{entry}->version eq '1.0') {
            if ( $e->{entry}->published ) {
                my $dt = XML::Atom::Util::iso2dt( $e->{entry}->published );
                $entry->date( Plagger::Date->rebless($dt) ) if $dt;
            }
        }

        $entry->link($e->link);
        $entry->feed_link($feed->link);
        $entry->id($e->id);
        $entry->body(_u($e->content->body || $e->summary->body));

        # Native enclosures: RSS <enclosure> or Atom rel="enclosure" links.
        if ($is_rss && $e->{entry}->{enclosure}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($e->{entry}->{enclosure}->{url}) );
            $enclosure->length($e->{entry}->{enclosure}->{length});
            $enclosure->auto_set_type($e->{entry}->{enclosure}->{type});
            $entry->add_enclosure($enclosure);
        } elsif ($is_atom) {
            # rel is optional on Atom links, so guard against undef.
            for my $link ( grep { ($_->rel || '') eq 'enclosure' } $e->{entry}->link ) {
                my $enclosure = Plagger::Enclosure->new;
                $enclosure->url( URI->new($link->href) );
                $enclosure->length($link->length);
                $enclosure->auto_set_type($link->type);
                $entry->add_enclosure($enclosure);
            }
        }

        # Media RSS: content/thumbnail may sit inside a group or directly
        # on the entry.
        my $media_ns = "http://search.yahoo.com/mrss";
        my $media = $e->{entry}->{$media_ns}->{group} || $e->{entry};
        my $content = $media->{$media_ns}->{content} || [];

        # A single content element arrives as one hash ref rather than an
        # array ref; wrap anything that is not already an array so the loop
        # below never dereferences a hash as an array.
        $content = [ $content ] unless ref($content) eq 'ARRAY';

        for my $media_content (@{$content}) {
            # A bare string carries no url/type attributes to read.
            next unless ref $media_content;

            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($media_content->{url}) );
            $enclosure->auto_set_type($media_content->{type});
            $entry->add_enclosure($enclosure);
        }

        # Several thumbnails may be given; use the first one as the icon.
        my $thumbnail = $media->{$media_ns}->{thumbnail};
        $thumbnail = $thumbnail->[0] if ref($thumbnail) eq 'ARRAY';
        if (ref $thumbnail) {
            $entry->icon({
                url    => $thumbnail->{url},
                width  => $thumbnail->{width},
                height => $thumbnail->{height},
            });
        }

        # Hatena extension: full-size image and small icon.
        my $hatena = $e->{entry}->{"http://www.hatena.ne.jp/info/xmlns#"} || {};
        if ($hatena->{imageurl}) {
            my $enclosure = Plagger::Enclosure->new;
            # Wrapped in URI->new like every other enclosure URL here.
            $enclosure->url( URI->new($hatena->{imageurl}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }

        if ($hatena->{imageurlsmall}) {
            $entry->icon({ url => $hatena->{imageurlsmall} });
        }

        # Apple wallpaper extension: image and thumbnail.
        my $apple = $e->{entry}->{"http://www.apple.com/ilife/wallpapers"} || {};
        if ($apple->{image}) {
            my $enclosure = Plagger::Enclosure->new;
            $enclosure->url( URI->new($apple->{image}) );
            $enclosure->auto_set_type;
            $entry->add_enclosure($enclosure);
        }
        if ($apple->{thumbnail}) {
            $entry->icon({ url => $apple->{thumbnail} });
        }

        # Give other plugins a chance to adjust the entry before it is
        # attached to the feed.
        my $fixup_args = {
            entry      => $entry,
            feed       => $feed,
            orig_entry => $e,
            orig_feed  => $remote,
        };
        $context->run_hook('aggregator.entry.fixup', $fixup_args);

        $feed->add_entry($entry);
    }

    $context->log(info => "Aggregate $url success: " . $feed->count . " entries.");
    return $context->update->add($feed);
}
|---|
| 224 |
|
|---|
| 225 |
# Return a copy of the given string with Perl's internal UTF-8 flag turned
# on, so that bytes coming out of the feed parser are treated as characters.
#
# The flag is set without validating the bytes (Encode::_utf8_on), so the
# caller must already know that the input is UTF-8 encoded.  undef is
# passed through unchanged.
sub _u {
    my $str = shift;

    # This file never loads Encode itself; load it here rather than rely
    # on some other module having done so.  Cheap after the first call.
    require Encode;

    Encode::_utf8_on($str);
    return $str;
}
|---|
| 230 |
|
|---|
| 231 |
1; |
|---|
| 232 |
|
|---|
| 233 |
__END__ |
|---|
| 234 |
|
|---|
| 235 |
=head1 NAME |
|---|
| 236 |
|
|---|
| 237 |
Plagger::Plugin::Aggregator::Simple - Dumb simple aggregator |
|---|
| 238 |
|
|---|
| 239 |
=head1 SYNOPSIS |
|---|
| 240 |
|
|---|
| 241 |
- module: Aggregator::Simple |
|---|
| 242 |
|
|---|
| 243 |
=head1 DESCRIPTION |
|---|
| 244 |
|
|---|
| 245 |
This plugin implements a dumb (simple) aggregator for Plagger. It crawls |
|---|
| 246 |
subscriptions sequentially and parses XML feeds using the L<XML::Feed> |
|---|
| 247 |
module. |
|---|
| 248 |
|
|---|
| 249 |
It can also be used as a base class for custom aggregators. See |
|---|
| 250 |
L<Plagger::Plugin::Aggregator::Xango> for example. |
|---|
| 251 |
|
|---|
| 252 |
=head1 AUTHOR |
|---|
| 253 |
|
|---|
| 254 |
Tatsuhiko Miyagawa |
|---|
| 255 |
|
|---|
| 256 |
=head1 SEE ALSO |
|---|
| 257 |
|
|---|
| 258 |
L<Plagger>, L<XML::Feed>, L<XML::RSS::LibXML> |
|---|
| 259 |
|
|---|
| 260 |
=cut |
|---|