Changeset 1492
- Timestamp: 08/20/06 17:36:14
- Files:
- trunk/plagger (modified) (1 prop)
- trunk/plagger/lib/Plagger/FeedParser.pm (added)
- trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm (modified) (4 diffs)
- trunk/plagger/lib/Plagger/Plugin/Filter/HatenaBookmarkTag.pm (modified) (2 diffs)
- trunk/plagger/lib/Plagger/UserAgent.pm (modified) (5 diffs)
- trunk/plagger/t/core/feed-parser-discover.t (added)
- trunk/plagger/t/core/feed-parser.t (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/plagger
- Property svn:ignore changed from "Makefile Makefile.old *.yaml inc META.yml plugins *.patch blib pm_to_blib" to "Makefile Makefile.old *.yaml inc META.yml plugins *.patch blib pm_to_blib *.orig"
trunk/plagger/lib/Plagger/Plugin/Aggregator/Simple.pm
r1389 r1492 5 5 use Feed::Find; 6 6 use Plagger::Enclosure; 7 use Plagger::FeedParser; 7 8 use Plagger::UserAgent; 8 9 use List::Util qw(first); 9 10 use UNIVERSAL::require; 10 11 use URI; 11 use XML::Feed;12 use XML::Feed::RSS;13 14 $XML::Feed::RSS::PREFERRED_PARSER = first { $_->require } qw( XML::RSS::Liberal XML::RSS::LibXML XML::RSS );15 12 16 13 sub register { … … 34 31 $content_type =~ s/;.*$//; # strip charset= cruft 35 32 36 my $content = $res->content; 37 if ( $Feed::Find::IsFeed{$content_type} || $self->looks_like_feed(\$content) ) { 38 $self->handle_feed($url, \$content, $args->{feed}); 33 my $feed_url = Plagger::FeedParser->discover($res); 34 if ($url eq $feed_url) { 35 $self->handle_feed($url, \$res->content, $args->{feed}); 36 } elsif ($feed_url) { 37 $res = $self->fetch_content($feed_url) or return; 38 $self->handle_feed($feed_url, \$res->content, $args->{feed}); 39 39 } else { 40 $content = Plagger::Util::decode_content($res); 41 my @feeds = Feed::Find->find_in_html(\$content, $url); 42 if (@feeds) { 43 $url = $feeds[0]; 44 $res = $self->fetch_content($url) or return; 45 $self->handle_feed($url, \$res->content, $args->{feed}); 46 } else { 47 return; 48 } 40 return; 49 41 } 50 42 51 43 return 1; 52 }53 54 sub looks_like_feed {55 my($self, $content_ref) = @_;56 $$content_ref =~ m!<rss |<rdf:RDF\s+.*?xmlns="http://purl\.org/rss|<feed\s+xmlns="!s;57 44 } 58 45 … … 64 51 65 52 my $agent = Plagger::UserAgent->new; 66 $agent->parse_head(0);67 53 my $response = $agent->fetch($url, $self); 68 54 … … 88 74 $context->run_hook('aggregator.filter.feed', $args); 89 75 90 # override XML::LibXML with Liberal 91 my $sweeper; # XML::Liberal >= 0.13 92 93 eval { require XML::Liberal }; 94 if (!$@ && $XML::Liberal::VERSION >= 0.10) { 95 $sweeper = XML::Liberal->globally_override('LibXML'); 96 } 97 98 local $XML::Atom::ForceUnicode = 1; 99 my $remote = eval { XML::Feed->parse(\$args->{content}) }; 100 101 unless ($remote) { 102 $context->log(error => "Parsing $url 
failed. " . ($@ || XML::Feed->errstr)); 76 my $remote = eval { Plagger::FeedParser->parse(\$args->{content}) }; 77 if ($@) { 78 $context->log(error => "Parser $url failed: $@"); 103 79 return; 104 80 } trunk/plagger/lib/Plagger/Plugin/Filter/HatenaBookmarkTag.pm
r189 r1492 3 3 use base qw( Plagger::Plugin ); 4 4 5 use Plagger::UserAgent; 5 6 use URI; 6 use XML::Feed;7 8 $XML::Feed::RSS::PREFERRED_PARSER = 'XML::RSS::LibXML';9 7 10 8 sub register { … … 20 18 21 19 # xxx need cache & interval 20 my $agent = Plagger::UserAgent->new; 22 21 my $url = 'http://b.hatena.ne.jp/entry/rss/' . $args->{entry}->permalink; 23 my $feed = XML::Feed->parse( URI->new($url) );22 my $feed = eval { $agent->fetch_parse( URI->new($url) ) }; 24 23 25 unless ($feed) {26 $context->log( warn => "Feed error $url: " . XML::Feed->errstr);24 if ($@) { 25 $context->log(error => "Feed error $url: $@"); 27 26 return; 28 27 } trunk/plagger/lib/Plagger/UserAgent.pm
r1296 r1492 3 3 use base qw( LWP::UserAgent ); 4 4 5 use Carp; 5 6 use Plagger::Cookies; 7 use Plagger::FeedParser; 6 8 use URI::Fetch 0.06; 7 9 … … 10 12 my $self = $class->SUPER::new(@_); 11 13 12 my $conf = Plagger->context ->conf->{user_agent};14 my $conf = Plagger->context ? Plagger->context->conf->{user_agent} : {}; 13 15 if ($conf->{cookies}) { 14 16 $self->cookie_jar( Plagger::Cookies->create($conf->{cookies}) ); … … 19 21 $self->env_proxy(); 20 22 21 Plagger->context->run_hook('useragent.init', { ua => $self }); 23 if (Plagger->context) { 24 Plagger->context->run_hook('useragent.init', { ua => $self }); 25 } 22 26 23 27 $self; … … 44 48 my $self = shift; 45 49 my($req) = @_; 46 Plagger->context->run_hook('useragent.request', { ua => $self, url => $req->uri, req => $req }); 50 if (Plagger->context) { 51 Plagger->context->run_hook('useragent.request', { ua => $self, url => $req->uri, req => $req }); 52 } 47 53 $self->SUPER::request(@_); 48 54 } … … 103 109 } 104 110 111 sub find_parse { 112 my($self, $url) = @_; 113 $url = URI->new($url) unless ref $url; 114 115 $self->parse_head(0); 116 my $response = $self->fetch($url); 117 if ($response->is_error) { 118 Carp::croak("Error fetching $url: ", $response->http_status); 119 } 120 121 my $feed_url = Plagger::FeedParser->discover($response); 122 if ($url eq $feed_url) { 123 return Plagger::FeedParser->parse(\$response->content); 124 } elsif ($feed_url) { 125 return $self->fetch_parse($feed_url); 126 } else { 127 Carp::croak("Can't find feed from $url"); 128 } 129 } 130 131 sub fetch_parse { 132 my($self, $url) = @_; 133 $url = URI->new($url) unless ref $url; 134 135 $self->parse_head(0); 136 137 my $response = $self->fetch($url); 138 if ($response->is_error) { 139 Carp::croak("Error fetching $url: ", $response->http_status); 140 } 141 142 Plagger::FeedParser->parse(\$response->content); 143 } 144 105 145 1; 106 146
