root/trunk/plagger/lib/Plagger/Plugin/CustomFeed/Simple.pm

Revision 1896 (checked in by miyagawa, 2 years ago)

CustomFeed::Simple: a small optimization to avoid calling extract_title when it's not necessary

Line 
1 package Plagger::Plugin::CustomFeed::Simple;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Encode;
6 use HTML::TokeParser;
7 use HTML::ResolveLink;
8 use HTML::TreeBuilder::XPath;
9 use Plagger::UserAgent;
10 use Plagger::Util qw( decode_content extract_title );
11
# Plugin entry point: subscribes this plugin's handle() to the
# 'customfeed.handle' hook on the given context.
sub register {
    my ($self, $context) = @_;

    my %hooks = ('customfeed.handle' => \&handle);
    $context->register_hook($self, %hooks);
}
19
# Hook callback for 'customfeed.handle'.
#
# Copies the feed's follow_link / follow_xpath meta settings into
# $args->{match} / $args->{xpath}. When at least one of them is set the
# work is delegated to aggregate() and its result is returned; otherwise
# nothing is returned.
sub handle {
    my ($self, $context, $args) = @_;

    my $feed = $args->{feed};
    $args->{match} = $feed->meta->{follow_link};
    $args->{xpath} = $feed->meta->{follow_xpath};

    # Neither setting is configured for this feed: nothing to do here.
    return unless $args->{match} || $args->{xpath};

    return $self->aggregate($context, $args);
}
31
# Fetch the feed's URL and build a title/link-only feed from the anchors
# found in the returned page.
#
# Parameters:
#   $self    - the plugin instance (also passed to the user agent's fetch)
#   $context - used for logging and to receive the resulting feed
#   $args    - hash ref holding {feed} plus {match} (a regex applied to each
#              anchor's href) and/or {xpath} (an XPath expression selecting
#              the link elements); {match} takes precedence when both are set
#
# Returns 1 after the feed has been added to $context->update, or nothing
# when the HTTP response is an error.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = $args->{feed}->url;
    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        $context->log(error => "GET $url failed: " . $res->status);
        return;
    }

    my $content = decode_content($res);

    my $feed = Plagger::Feed->new;
    # extract_title is only evaluated when the feed has no title of its own.
    $feed->title($args->{feed}->title || extract_title($content));
    $feed->link($url);

    if( my $re = $args->{match} ) {
        my $resolver = HTML::ResolveLink->new(base => $url);
        $content = $resolver->resolve($content);

        my %seen;
        my $parser = HTML::TokeParser->new(\$content);
        while (my $token = $parser->get_tag('a')) {
            next unless ($token->[1]->{href} || '') =~ /$re/;

            # Skip anchors without visible text and image-only anchors
            # ('[IMG]'). Test for emptiness explicitly so that a link whose
            # text is "0" is not discarded.
            my $text = $parser->get_trimmed_text('/a');
            next if !defined $text || $text eq '' || $text eq '[IMG]';

            # Each target URL is added at most once per page.
            my $item_url = URI->new_abs($token->[1]->{href}, $url);
            next if $seen{$item_url->as_string}++;

            my $entry = Plagger::Entry->new;
            $entry->title($text);
            $entry->link($item_url);
            $feed->add_entry($entry);

            $context->log(debug => "Add $token->[1]->{href} ($text)");
        }
    } elsif (my $xpath = $args->{xpath}) {
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        # Walk the matches inside eval so the tree can be released even if
        # findnodes dies (for example on a malformed expression).
        my $ok = eval {
            for my $child ( $tree->findnodes($xpath) ) {
                my $href  = $child->attr('href') or next;
                my $title = $child->attr('title') || $child->as_text;

                my $entry = Plagger::Entry->new;
                $entry->title($title);
                $entry->link(URI->new_abs($href, $url));
                $feed->add_entry($entry);

                $context->log(debug => "Add $href ($title)");
            }
            1;
        };
        my $err = $@;

        # The parsed tree holds parent/child reference cycles; delete it
        # explicitly so it does not stay in memory after this call.
        $tree->delete;

        die $err unless $ok;
    }

    $context->update->add($feed);

    return 1;
}
96
97 1;
98
99 __END__
100
101 =head1 NAME
102
103 Plagger::Plugin::CustomFeed::Simple - Simple way to create title and link only custom feeds
104
105 =head1 SYNOPSIS
106
107   - module: Subscription::Config
108     config:
109       feed:
110         - url: http://sportsnavi.yahoo.co.jp/index.html
111           meta:
112             follow_link: /headlines/
113         - url: http://d.hatena.ne.jp/antipop/20050628/1119966355
114           meta:
115             follow_xpath: //ul[@class="xoxo" or @class="subscriptionlist"]//a
116
117   - module: CustomFeed::Simple
118
119 =head1 DESCRIPTION
120
121
122 =head1 AUTHOR
123
124 Tatsuhiko Miyagawa
125
126 =head1 SEE ALSO
127
128 L<Plagger>
129
130 =cut
131
132
133
134 1;
Note: See TracBrowser for help on using the browser.