root/branches/hackathon-summary/plagger/lib/Plagger/FeedParser.pm

Revision 1620 (checked in by miyagawa, 2 years ago)

Back out the call to utf8::decode in the strip_html function. FeedParser now sets $XML::Atom::ForceUnicode = 1 globally rather than via local(), since this flag takes effect at runtime, whenever $entry->body or other elements are accessed and XML::Atom reads its DOM node. Nasty.

Line 
1 package Plagger::FeedParser;
2 use strict;
3
4 use Feed::Find;
5 use XML::Atom;
6 use XML::Feed;
7 use XML::Feed::RSS;
8 $XML::Feed::RSS::PREFERRED_PARSER = "XML::RSS::LibXML";
9 $XML::Atom::ForceUnicode = 1;
10
11 use Plagger::Util;
12
# Parse raw feed content into a feed object via XML::Feed.
#
# Arguments:
#   $class       - invocant (not used by the body)
#   $content_ref - passed straight to XML::Feed->parse; callers in this
#                  file's style hand in a reference to the document text
#
# Returns whatever XML::Feed->parse returned.
# Croaks with "Parsing content failed: <reason>" when XML::Feed->parse
# throws or returns a false value; <reason> is the exception text if there
# was one, otherwise XML::Feed->errstr.
sub parse {
    my($class, $content_ref) = @_;

    # Nothing at file level loads Carp, so load it here: the croak() below
    # must not depend on some other module having pulled it in first.
    require Carp;

    # Override XML::LibXML with XML::Liberal when version 0.10 or newer is
    # installed.  The eval returns 1 only if the require succeeded, which
    # avoids testing $@ after the fact.
    # NOTE(review): the original comment says "XML::Liberal >= 0.13" for
    # $sweeper -- presumably the return value is a guard that undoes the
    # override when it goes out of scope; confirm against XML::Liberal.
    my $sweeper;
    if ( eval { require XML::Liberal; 1 } && $XML::Liberal::VERSION >= 0.10 ) {
        $sweeper = XML::Liberal->globally_override('LibXML');
    }

    my $remote = eval { XML::Feed->parse($content_ref) }
        or Carp::croak("Parsing content failed: " . ($@ || XML::Feed->errstr));

    return $remote;
}
29
# Work out the feed URI for a fetched response.
#
# Arguments:
#   $self - invocant; must provide looks_like_feed()
#   $res  - response object; the body calls content_type (falling back to
#           http_response->content_type), content and uri on it
#
# Returns:
#   * $res->uri when the response's media type is listed in
#     %Feed::Find::IsFeed or the body passes looks_like_feed();
#   * otherwise the first link returned by Feed::Find->find_in_html for the
#     decoded body;
#   * an empty return (undef in scalar context, empty list in list context)
#     when no feed link is found.
sub discover {
    my($self, $res) = @_;

    my $content_type = eval { $res->content_type } ||
                       $res->http_response->content_type ||
                       "text/xml";

    # Reduce the header value to a bare media type: drop parameters such as
    # "; charset=...", trim surrounding whitespace, and lowercase it.  Media
    # types are case-insensitive, and without the trim a value like
    # "text/xml ; charset=utf-8" would never match a %Feed::Find::IsFeed key.
    $content_type =~ s/;.*//s;
    $content_type =~ s/\A\s+|\s+\z//g;
    $content_type = lc $content_type;

    my $content = $res->content;
    if ( $Feed::Find::IsFeed{$content_type} || $self->looks_like_feed(\$content) ) {
        return $res->uri;
    }

    # Not a feed itself: look for feed links in the decoded document.
    $content  = Plagger::Util::decode_content($res);
    my @feeds = Feed::Find->find_in_html(\$content, $res->uri);

    return $feeds[0] if @feeds;
    return;
}
52
# Heuristic sniff: does the referenced document text look like a feed?
#
# Arguments:
#   $self        - invocant (not used by the body)
#   $content_ref - reference to a scalar holding the document text
#
# The match succeeds when the text contains any of:
#   * "<rss" followed by a space;
#   * "<rdf:RDF", whitespace, and later an xmlns="http://purl.org/rss...
#     attribute (the /s flag lets the gap span newlines);
#   * "<feed", whitespace, then xmlns=".
#
# Returns the result of the pattern match, evaluated in the caller's context.
sub looks_like_feed {
    my $self        = shift;
    my $content_ref = shift;

    return ${$content_ref} =~ m!<rss |<rdf:RDF\s+.*?xmlns="http://purl\.org/rss|<feed\s+xmlns="!s;
}
57
58 1;
Note: See TracBrowser for help on using the browser.