Changeset 1544

Show
Ignore:
Timestamp:
08/20/06 22:52:16
Author:
miyagawa
Message:

Added strip_html based on HTML::TreeBuilder and HTML::FormatText

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/hackathon-summary/plagger/Makefile.PL

    r1351 r1544  
    6464        recommends('HTML::TreeBuilder::XPath'), 
    6565    ], 
     66    'Better html to text formatter' => [ 
     67        -default => 1, 
     68        recommends('HTML::TreeBuilder'), 
     69        recommends('HTML::FormatText'), 
     70    ], 
    6671); 
    6772 
  • branches/hackathon-summary/plagger/lib/Plagger/Util.pm

    r1538 r1544  
    2828sub strip_html { 
    2929    my $html = shift; 
    30     $html =~ s/<[^>]*>//g; 
    31     HTML::Entities::decode($html); 
     30 
     31    eval { 
     32        require HTML::FormatText; 
     33        require HTML::TreeBuilder; 
     34    }; 
     35 
     36    if ($@) { 
     37        # dump stripper 
     38        $html =~ s/<[^>]*>//g; 
     39        return HTML::Entities::decode($html); 
     40    } 
     41 
     42    my $tree = HTML::TreeBuilder->new; 
     43    $tree->parse($html); 
     44    $tree->eof; 
     45 
     46    my $formatter = HTML::FormatText->new(leftmargin => 0); 
     47    my $text = $formatter->format($tree); 
     48    $text =~ s/\s*$//s; 
     49    $text; 
    3250} 
    3351 
  • branches/hackathon-summary/plagger/t/core/text.t

    r1527 r1544  
    11use t::TestPlagger; 
    22plan tests => 1 * blocks; 
     3 
     4test_requires('HTML::FormatText'); 
     5test_requires('HTML::TreeBuilder'); 
    36 
    47filters { input => 'chomp', expected => 'yaml' }; 
     
    5356--- expected 
    5457type: html 
    55 plaintext: Hello  World 
     58plaintext: "Hello\nWorld" 
    5659 
    5760=== <wbr> 
     
    6063--- expected 
    6164type: html 
    62 plaintext: Hello World 
     65plaintext: Hello World 
    6366 
    6467=== Unknown Tags