root/trunk/plagger/lib/Plagger/Plugin/Filter/GuessLanguage.pm

Revision 2060 (checked in by miyagawa, 1 year ago)

apply RT 42542

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
Line 
1 package Plagger::Plugin::Filter::GuessLanguage;
2 use strict;
3 use base qw( Plagger::Plugin );
4
5 use Text::Language::Guess;
6 use Locale::Language;
7 use Lingua::ZH::HanDetect;
8
9 sub register {
10     my($self, $context) = @_;
11     $context->register_hook(
12         $self,
13         'plugin.init'        => \&init_guesser,
14         'update.entry.fixup' => \&guess,
15     );
16 }
17
18 sub rule_hook { 'update.entry.fixup' }
19
20 my $re_western_lang_code = qr/^(?:en|fr|es|pt|it|de|nl|sv|no|da)$/;
21
22 sub init_guesser {
23     my ($self, $context, $args) = @_;
24
25     my @western_languages; # ie. Text::Language::Guess-able languages
26     my %accepts;
27
28     foreach my $lang (@{ $self->conf->{languages} || [] }) {
29
30         # see if $lang is human friendly lang name
31         if (my $code = language2code($lang)) {
32             push @western_languages, $code if $code =~ $re_western_lang_code;
33             $accepts{$code} = 1;
34         }
35
36         # see if $lang is existing lang code
37         elsif (my $name = code2language($lang)) {
38             push @western_languages, $lang if $lang =~ $re_western_lang_code;
39             $accepts{$lang} = 1;
40         }
41
42         # $lang is something wrong or unsupported
43         else {
44             $context->log(warn => "no such language: $lang");
45         }
46     }
47
48     $self->{guess_language}->{accepts} = \%accepts;
49     $self->{guess_language}->{western} = Text::Language::Guess->new(
50         @western_languages
51             ? ( languages => \@western_languages )
52             : ()
53     );
54 }
55
56 sub guess {
57     my ($self, $context, $args) = @_;
58
59     my $target = $self->conf->{target} || 'feed';
60
61     my $guessed;
62     if (!$guessed && $target =~ /both|entry/) {
63         $guessed = $self->guess_entry($context, $args);
64     }
65     if (!$guessed && $target =~ /both|feed/) {
66         $guessed = $self->guess_feed($context, $args);
67     }
68 }
69
70 sub guess_language {
71     my ($self, $text) = @_;
72
73     return unless defined $text && length $text;
74
75     my $code;
76
77     # xxx: just a quick hack. there may be a better way.
78
79     my %accepts = %{ $self->{guess_language}->{accepts} };
80
81     if (!%accepts || $accepts{ja}) {
82         return 'ja' if $text =~ /\p{Hiragana}|\p{Katakana}/s;
83     }
84     if (!%accepts || $accepts{ko}) {
85         return 'ko' if $text =~ /\p{Hangul}/s;
86     }
87     if (!%accepts || $accepts{zh}) {
88         my ($encoding, $variant) = Lingua::ZH::HanDetect::han_detect($text);
89         return 'zh' if $encoding && $variant; # maybe chinese (but maybe j/k)
90     }
91
92     $code = $self->{guess_language}->{western}->language_guess_string($text);
93
94     # skip if no western lang is allowed
95     return $code if !%accepts || $accepts{$code};
96
97     return# doomed!
98 }
99
100 sub guess_feed {
101     my ($self, $context, $args) = @_;
102
103     return $args->{feed}->language if $args->{feed}->language;
104
105     $context->log(debug => "start guessing language");
106
107     my $body = join "\n", map $_->body_text, $args->{feed}->entries;
108
109     my $code = $self->guess_language($body);
110
111     if ($code) {
112         $context->log(debug => "guessed: $code");
113         $args->{feed}->language($code);
114         return $code;
115     }
116     else {
117         $context->log(debug => "can't identify the feed's language");
118         return;
119     }
120 }
121
122 sub guess_entry {
123     my ($self, $context, $args) = @_;
124
125     return $args->{entry}->language if $args->{entry}->language;
126
127     $context->log(debug => "start guessing entry's language");
128
129     my $code = $self->guess_language($args->{entry}->body_text);
130
131     if ($code) {
132         $context->log(debug => "guessed: $code");
133         $args->{entry}->language($code);
134         return $code;
135     }
136     else {
137         $context->log(debug => "can't identify the entry's language");
138         return;
139     }
140 }
141
142 1;
143
144 __END__
145
146 =head1 NAME
147
148 Plagger::Plugin::Filter::GuessLanguage - guess language of feeds/entries
149
150 =head1 SYNOPSIS
151
152   - module: Filter::GuessLanguage
153     config:
154       languages:
155         - en
156         - de
157         - Japanese
158       target: both
159
160 =head1 DESCRIPTION
161
162 =head1 CONFIG
163
164 =over 4
165
166 =item languages (optional)
167
168 Which languages you think the feeds/entries are written in.
169 English language names and ISO two letter codes are acceptable.
170 Unless you DO want to limit, specify nothing.
171
172 =item target
173
174 'entry' or 'feed' (default) or 'both'.
175
176 =back
177
178 =head1 AUTHOR
179
180 Kenichi Ishigaki
181
182 =head1 SEE ALSO
183
184 L<Plagger>, L<Text::Language::Guess>
185
186 =cut
Note: See TracBrowser for help on using the browser.