View file File name : Da.pm Content :package Lingua::Stem::Snowball::Da; use strict; use bytes; # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2, # *NOT* "earlier versions", as published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ##### use constant DEBUG=>0; use vars qw(%cache $VERSION); $Lingua::Stem::Snowball::Da::VERSION = 1.01; # special characters my $aa = chr(229); # å my $ae = chr(230); # æ my $oe = chr(248); # &oring; # delete the s if a "s ending" is preceeded by one # of these characters. my %s_ending = ( a => 1, b => 1, c => 1, d => 1, f => 1, g => 1, h => 1, j => 1, k => 1, l => 1, "m" => 1, n => 1, o => 1, p => 1, r => 1, t => 1, v => 1, "y" => 1, z => 1, $aa => 1, ); # danish vowels. my $vowels = "aeiouy$ae$aa$oe"; my %vowels = ( a=>1, e=>1, i=>1, o=>1, u=>1, "y"=>1, $ae=>1, $aa=>1, $oe=>1, ); # #### # the endings in step 1 # XXX: these must be sorted by length # to save time we've done it already. my @endings = ( ["erendes"], ["erende", "hedens"], ["erens", "endes", "heden", "ethed", "ernes", "erets", "heder", "erede"], ["ende", "enes", "ered", "eren", "erer", "eres", "eret", "erne", "heds"], ["hed", "ers", "ene", "ere", "ens", "ets"], ["er", "es", "et", "en"], ["e"], ); %Lingua::Stem::Snowball::Da::cache = (); sub new { my $pkg = shift; $pkg = ref $pkg || $pkg; my %arg = @_; my $self = {}; bless $self, $pkg; $self->{USE_CACHE} = $arg{use_cache} || 0; return $self; } sub step1 { my ($rs, $word) = @_; # ### STEP 1 my $endinglen = 8; foreach (@endings) { $endinglen--; my $endingw = substr($rs, -$endinglen); # do this once. foreach (@$_) { # only continue if the word has this ending at all. next unless $endingw eq $_; warn "matched $_ in $word" if DEBUG; # a) delete the ending. return substr($word, 0, -$endinglen); } } if (substr($rs, -1) eq 's') { # b) # check if it has a valid "s ending"... if ((length $rs == 1) ? exists $s_ending{substr($word, -2, -1)} : exists $s_ending{substr($rs, -2, -1)}) { warn "Valid s eding $word" if DEBUG; # ...delete the last character (which is a s) return substr($word, 0, -1); } } return $word; } sub stem { my ($self, $word) = @_; my $orig_word; warn " --- start : $word ---" if DEBUG; if ($self->{USE_CACHE}) { $orig_word = $word; return $cache{$word} if defined $cache{$word}; } my ($rs, $lslen, $rslen) = getsides($word); return $word unless $lslen >= 3; $word = step1($rs, $word); # ### STEP 2 warn "Step 2" if DEBUG; ($rs, $lslen, $rslen) = getsides($word); return $word unless $lslen >= 3; if (substr($rs, -2) =~ /gd|dt|gt|kt/) { warn "delete last letter $word in step 2" if DEBUG; $word = substr($word, 0, - 1); ($rs, $lslen, $rslen) = getsides($word); return $word unless $lslen >= 3; } # ### STEP 3 if (substr($rs, -4) eq "igst") { warn "st as in igst deleted in $word" if DEBUG; $word = substr($word, 0, -2); ($rs, $lslen, $rslen) = getsides($word); return $word unless $lslen >= 3; } if (substr($rs, -4) eq "l${oe}st") { warn "t as in l${oe}st deleted in $word" if DEBUG; $word = substr($word, 0, -1); ($rs, $lslen, $rslen) = getsides($word); return $word unless $lslen >= 3; } for (qw/elig lig els ig/) { my $len = length; if (substr($rs, -$len) eq $_) { warn "delete $_ in $word" if DEBUG; $word = substr($word, 0, -$len); ($rs) = getsides($word); if (substr($rs, -2) =~ /gd|dt|gt|kt/) { warn "delete last letter $word in step 2 again" if DEBUG; $word = substr($word, 0, - 1); ($rs, $lslen, $rslen) = getsides($word); } last; } } return $word unless $lslen >= 3 && length $word > 3; # ### STEP 4 if ($word =~ /([^$vowels])\1$/o) { warn "delete double konsonant in $word" if DEBUG; $word = substr($word, 0, -1); } if ($self->{USE_CACHE}) { $cache{$orig_word} = $word; } warn " --- end : $word ---" if DEBUG; return $word; } sub getsides { my $word = shift; # ### # find the first vowel with a non-vowel after it. my($found_vowel, $nonv_position, $curpos) = (0, -1, 0); #$found_vowel = 1 if exists $vowels{substr($word,0,1)}; foreach (split//, $word) { $curpos++; if (exists $vowels{$_}) { $found_vowel = 1; next; } elsif ($found_vowel) { $nonv_position = $curpos; last; } } # got nothing: return false return undef if $nonv_position == -1; my($rs, $lslen); # left side and right side. # ### # length of the left side must be atleast 3 chars. if ($nonv_position < 3) { $lslen = length substr($word, 0, 3); $rs = substr($word, 3); } else { $lslen = $nonv_position; $rs = substr($word, $nonv_position); } return($rs, $lslen, length $rs); } 1; __END__ =head1 NAME Lingua::Stem::Snowball::Da - Porters stemming algorithm for Denmark =head1 SYNOPSIS use Lingua::Stem::Snowball::Da my $stemmer = new Lingua::Stem::Snowball::Da (use_cache => 1); foreach my $word (@words) { my $stemmed = $stemmer->stem($word); print $stemmed, "\n"; } =head1 DESCRIPTION The stem function takes a scalar as a parameter and stems the word according to Martin Porters Danish stemming algorithm, which can be found at the Snowball website: L<http://snowball.tartarus.org/>. It also supports caching if you pass the use_cache option when constructing a new L:S:S:D object. =head2 EXPORT Lingua::Stem::Snowball::Da has nothing to export. =head1 AUTHOR Dennis Haney E<lt>davh@davh.dkE<gt> Ask Solem Hoel, E<lt>ask@unixmonks.netE<gt> (Swedish version) =head1 SEE ALSO L<perl>. L<Lingua::Stem::Snowball>. L<Lingua::Stem>. L<http://snowball.tartarus.org>. =cut