package Plucene::Index::DocumentWriter;

=head1 NAME

Plucene::Index::DocumentWriter - the document writer

=head1 SYNOPSIS

	my $writer = Plucene::Index::DocumentWriter
		->new($directory, $analyser, $max_field_length);

	$writer->add_document($segment, $doc);

=head1 DESCRIPTION

This is the document writer class.

=head2 METHODS

=cut

use strict;
use warnings;

use File::Slurp;
use IO::Scalar;

use Plucene::Index::FieldInfos;
use Plucene::Index::FieldsWriter;
use Plucene::Index::Term;
use Plucene::Index::TermInfo;
use Plucene::Index::TermInfosWriter;
use Plucene::Search::Similarity;
use Plucene::Store::OutputStream;

=head2 new

	my $writer = Plucene::Index::DocumentWriter
		->new($directory, $analyser, $max_field_length);

This will create a new Plucene::Index::DocumentWriter object with the
passed in arguments.

=cut

sub new {
	my ($self, $d, $a, $mfl) = @_;
	bless {
		directory        => $d,
		analyzer         => $a,
		max_field_length => $mfl,
		postings         => {},
	}, $self;
}

=head2 add_document

	$writer->add_document($segment, $doc);

Index a single document into the given segment: write its field infos
(.fnm), its stored fields, its postings (.frq/.prx via the term infos
writer) and its per-field norms.

=cut

sub add_document {
	my ($self, $segment, $doc) = @_;

	# Record and persist the field infos for this document.
	my $fi = Plucene::Index::FieldInfos->new();
	$fi->add($doc);
	$fi->write("$self->{directory}/$segment.fnm");
	$self->{field_infos} = $fi;

	# Store the document's stored fields.
	my $fw =
		Plucene::Index::FieldsWriter->new($self->{directory}, $segment, $fi);
	$fw->add_document($doc);

	# Invert the document into postings, then flush them sorted by
	# (field, text) as the term dictionary format requires.
	$self->{postings}      = {};
	$self->{field_lengths} = [];
	$self->_invert_document($doc);
	my @postings = sort {
		     $a->{term}->{field} cmp $b->{term}->{field}
			|| $a->{term}->{text} cmp $b->{term}->{text}
	} values %{ $self->{postings} };
	$self->_write_postings($segment, @postings);

	$self->_write_norms($doc, $segment);
}

# Tokenize (or take verbatim) every indexed field of the document and
# accumulate term positions into $self->{postings}.
sub _invert_document {
	my ($self, $doc) = @_;
	for my $field (grep $_->is_indexed, $doc->fields) {
		my $name = $field->name;
		my $fn   = $self->{field_infos}->field_number($name);

		# Position counter continues across multiple fields with the
		# same name; defaults to 0 the first time the field is seen
		# (field_lengths is reset per document in add_document).
		my $pos = $self->{field_lengths}->[$fn] || 0;

		if (!$field->is_tokenized) {    # untokenized: one term, whole value
			$self->_add_position($name, $field->string, $pos++);
		} else {
			my $reader = $field->reader
				|| IO::Scalar->new(\$field->{string});
			my $stream = $self->{analyzer}->tokenstream({
				field  => $name,
				reader => $reader,
			});
			while (my $t = $stream->next) {
				$self->_add_position($name, $t->text, $pos++);

				# Honour the configured cap on indexed tokens per field.
				last if $pos > $self->{max_field_length};
			}
		}
		$self->{field_lengths}->[$fn] = $pos;
	}
}

# Record one occurrence of $text in $field at token position $pos,
# creating the posting on first sight and appending otherwise.
sub _add_position {
	my ($self, $field, $text, $pos) = @_;

	# Postings are keyed by "field\0text" so distinct fields with the
	# same term text stay separate.
	my $ti = $self->{postings}->{"$field\0$text"};
	if ($ti) {
		$ti->{positions}->[ $ti->freq ] = $pos;
		$ti->{freq}++;
		return;
	}
	$self->{postings}->{"$field\0$text"} = Plucene::Index::Posting->new({
		term      => Plucene::Index::Term->new({ field => $field, text => $text }),
		positions => [$pos],
		freq      => 1,
	});
}

# Emit the sorted postings: the term dictionary via TermInfosWriter, the
# frequency data to .frq and the position deltas to .prx, both as BER
# compressed integers (pack 'w').
sub _write_postings {
	my ($self, $segment, @postings) = @_;
	my (@freqs, @proxs);
	my $tis =
		Plucene::Index::TermInfosWriter->new($self->{directory}, $segment,
		$self->{field_infos});
	my $ti = Plucene::Index::TermInfo->new();

	for my $posting (@postings) {
		# Single-document segment, so doc_freq is always 1 and the
		# freq/prox pointers are just the current stream offsets.
		$ti->doc_freq(1);
		$ti->freq_pointer(scalar @freqs);
		$ti->prox_pointer(scalar @proxs);
		$tis->add($posting->term, $ti);

		# Lucene encoding: a freq of 1 is packed into a single value
		# with the low bit set; otherwise emit (0, freq).
		my $f = $posting->freq;
		push @freqs, ($f == 1) ? 1 : (0, $f);

		# Positions are stored as deltas from the previous position.
		my $last_pos  = 0;
		my $positions = $posting->positions;
		for my $j (0 .. $f - 1) {
			my $pos = $positions->[$j] || 0;
			push @proxs, $pos - $last_pos;
			$last_pos = $pos;
		}
	}

	write_file("$self->{directory}/$segment.frq" => pack('(w)*', @freqs));
	write_file("$self->{directory}/$segment.prx" => pack('(w)*', @proxs));
	$tis->break_ref;
}

# Write one norm byte per indexed field (file .f<field-number>) derived
# from the field's token count via the Similarity normalisation.
sub _write_norms {
	my ($self, $doc, $segment) = @_;
	for my $field (grep $_->is_indexed, $doc->fields) {
		my $fn = $self->{field_infos}->field_number($field->name);
		warn "Couldn't find field @{[ $field->name ]} in list [ @{[
			map $_->name, $self->{field_infos}->fields]}]"
			unless $fn >= 0;
		my $norm =
			Plucene::Store::OutputStream->new(
			"$self->{directory}/$segment.f$fn");
		my $val      = $self->{field_lengths}[$fn];
		my $norm_val = Plucene::Search::Similarity->norm($val);
		$norm->print(chr($norm_val));
	}
}

# Lightweight record used while inverting a document: a term, its
# within-document frequency, and the list of token positions.
package Plucene::Index::Posting;

use base 'Class::Accessor::Fast';
__PACKAGE__->mk_accessors(qw( term freq positions ));

1;