Mercurial > hgbook

# Locale::Po4a::Po -- manipulation of po files
# $Id: Po.pm,v 1.95 2009-02-28 22:18:39 nekral-guest Exp $
#
# This program is free software; you may redistribute it and/or modify it
# under the terms of GPL (see COPYING).

############################################################################
# Modules and declarations
############################################################################

=head1 NAME

Locale::Po4a::Po - po file manipulation module

=head1 SYNOPSIS

    use Locale::Po4a::Po;
    my $pofile=Locale::Po4a::Po->new();

    # Read po file
    $pofile->read('file.po');

    # Add an entry
    $pofile->push('msgid' => 'Hello', 'msgstr' => 'bonjour',
                  'flags' => "wrap", 'reference'=>'file.c:46');

    # Extract a translation
    $pofile->gettext("Hello"); # returns 'bonjour'

    # Write back to a file
    $pofile->write('otherfile.po');

=head1 DESCRIPTION

Locale::Po4a::Po is a module that allows you to manipulate message
catalogs. You can load and write from/to a file (whose extension is often
I<po>), you can build new entries on the fly or request the translation
of a string.

For a more complete description of message catalogs in the po format and
their use, please refer to the documentation of the gettext program.

This module is part of the PO4A project, whose objective is to use po files
(originally designed to ease the translation of program messages) to
translate everything, including documentation (man page, info manual),
package description, debconf templates, and everything which may benefit
from this.

=head1 OPTIONS ACCEPTED BY THIS MODULE

=over 4

=item porefs

This specifies the reference format. It can be one of 'none' to not produce
any reference, 'noline' to not specify the line number, and 'full' to
include complete references.

=back

=cut

use IO::File;


require Exporter;

package Locale::Po4a::Po;
use DynaLoader;

use Locale::Po4a::Common qw(wrap_msg wrap_mod wrap_ref_mod dgettext);

use subs qw(makespace);
use vars qw(@ISA @EXPORT_OK);
@ISA = qw(Exporter DynaLoader);
@EXPORT = qw(%debug);
@EXPORT_OK = qw(&move_po_if_needed);

use Locale::Po4a::TransTractor;
# Try to use a C extension if present.
eval("bootstrap Locale::Po4a::Po $Locale::Po4a::TransTractor::VERSION");

use 5.006;
use strict;
use warnings;

use Carp qw(croak);
use File::Path; # mkdir before write
use File::Copy; # move
use POSIX qw(strftime floor);
use Time::Local;

use Encode;

# Flag names this module knows about.
# NOTE(review): not referenced in the visible part of this file; presumably
# consulted further down -- confirm before changing or removing.
my @known_flags=qw(wrap no-wrap c-format fuzzy);

# Debug switches, one per area of the module; a true value enables extra
# diagnostics for that area. For instance 'encoding' makes gettextize()
# print the character sets involved, and 'filter' makes filter() trace its
# parser on STDERR.
our %debug=('canonize'  => 0,
            'quote'     => 0,
            'escape'    => 0,
            'encoding'  => 0,
            'filter'    => 0);

=head1 Functions about whole message catalogs

=over 4

=item new()

Creates a new message catalog. The first argument, if provided, is a
reference to a hash of options (see above). The second argument, if
provided, is the name of a po file we should load.

=cut

sub new {
    # Arguments, in order: the invocant (a class name or an existing
    # object), a reference to the options hash handed to initialize(),
    # and the name of a po file to load right away.
    my $this     = shift;
    my $options  = shift;
    my $filename = shift;

    my $self = bless {}, (ref($this) || $this);
    $self->initialize($options);

    # Only load a catalog when a non-empty file name was supplied.
    if (defined($filename) && length($filename)) {
        $self->read($filename);
    }

    return $self;
}

# Return the numerical timezone (e.g. +0200), followed by a newline.
# Neither the %z nor the %s formats of strftime are portable:
# '%s' is not supported on Solaris and '%z' indicates
# "2006-10-25 19:36E. Europe Standard Time" on MS Windows.
#
# Takes no argument. The offset is the one in effect right now, so it
# includes daylight saving time when applicable.
sub timezone {
    my $now = time;
    my @l = localtime($now);

    # timegm() reads the local wall-clock fields as if they were UTC, so
    # its distance from the real epoch is the current offset from UTC.
    # Working from a single time() value avoids comparing two clock reads.
    my $diff = floor((timegm(@l) - $now)/60 + 0.5); # in minutes

    # Split sign and magnitude before dividing: flooring a negative
    # number of hours while taking a positive modulo for the minutes
    # would turn -03:30 into -0430.
    my $sign = ($diff < 0) ? '-' : '+';
    $diff = abs($diff);

    return sprintf "%s%02d%02d\n", $sign, int($diff / 60), $diff % 60;
}

# Reset the object to an empty catalog with a template header.
#
# Arguments:
#   $options - reference to a hash of options. Only the keys given a
#              default below are accepted; an unknown key with a true
#              value is fatal. Entries with a false value are ignored.
#
# Dies when an unknown option is given or when 'porefs' is not one of
# 'full', 'noline' or 'none'.
sub initialize {
    my ($self, $options) = (shift, shift);
    my $date = strftime("%Y-%m-%d %H:%M", localtime).timezone();
    chomp $date; # timezone() ends its result with a newline
#    $options = ref($options) || $options;

    # Defaults; they also define the set of accepted option names.
    $self->{options}{'porefs'}= 'full';
    $self->{options}{'msgid-bugs-address'}= undef;
    $self->{options}{'copyright-holder'}= "Free Software Foundation, Inc.";
    $self->{options}{'package-name'}= "PACKAGE";
    $self->{options}{'package-version'}= "VERSION";
    foreach my $opt (keys %$options) {
        if ($options->{$opt}) {
            die wrap_mod("po4a::po",
                         dgettext ("po4a", "Unknown option: %s"), $opt)
                unless exists $self->{options}{$opt};
            $self->{options}{$opt} = $options->{$opt};
        }
    }
    $self->{options}{'porefs'} =~ /^(full|noline|none)$/ ||
        die wrap_mod("po4a::po",
                     dgettext ("po4a",
                               "Invalid value for option 'porefs' ('%s' is ".
                               "not one of 'full', 'noline' or 'none')"),
                     $self->{options}{'porefs'});

    # The catalog itself, keyed by msgid. Use a real (empty) hash
    # reference: assigning an empty list would store undef here.
    $self->{po}={};
    $self->{count}=0;  # number of msgids in the PO
    # count_doc: number of strings in the document
    # (duplicate strings counted multiple times)
    $self->{count_doc}=0;
    $self->{header_comment}=
                     " SOME DESCRIPTIVE TITLE\n"
                    ." Copyright (C) YEAR ".
                     $self->{options}{'copyright-holder'}."\n"
                    ." This file is distributed under the same license ".
                     "as the ".$self->{options}{'package-name'}." package.\n"
                    ." FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.\n"
                    ."\n"
                    .", fuzzy";
#    $self->header_tag="fuzzy";
    $self->{header}=escape_text("Project-Id-Version: ".
                                $self->{options}{'package-name'}." ".
                                $self->{options}{'package-version'}."\n".
                        ((defined $self->{options}{'msgid-bugs-address'})?
        "Report-Msgid-Bugs-To: ".$self->{options}{'msgid-bugs-address'}."\n":
                                "").
                                "POT-Creation-Date: $date\n".
                                "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n".
                                "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\n".
                                "Language-Team: LANGUAGE <LL\@li.org>\n".
                                "MIME-Version: 1.0\n".
                                "Content-Type: text/plain; charset=CHARSET\n".
                                "Content-Transfer-Encoding: ENCODING");

    $self->{encoder}=find_encoding("ascii");

    # To make stats about gettext hits
    $self->stats_clear();
}

=item read($)

Reads a po file (whose name is given as argument).  Previously existing
entries in self are not removed; the new ones are added to the end of the
catalog.

=cut

# Load the entries of a po file into this catalog.
#
# Arguments:
#   $filename - path of the po file, or '-' to read from STDIN.
#
# Croaks when no filename is given or when the file cannot be opened.
# Lines that are not understood only trigger a warning.
#
# Plural forms are split in two entries: msgid/msgstr[0] and
# msgid_plural/msgstr[1]. Further plural forms are dropped.
sub read {
    my $self=shift;
    my $filename=shift
        or croak wrap_mod("po4a::po",
                          dgettext("po4a",
                                   "Please provide a non-null filename"));

    my $fh;
    if ($filename eq '-') {
        $fh=*STDIN;
    } else {
        # Three-argument open: the file name can never be mistaken for
        # an open mode, whatever characters it contains.
        open $fh, '<', $filename
            or croak wrap_mod("po4a::po",
                              dgettext("po4a", "Can't read from %s: %s"),
                              $filename, $!);
    }

    ## Slurp the whole file; it is split into messages below
    my $pofile = do { local $/; <$fh> };
    $pofile = "" unless defined $pofile; # empty input
    # Nothing useful can be done about a failed close on a handle we
    # only read from, so its status is not checked.
    close $fh if $filename ne '-';

    my $linenum=0;

    foreach my $msg (split (/\n\n/,$pofile)) {
        my ($msgid,$msgstr,$comment,$automatic,$reference,$flags,$buffer);
        my ($msgid_plural, $msgstr_plural);
        # True while we are inside a msgstr[x] (x >= 2) that is dropped,
        # so that its continuation lines are dropped with it.
        my $dropping = 0;
        foreach my $line (split (/\n/,$msg)) {
            $linenum++;
            if ($line =~ /^#\. ?(.*)$/) {  # Automatic comment
                $automatic .= (defined($automatic) ? "\n" : "").$1;

            } elsif ($line =~ /^#: ?(.*)$/) { # reference
                $reference .= (defined($reference) ? "\n" : "").$1;

            } elsif ($line =~ /^#, ?(.*)$/) { # flags
                $flags .= (defined($flags) ? "\n" : "").$1;

            } elsif ($line =~ /^#(.*)$/) {  # Translator comments
                $comment .= (defined($comment) ? "\n" : "").($1||"");

            } elsif ($line =~ /^msgid (".*")$/) { # begin of msgid
                $buffer = $1;
                $dropping = 0;

            } elsif ($line =~ /^msgid_plural (".*")$/) {
                # begin of msgid_plural, end of msgid

                $msgid = $buffer;
                $buffer = $1;
                $dropping = 0;

            } elsif ($line =~ /^msgstr (".*")$/) {
                # begin of msgstr, end of msgid

                $msgid = $buffer;
                $buffer = "$1";
                $dropping = 0;

            } elsif ($line =~ /^msgstr\[([0-9]+)\] (".*")$/) {
                # begin of msgstr[x], end of msgid_plural or msgstr[x-1]
                my ($index, $text) = ($1, $2);

                # Note: po4a cannot use plural forms
                # (no integer to use the plural form)
                #   * drop the msgstr[x] where x >= 2
                #   * use msgstr[0] as the translation of msgid
                #   * use msgstr[1] as the translation of msgid_plural

                if ($index eq "0") {
                    $msgid_plural = $buffer;
                    $buffer = $text;
                    $dropping = 0;
                } elsif ($index eq "1") {
                    $msgstr = $buffer;
                    $buffer = $text;
                    $dropping = 0;
                } else {
                    # $buffer keeps holding msgstr[1]; warn only once
                    # per message, on the first extra form.
                    warn wrap_ref_mod("$filename:$linenum",
                                      "po4a::po",
                                      dgettext("po4a", "Messages with more than 2 plural forms are not supported."))
                        if ($index eq "2");
                    $dropping = 1;
                }
            } elsif ($line =~ /^(".*")$/) {
                # continuation of a line
                $buffer .= "\n$1" unless $dropping;

            } else {
                warn wrap_ref_mod("$filename:$linenum",
                                  "po4a::po",
                                  dgettext("po4a", "Strange line: -->%s<--"),
                                  $line);
            }
        }
        $linenum++; # the empty line separating two messages
        if (defined $msgid_plural) {
            $msgstr_plural=$buffer;

            $msgid = unquote_text($msgid) if (defined($msgid));
            $msgstr = unquote_text($msgstr) if (defined($msgstr));

            $self->push_raw ('msgid'     => $msgid,
                             'msgstr'    => $msgstr,
                             'reference' => $reference,
                             'flags'     => $flags,
                             'comment'   => $comment,
                             'automatic' => $automatic,
                             'plural'    => 0);

            $msgid_plural = unquote_text($msgid_plural)
                if (defined($msgid_plural));
            $msgstr_plural = unquote_text($msgstr_plural)
                if (defined($msgstr_plural));

            $self->push_raw ('msgid'     => $msgid_plural,
                             'msgstr'    => $msgstr_plural,
                             'reference' => $reference,
                             'flags'     => $flags,
                             'comment'   => $comment,
                             'automatic' => $automatic,
                             'plural'    => 1);
        } else {
            $msgstr=$buffer;

            $msgid = unquote_text($msgid) if (defined($msgid));
            $msgstr = unquote_text($msgstr) if (defined($msgstr));

            $self->push_raw ('msgid'     => $msgid,
                             'msgstr'    => $msgstr,
                             'reference' => $reference,
                             'flags'     => $flags,
                             'comment'   => $comment,
                             'automatic' => $automatic);
        }
    }
}

=item write($)

Writes the current catalog to the given file.

=cut

# Write the catalog (header first, then the entries sorted by position)
# to a file.
#
# Arguments:
#   $filename - path of the file to write, or '-' for STDOUT. Missing
#               parent directories are created.
#
# Croaks when no filename is given, or when the file cannot be opened
# or completely written. Dies on entries using more than two plural forms.
sub write{
    my $self=shift;
    my $filename=shift
        or croak dgettext("po4a","Can't write to a file without filename")."\n";

    my $fh;
    if ($filename eq '-') {
        $fh=\*STDOUT;
    } else {
        # make sure the directory in which we should write the localized
        # file exists
        my $dir = $filename;
        if ($dir =~ m|/|) {
            $dir =~ s|/[^/]*$||;

            File::Path::mkpath($dir, 0, 0755) # Croaks on error
                if (length ($dir) && ! -e $dir);
        }
        # Three-argument open: the file name can never be mistaken for
        # an open mode, whatever characters it contains.
        open $fh, '>', $filename
            or croak wrap_mod("po4a::po",
                              dgettext("po4a", "Can't write to %s: %s"),
                              $filename, $!);
    }

    print $fh "".format_comment($self->{header_comment},"")
        if defined($self->{header_comment}) && length($self->{header_comment});

    print $fh "msgid \"\"\n";
    print $fh "msgstr ".quote_text($self->{header})."\n\n";

    # The charset does not change while we write, so look it up once.
    my $is_utf8 = ($self->get_charset =~ /^utf-8$/i);

    # Build one 'keyword "text"' line. For UTF-8 catalogs the text is
    # decoded before quoting and the line is encoded back afterwards.
    my $po_line = sub {
        my ($keyword, $text) = @_;
        return $keyword." ".quote_text($text)."\n"
            unless $is_utf8;
        return Encode::encode_utf8($keyword." ".
                                   quote_text(Encode::decode_utf8($text)).
                                   "\n");
    };

    my $buf_msgstr_plural = ""; # Used to keep the first msgstr of plural forms
    my $first=1;
    foreach my $msgid ( sort { ($self->{po}{"$a"}{'pos'}) <=>
                               ($self->{po}{"$b"}{'pos'})
                             }  keys %{$self->{po}}) {
        my $entry = $self->{po}{$msgid};
        my $output="";

        # Entries are separated by an empty line
        if ($first) {
            $first=0;
        } else {
            $output .= "\n";
        }

        $output .= format_comment($entry->{'comment'},"")
            if    defined($entry->{'comment'})
               && length ($entry->{'comment'});
        if (   defined($entry->{'automatic'})
            && length ($entry->{'automatic'})) {
            # The separator is the two-character sequence backslash + n
            foreach my $comment (split(/\\n/,$entry->{'automatic'}))
            {
                $output .= format_comment($comment, ". ")
            }
        }
        $output .= format_comment($entry->{'type'},". type: ")
            if    defined($entry->{'type'})
               && length ($entry->{'type'});
        $output .= format_comment($entry->{'reference'},": ")
            if    defined($entry->{'reference'})
               && length ($entry->{'reference'});
        $output .= "#, ". join(", ", sort split(/\s+/,$entry->{'flags'}))."\n"
            if    defined($entry->{'flags'})
               && length ($entry->{'flags'});

        if (exists $entry->{'plural'}) {
            if ($entry->{'plural'} == 0) {
                # Singular half: the comments built above are kept and
                # msgstr[0] waits for the plural half of the message.
                $output .= $po_line->("msgid", $msgid);
                $buf_msgstr_plural = $po_line->("msgstr[0]",
                                                $entry->{'msgstr'});
            } elsif ($entry->{'plural'} == 1) {
# TODO: there may be only one plural form
                # Plural half: it directly follows the msgid line, so the
                # separator and the (repeated) comments are discarded.
                $output = $po_line->("msgid_plural", $msgid);
                $output .= $buf_msgstr_plural;
                $output .= $po_line->("msgstr[1]", $entry->{'msgstr'});
                $buf_msgstr_plural = "";
            } else {
                die wrap_msg(dgettext("po4a","Can't write PO files with more than two plural forms."));
            }
        } else {
            $output .= $po_line->("msgid", $msgid);
            $output .= $po_line->("msgstr", $entry->{'msgstr'});
        }

        print $fh $output;
    }

    # Buffered write errors (full disk, ...) only show up when closing.
    if ($filename ne '-') {
        close $fh
            or croak wrap_mod("po4a::po",
                              dgettext("po4a", "Can't write to %s: %s"),
                              $filename, $!);
    }
}

=item write_if_needed($$)

Like write, but if the PO or POT file already exists, the object will be
written in a temporary file which will be compared with the existing file
to check that the update is needed (this avoids changing a POT just to
update a line reference or the POT-Creation-Date field).

=cut

# Install a freshly written PO file in place of an existing one, but only
# when they differ by more than line references and date fields.
#
# Arguments:
#   $new_po - the file just written; it is always consumed (either moved
#             onto $old_po or removed).
#   $old_po - the destination file, which may or may not exist yet.
#   $backup - when true and $old_po has to be replaced, a copy of it is
#             first saved as "$old_po~".
#
# Dies when a file cannot be removed, copied or moved.
sub move_po_if_needed {
    my ($new_po, $old_po, $backup) = (shift, shift, shift);

    my $old_exists = -e $old_po;
    my $identical = 0;

    if ($old_exists) {
        # List form of the pipe open: no shell is involved, so file names
        # containing spaces or shell metacharacters are passed untouched.
        my @diff_cmd = ('diff', '-q',
                        '-I^#:',
                        '-I^"POT-Creation-Date:',
                        '-I^"PO-Revision-Date:',
                        '--', $old_po, $new_po);
        if (open(my $diff_fh, '-|', @diff_cmd)) {
            # Drain the output so that diff can finish; only its exit
            # status matters (0: same, 1: different, 2: trouble).
            my $ignored = do { local $/; <$diff_fh> };
            close $diff_fh; # sets $?
            $identical = ($? == 0) ? 1 : 0;
        }
        # When diff cannot be run or reports trouble, the files are
        # assumed to differ: replacing the old PO is safer than silently
        # throwing the new one away.
    }

    if ($identical) {
        unlink $new_po
            or die wrap_msg(dgettext("po4a","Can't unlink %s: %s."),
                            $new_po, $!);
        # touch the old PO
        my ($atime, $mtime) = (time,time);
        utime $atime, $mtime, $old_po;
    } else {
        if ($old_exists && $backup) {
            copy($old_po, $old_po."~")
                or die wrap_msg(dgettext("po4a","Can't copy %s to %s: %s."),
                                $old_po, $old_po."~", $!);
        }
        move($new_po, $old_po)
            or die wrap_msg(dgettext("po4a","Can't move %s to %s: %s."),
                            $new_po, $old_po, $!);
    }
}

# Write the catalog to a file, leaving an existing file alone when the
# only changes would be line references or date fields.
#
# Arguments:
#   $filename - path of the PO/POT file to (re)write.
#
# When the file already exists, the catalog is written to a temporary
# file which move_po_if_needed() then compares with the existing one.
# Croaks when no filename is given.
sub write_if_needed {
    my $self=shift;
    my $filename=shift
        or croak dgettext("po4a","Can't write to a file without filename")."\n";

    if (-e $filename) {
        # Loaded here since nothing else in this file requires them.
        require File::Basename;
        require File::Spec;
        require File::Temp;

        # tempfile() is a plain function: calling it as a class method
        # shifts its arguments. The template must not contain a directory
        # part, since it is looked up inside DIR.
        my (undef, $tmp_filename) =
            File::Temp::tempfile(File::Basename::basename($filename)."XXXX",
                                 DIR    => File::Spec->tmpdir(),
                                 OPEN   => 0,
                                 UNLINK => 0);
        $self->write($tmp_filename);
        move_po_if_needed($tmp_filename, $filename);
    } else {
        $self->write($filename);
    }
}

=item gettextize($$)

This function produces one translated message catalog from two catalogs, an
original and a translation. This process is described in L<po4a(7)|po4a.7>,
section I<Gettextization: how does it work?>.

=cut

# Pair the strings of two catalogs by document order and build a new
# catalog from them.
#
# Arguments:
#   $poorig  - catalog extracted from the original document
#   $potrans - catalog extracted from the translated document
#
# Returns the resulting catalog, in which every entry is marked fuzzy.
# Dies when the two documents do not have the same structure (the
# result so far is then dumped to gettextization.failed.po) or, after
# having paired as much as possible, when the entry counts differ.
sub gettextize {
    my ($this, $poorig, $potrans) = @_;

    my $pores = Locale::Po4a::Po->new();

    my $toobad = dgettext("po4a",
        "\nThe gettextization failed (once again). Don't give up, ".
        "gettextizing is a subtle art, but this is only needed once ".
        "to convert a project to the gorgeous luxus offered by po4a ".
        "to translators.".
        "\nPlease refer to the po4a(7) documentation, the section ".
        "\"HOWTO convert a pre-existing translation to po4a?\" ".
        "contains several hints to help you in your task");

    # A mismatch in the number of entries is fatal, but only at the very
    # end: pairing what can be paired shows the user where things start
    # to go wrong (which is probably where the problem is).
    my $please_fail = 0;
    my $nb_orig  = $poorig->count_entries_doc();
    my $nb_trans = $potrans->count_entries_doc();
    if ($nb_orig > $nb_trans) {
        warn wrap_mod("po4a gettextize", dgettext("po4a",
            "Original has more strings than the translation (%d>%d). ".
            "Please fix it by editing the translated version to add ".
            "some dummy entry."),
                      $nb_orig, $nb_trans);
        $please_fail = 1;
    } elsif ($nb_orig < $nb_trans) {
        warn wrap_mod("po4a gettextize", dgettext("po4a",
            "Original has less strings than the translation (%d<%d). ".
            "Please fix it by removing the extra entry from the ".
            "translated file. You may need an addendum (cf po4a(7)) ".
            "to reput the chunk in place after gettextization. A ".
            "possible cause is that a text duplicated in the original ".
            "is not translated the same way each time. Remove one of ".
            "the translations, and you're fine."),
                      $nb_orig, $nb_trans);
        $please_fail = 1;
    }

    # Choose the charset of the result
    if ( $poorig->get_charset =~ /^utf-8$/i ) {
        $potrans->to_utf8;
        $pores->set_charset("utf-8");
    } else {
        my $trans_charset = $potrans->get_charset;
        $pores->set_charset($trans_charset eq "CHARSET" ? "ascii"
                                                        : $trans_charset);
    }
    if ($debug{'encoding'}) {
        print "Po character sets:\n".
            "  original=".$poorig->get_charset."\n".
            "  translated=".$potrans->get_charset."\n".
            "  result=".$pores->get_charset."\n";
    }

    # Walk both documents in parallel, stopping at the end of the shorter
    for (my $idx = 0 ;
         $idx < $poorig->count_entries_doc()
             && $idx < $potrans->count_entries_doc() ;
         $idx++) {

        # The idx-th string of each document, with its location and type
        my $orig  = $poorig->msgid_doc($idx);
        my $trans = $potrans->msgid_doc($idx);

        my $reforig   = $poorig->{po}{$orig}{'reference'};
        my $reftrans  = $potrans->{po}{$trans}{'reference'};
        my $typeorig  = $poorig->{po}{$orig}{'type'};
        my $typetrans = $potrans->{po}{$trans}{'type'};

        # Both strings must have a type...
        if ($typeorig eq '') {
            die wrap_mod("po4a gettextize",
                         "Internal error: type of original string number %s ".
                         "isn't provided", $idx);
        }
        if ($typetrans eq '') {
            die wrap_mod("po4a gettextize",
                         "Internal error: type of translated string number %s ".
                         "isn't provided", $idx);
        }

        # ... and it must be the same one
        if ($typeorig ne $typetrans){
            $pores->write("gettextization.failed.po");
            die wrap_msg(dgettext("po4a",
                         "po4a gettextization: Structure disparity between ".
                         "original and translated files:\n".
                         "msgid (at %s) is of type '%s' while\n".
                         "msgstr (at %s) is of type '%s'.\n".
                         "Original text: %s\n".
                         "Translated text: %s\n".
                         "(result so far dumped to gettextization.failed.po)").
                         "%s",
                         $reforig, $typeorig,
                         $reftrans, $typetrans,
                         $orig,
                         $trans,
                         $toobad);
        }

        # The flags of the original are kept, plus 'fuzzy'
        my $flags = "fuzzy";
        if (defined $poorig->{po}{$orig}{'flags'}) {
            $flags = $poorig->{po}{$orig}{'flags'}." fuzzy";
        }

        # Push the entry, unless the very same pair was already recorded.
        # FIXME: maybe we should be smarter about what reference should be
        #        sent to push_raw.
        if (   !defined($pores->{po}{$orig})
            or $pores->{po}{$orig}{'msgstr'} ne $trans) {
            $pores->push_raw('msgid'     => $orig,
                             'msgstr'    => $trans,
                             'flags'     => $flags,
                             'type'      => $typeorig,
                             'reference' => $reforig,
                             'conflict'  => 1,
                             'transref'  => $reftrans);
        }
    }

    # make sure we return a useful error message when entry count differ
    die "$toobad\n" if $please_fail;

    return $pores;
}

=item filter($)

This function extracts a catalog from an existing one. Only the entries
matching the given filter expression will be placed in the resulting catalog.

This function parses its argument, converts it to a perl function definition,
evals this definition and keeps the entries for which this function returns
true.

I love perl sometimes ;)

=cut

# Build a new catalog containing the entries matching a filter.
#
# Arguments:
#   $filter - the filter expression, in prefix notation. An expression is
#             one of (&(expr)(expr)...), (|(expr)(expr)...), (!(expr)) or
#             (field=argument), where field is one of msgid, msgstr,
#             reference, flags, comment or automatic, and argument is a
#             regular expression, optionally between double quotes.
#
# Returns the new catalog. Dies, showing the offending position, when
# the filter cannot be parsed.
#
# NOTE(review): the argument of each test is interpolated verbatim into
# Perl code which is then eval'ed, so a filter can run arbitrary code
# (and a '/' in it breaks the generated code). Only use trusted filters.
sub filter {
    my $self=shift;
    our $filter=shift;

    my $res;
    $res = Locale::Po4a::Po->new();

    # Parse the filter
    # The parser state lives in package variables, because the helpers
    # below are named subroutines.
    our $code="sub apply { return ";
    our $pos=0;
    our $length = length $filter;

    # explode chars to parts. How to subscript a string in Perl?
    our @filter = split(//,$filter);

    # Die with the given message (a sprintf format followed by its
    # arguments) and a marker under the current position in the filter.
    sub gloups {
        my ($fmt, @args) = @_;
        # An argument is undefined when the parser reached the end of
        # the filter; show it as an empty string.
        my $msg = @args
            ? sprintf($fmt, map { defined($_) ? $_ : '' } @args)
            : $fmt;
        my $space = ' ' x $pos;
        # wrap_msg() takes a format: give the message and the filter as
        # arguments so that a '%' in them is left alone.
        die wrap_msg("%s\n%s\n%s^ HERE", $msg, $filter, $space);
    }
    # Trace the parser position on STDERR when debugging filters.
    sub showmethecode {
        return unless $debug{'filter'};
        my $fmt=shift;
        my $space = ' ' x $pos;
        print STDERR "$filter\n$space^ $fmt\n";#"$code\n";
    }

    # I dream of a lex in perl :-/
    # Parse the expression starting at $pos (which must be on its opening
    # parenthesis), append the matching perl code to $code and leave $pos
    # right after its closing parenthesis.
    sub parse_expression {
        showmethecode("Begin expression")
            if $debug{'filter'};

        gloups("Begin of expression expected, got '%s'",$filter[$pos])
            unless ($filter[$pos] eq '(');
        $pos ++; # pass the '('
        if ($filter[$pos] eq '&') {
            # AND
            $pos++;
            showmethecode("Begin of AND")
                if $debug{'filter'};
            $code .= "(";
            while (1) {
                gloups ("Unfinished AND statement.")
                    if ($pos == $length);
                parse_expression();
                if ($filter[$pos] eq '(') {
                    $code .= " && ";
                } elsif ($filter[$pos] eq ')') {
                    last; # do not eat that char
                } else {
                    gloups("End of AND or begin of sub-expression expected, got '%s'", $filter[$pos]);
                }
            }
            $code .= ")";
        } elsif ($filter[$pos] eq '|') {
            # OR
            $pos++;
            $code .= "(";
            while (1) {
                gloups("Unfinished OR statement.")
                    if ($pos == $length);
                parse_expression();
                if ($filter[$pos] eq '(') {
                    $code .= " || ";
                } elsif ($filter[$pos] eq ')') {
                    last; # do not eat that char
                } else {
                    gloups("End of OR or begin of sub-expression expected, got '%s'",$filter[$pos]);
                }
            }
            $code .= ")";
        } elsif ($filter[$pos] eq '!') {
            # NOT
            $pos++;
            $code .= "(!";
            gloups("Missing sub-expression in NOT statement.")
                if ($pos == $length);
            parse_expression();
            $code .= ")";
        } else {
            # must be an equal. Let's get field and argument
            my ($field,$arg,$done);
            $arg = ''; # so that an empty argument is not undefined
            $field = substr($filter,$pos);
            # The field name must start right at the current position,
            # otherwise $pos would be advanced by the wrong amount.
            gloups("EQ statement contains no '=' or invalid field name")
                unless ($field =~ /^([a-z]*)=/i);
            $field = lc($1);
            $pos += (length $field) + 1;

            # check that we've got a valid field name,
            # and the number it refers to
            # DO NOT CHANGE THE ORDER
            # (it is the order of the arguments given to apply() below)
            my @names=qw(msgid msgstr reference flags comment automatic);
            my $fieldpos;
            for ($fieldpos = 0;
                 $fieldpos < scalar @names && $field ne $names[$fieldpos];
                 $fieldpos++) {}
            gloups("Invalid field name: %s",$field)
                if $fieldpos == scalar @names; # not found

            # Now, get the argument value. It has to be between quotes,
            # which can be escaped
            # We point right on the first char of the argument
            # (first quote already eaten)
            my $escaped = 0;
            my $quoted = 0;
            if ($filter[$pos] eq '"') {
                $pos++;
                $quoted = 1;
            }
            showmethecode(($quoted?"Quoted":"Unquoted")." argument of field '$field'")
                if $debug{'filter'};

            while (!$done) {
                gloups("Unfinished EQ argument.")
                    if ($pos == $length);

                if ($quoted) {
                    if ($filter[$pos] eq '\\') {
                        if ($escaped) {
                            $arg .= '\\';
                            $escaped = 0;
                        } else {
                            $escaped = 1;
                        }
                    } elsif ($escaped) {
                        if ($filter[$pos] eq '"') {
                            $arg .= '"';
                            $escaped = 0;
                        } else {
                            gloups("Invalid escape sequence in argument: '\\%s'",$filter[$pos]);
                        }
                    } else {
                        if ($filter[$pos] eq '"') {
                            $done = 1;
                        } else {
                            $arg .= $filter[$pos];
                        }
                    }
                } else {
                    if ($filter[$pos] eq ')') {
                        # counter the next ++ since we don't want to eat
                        # this char
                        $pos--;
                        $done = 1;
                    } else {
                        $arg .= $filter[$pos];
                    }
                }
                $pos++;
            }
            # and now, add the code to check this equality
            $code .= "(\$_[$fieldpos] =~ m/$arg/)";

        }
        showmethecode("End of expression")
            if $debug{'filter'};
        gloups("Unfinished statement.")
            if ($pos == $length);
        gloups("End of expression expected, got '%s'",$filter[$pos])
            unless ($filter[$pos] eq ')');
        $pos++;
    }
    # And now, launch the beast, finish the function and use eval
    # to construct this function.
    # Ok, the lack of lexer is a fair price for the eval ;)
    parse_expression();
    gloups("Garbage at the end of the expression")
        if ($pos != $length);
    $code .= "; }";
    print STDERR "CODE = $code\n"
        if $debug{'filter'};
    eval $code;
    die wrap_mod("po4a::po", dgettext("po4a", "Eval failure: %s"), $@)
        if $@;

    # Copy the entries accepted by the generated apply() function
    for (my $cpt=(0) ;
         $cpt<$self->count_entries();
         $cpt++) {

        my ($msgid,$ref,$msgstr,$flags,$type,$comment,$automatic);

        $msgid = $self->msgid($cpt);
        $ref=$self->{po}{$msgid}{'reference'};

        $msgstr= $self->{po}{$msgid}{'msgstr'};
        $flags =  $self->{po}{$msgid}{'flags'};
        $type = $self->{po}{$msgid}{'type'};
        $comment = $self->{po}{$msgid}{'comment'};
        $automatic = $self->{po}{$msgid}{'automatic'};

        # DO NOT CHANGE THE ORDER
        $res->push_raw('msgid' => $msgid,
                       'msgstr' => $msgstr,
                       'flags' => $flags,
                       'type'  => $type,
                       'reference' => $ref,
                       'comment' => $comment,
                       'automatic' => $automatic)
               if (apply($msgid,$msgstr,$ref,$flags,$comment,$automatic));
    }
    # delete the apply subroutine
    # otherwise it will be redefined.
    undef &apply;
    return $res;
}

=item to_utf8()

Recodes to utf-8 the po's msgstrs. Does nothing if the charset is not
specified in the po file ("CHARSET" value), or if it's already utf-8 or
ascii.

=cut

sub to_utf8 {
    my ($this) = @_;
    my $charset = $this->get_charset();

    # Nothing to recode when the charset is still the "CHARSET" placeholder,
    # or when the catalog is already plain ascii or utf-8.
    my $already_fine = (   $charset eq "CHARSET"
                        or $charset =~ /^ascii$/i
                        or $charset =~ /^utf-8$/i);

    unless ($already_fine) {
        # Encode::from_to() converts its first argument in place, so every
        # stored msgstr is rewritten directly inside the catalog.
        Encode::from_to($_->{'msgstr'}, $charset, "utf-8")
            for values %{$this->{po}};

        # Keep the header in sync with the new encoding of the msgstrs.
        $this->set_charset("utf-8");
    }
}

=back

=head1 Functions to use a message catalog for translations

=over 4

=item gettext($%)

Request the translation of the string given as argument in the current catalog.
The function returns the original (untranslated) string if the string was not
found.

After the string to translate, you can pass a hash of extra
arguments. Here are the valid entries:

=over

=item wrap

boolean indicating whether we can consider that whitespaces in string are
not important. If yes, the function canonizes the string before looking for
a translation, and wraps the result.

=item wrapcol

The column at which we should wrap (default: 76).

=back

=cut

sub gettext {
    my ($self, $text, %opt) = @_;
    my $res;

    # The empty msgid is the catalog header: never hand it out.
    return "" unless defined($text) && length($text);

    # Reject any option we do not know about (this is a programming error).
    my $validoption="reference wrap wrapcol";
    my %validoption = map { $_ => 1 } split(/ /,$validoption);
    for (keys %opt) {
        next if $validoption{$_};
        Carp::confess "internal error:  unknown arg $_.\n".
                      "Here are the valid options: $validoption.\n";
    }

    # Wrappable strings are stored canonized, so canonize before the lookup.
    $text=canonize($text)
        if ($opt{'wrap'});

    my $esc_text=escape_text($text);

    $self->{gettextqueries}++;

    # A translation is usable only if it exists, is not empty, and is not
    # flagged as fuzzy.
    my $entry = $self->{po}{$esc_text};
    my $usable = (    defined $entry
                  and defined $entry->{'msgstr'}
                  and length $entry->{'msgstr'}
                  and (   not defined $entry->{'flags'}
                       or $entry->{'flags'} !~ /fuzzy/));

    if ($usable) {
        $self->{gettexthits}++;
        $res = unescape_text($entry->{'msgstr'});

        # Entries coming from plural forms are used as is, but the user is
        # told which msgstr[] was picked.
        if (defined $entry->{'plural'}) {
            if ($entry->{'plural'} eq "0") {
                warn wrap_mod("po4a gettextize", dgettext("po4a",
                              "'%s' is the singular form of a message, ".
                              "po4a will use the msgstr[0] translation (%s)."),
                              $esc_text, $res);
            } else {
                warn wrap_mod("po4a gettextize", dgettext("po4a",
                              "'%s' is the plural form of a message, ".
                              "po4a will use the msgstr[1] translation (%s)."),
                              $esc_text, $res);
            }
        }
    } else {
        # No usable translation: fall back to the original string.
        $res = $text;
    }

    if ($opt{'wrap'}) {
        if ($self->get_charset =~ /^utf-8$/i) {
            # Wrap on characters rather than on bytes for utf-8 catalogs.
            $res=Encode::decode_utf8($res);
            $res=wrap ($res, $opt{'wrapcol'} || 76);
            $res=Encode::encode_utf8($res);
        } else {
            $res=wrap ($res, $opt{'wrapcol'} || 76);
        }
    }
#    print STDERR "Gettext >>>$text<<<(escaped=$esc_text)=[[[$res]]]\n\n";
    return $res;
}

=item stats_get()

Returns statistics about the hit ratio of gettext since the last time that
stats_clear() was called. Please note that these are not the same
statistics as those printed by msgfmt --statistics. Here, the statistics
describe the recent usage of the po file, while msgfmt reports the status
of the file.  Example of use:

    [some use of the po file to translate stuff]

    ($percent,$hit,$queries) = $pofile->stats_get();
    print "So far, we found translations for $percent\%  ($hit of $queries) of strings.\n";

=cut

# Returns the list (percentage, hits, queries) describing how many gettext()
# requests were answered with a translation.
#
# The percentage is truncated to two decimals, and is 100 when no query was
# made at all. Counters which were never initialized are reported as 0.
#
# Note: the sub used to carry an empty prototype, which was wrong for a
# method receiving its object. Prototypes are ignored on method calls, so
# dropping it does not affect existing callers.
sub stats_get {
    my $self=shift;

    # Counters may be undefined if no query was made and stats_clear() was
    # never called: treat them as zero instead of propagating undef.
    my $h = $self->{gettexthits}    || 0;
    my $q = $self->{gettextqueries} || 0;

    # Avoid the division by zero; an unused catalog counts as fully hit.
    my $p = ($q == 0 ? 100 : int($h/$q*10000)/100);

    return ( $p,$h,$q );
}

=item stats_clear()

Clears the statistics about gettext hits.

=cut

sub stats_clear {
    my ($self) = @_;

    # Reset both counters used by stats_get().
    $self->{gettextqueries} = $self->{gettexthits} = 0;
}

=back

=head1 Functions to build a message catalog

=over 4

=item push(%)

Push a new entry at the end of the current catalog. The arguments should
form a hash table. The valid keys are:

=over 4

=item msgid

the string in original language.

=item msgstr

the translation.

=item reference

an indication of where this string was found. Example: file.c:46 (meaning
in 'file.c' at line 46). It can be a space-separated list in case of
multiple occurrences.

=item comment

a comment added here manually (by the translators). The format here is free.

=item automatic

a comment which was automatically added by the string extraction
program. See the I<--add-comments> option of the B<xgettext> program for
more information.

=item flags

space-separated list of all defined flags for this entry.

Valid flags are: c-text, python-text, lisp-text, elisp-text, librep-text,
smalltalk-text, java-text, awk-text, object-pascal-text, ycp-text,
tcl-text, wrap, no-wrap and fuzzy.

See the gettext documentation for their meaning.

=item type

This is mostly an internal argument: it is used while gettextizing
documents. The idea here is to parse both the original and the translation
into a po object, and merge them, using one's msgid as msgid and the
other's msgid as msgstr. To make sure that this works, each msgid in a po
object is given a type, based on its structure (like "chapt", "sect1",
"p" and so on in docbook). If the types of the strings are not the same,
it means that the two files do not share the same structure, and the
process reports an error.

This information is written as automatic comment in the po file since this
gives to translators some context about the strings to translate.

=item wrap

boolean indicating whether whitespaces can be mangled in cosmetic
reformattings. If true, the string is canonized before use.

This information is written to the po file using the 'wrap' or 'no-wrap' flag.

=item wrapcol

The column at which we should wrap (default: 76).

This information is not written to the po file.

=back

=cut

# Adds a new (unescaped) entry to the catalog.
#
# Arguments form a hash; the accepted keys are wrap, wrapcol, type, msgid,
# msgstr, comment, automatic, flags and reference. Any other key is a
# programming error and aborts with a stack trace.
#
# The msgid and msgstr are canonized when 'wrap' is true, then escaped to
# their po representation before being handed over to push_raw(). Entries
# which are not wrappable get the 'no-wrap' flag.
sub push {
    my ($self, %entry) = @_;

    # 'comment' is documented as a valid key and is stored by push_raw(), so
    # it must be accepted here as well.
    my $validoption="wrap wrapcol type msgid msgstr comment automatic flags reference";
    my %validoption = map { $_ => 1 } split(/ /,$validoption);

    foreach (keys %entry) {
        Carp::confess "internal error:  unknown arg $_.\n".
                      "Here are the valid options: $validoption.\n"
            unless $validoption{$_};
    }

    # Whitespace matters in this entry: tell it to the po tools.
    unless ($entry{'wrap'}) {
        $entry{'flags'} .= " no-wrap";
    }

    # Canonize (only when whitespace is not significant) and escape both
    # strings, since push_raw() expects them in their po representation.
    foreach my $field ('msgid', 'msgstr') {
        next unless defined($entry{$field});

        $entry{$field} = canonize($entry{$field})
            if ($entry{'wrap'});

        $entry{$field} = escape_text($entry{$field});
    }

    $self->push_raw(%entry);
}

# The same as push(), but assuming that msgid and msgstr are already escaped.
#
# Keys read from the argument hash: msgid, msgstr, reference, comment,
# automatic, flags, type, transref, conflict and plural.
#
#  - Without msgid, nothing is done.
#  - An empty msgid defines the header (msgstr) and its comment, and selects
#    the encoder matching the charset found in that header.
#  - Otherwise the entry is stored. When the msgid already exists with a
#    different msgstr, both translations are either kept in the msgstr
#    (separated by '#-#-#-#-#' markers) when 'conflict' is true, or the old
#    one is discarded with a warning.
sub push_raw {
    my $self=shift;
    my %entry=@_;
    my ($msgid,$msgstr,$reference,$comment,$automatic,$flags,$type,$transref)=
        ($entry{'msgid'},$entry{'msgstr'},
         $entry{'reference'},$entry{'comment'},$entry{'automatic'},
         $entry{'flags'},$entry{'type'},$entry{'transref'});
    my $keep_conflict = $entry{'conflict'};

#    print STDERR "Push_raw\n";
#    print STDERR " msgid=>>>$msgid<<<\n" if $msgid;
#    print STDERR " msgstr=[[[$msgstr]]]\n" if $msgstr;
#    Carp::cluck " flags=$flags\n" if $flags;

    return unless defined($entry{'msgid'});

    #no msgid => header definition
    unless (length($entry{'msgid'})) {
#       if (defined($self->{header}) && $self->{header} =~ /\S/) {
#           warn dgettext("po4a","Redefinition of the header. ".
#                                "The old one will be discarded\n");
#       } FIXME: do that iff the header isn't the default one.
        $self->{header}=$msgstr;
        $self->{header_comment}=$comment;
        my $charset = $self->get_charset;
        if ($charset ne "CHARSET") {
            $self->{encoder}=find_encoding($charset);
        } else {
            $self->{encoder}=find_encoding("ascii");
        }
        return;
    }

    # Apply the 'porefs' option. Both the option and the reference may be
    # undefined, which must not trigger warnings.
    my $porefs = $self->{options}{'porefs'};
    $porefs = "" unless defined $porefs;
    if ($porefs eq "none") {
        $reference = "";
    } elsif ($porefs eq "noline" and defined $reference) {
        # hide the line numbers so that they do not clutter the diffs
        $reference =~ s/:[0-9]*/:1/g;
    }

    if (defined($self->{po}{$msgid})) {
        warn wrap_mod("po4a::po",
                      dgettext("po4a","msgid defined twice: %s"),
                      $msgid)
            if (0); # FIXME: put a verbose stuff
        if (    defined $msgstr
            and defined $self->{po}{$msgid}{'msgstr'}
            and $self->{po}{$msgid}{'msgstr'} ne $msgstr) {

            if ($keep_conflict) {
                # Keep every translation, each one introduced by a marker
                # naming the place it comes from.
                if ($self->{po}{$msgid}{'msgstr'} =~ m/^#-#-#-#-#  .*  #-#-#-#-#\\n/s) {
                    # The stored msgstr already lists conflicts: append.
                    $msgstr = $self->{po}{$msgid}{'msgstr'}.
                              "\\n#-#-#-#-#  $transref  #-#-#-#-#\\n".
                              $msgstr;
                } else {
                    $msgstr = "#-#-#-#-#  ".
                              $self->{po}{$msgid}{'transref'}.
                              "  #-#-#-#-#\\n".
                              $self->{po}{$msgid}{'msgstr'}."\\n".
                              "#-#-#-#-#  $transref  #-#-#-#-#\\n".
                              $msgstr;
                }
                # Every msgid will have the same list of references.
                # Only keep the last list.
                $self->{po}{$msgid}{'reference'} = "";
            } else {
                # format_comment() takes the comment first and the prefix
                # second. Undefined references are shown as empty ones.
                my $old_ref = $self->{po}{$msgid}{'reference'};
                $old_ref = "" unless defined $old_ref;
                my $new_ref = defined $reference ? $reference : "";

                my $txt=quote_text($msgid);
                my ($first,$second)=
                    (format_comment($old_ref,". ").
                     quote_text($self->{po}{$msgid}{'msgstr'}),

                     format_comment($new_ref,". ").
                     quote_text($msgstr));

                warn wrap_msg(dgettext("po4a",
                                       "Translations don't match for:\n".
                                       "%s\n".
                                       "-->First translation:\n".
                                       "%s\n".
                                       " Second translation:\n".
                                       "%s\n".
                                       " Old translation discarded."),
                              $txt,$first,$second);
            }
        }
    }
    if (defined $transref) {
        $self->{po}{$msgid}{'transref'} = $transref;
    }
    if (defined $reference) {
        # References accumulate over the occurrences of the msgid.
        if (defined $self->{po}{$msgid}{'reference'}) {
            $self->{po}{$msgid}{'reference'} .= " ".$reference;
        } else {
            $self->{po}{$msgid}{'reference'} = $reference;
        }
    }
    $self->{po}{$msgid}{'msgstr'} = $msgstr;
    $self->{po}{$msgid}{'comment'} = $comment;
    $self->{po}{$msgid}{'automatic'} = $automatic;

    # 'pos_doc' lists every position of the string in the document, while
    # 'pos' is the position of its first occurrence in the catalog.
    if (defined($self->{po}{$msgid}{'pos_doc'})) {
        $self->{po}{$msgid}{'pos_doc'} .= " ".$self->{count_doc}++;
    } else {
        $self->{po}{$msgid}{'pos_doc'}  = $self->{count_doc}++;
    }
    unless (defined($self->{po}{$msgid}{'pos'})) {
        $self->{po}{$msgid}{'pos'} = $self->{count}++;
    }
    $self->{po}{$msgid}{'type'} = $type;
    $self->{po}{$msgid}{'plural'} = $entry{'plural'}
        if defined $entry{'plural'};

    if (defined($flags)) {
        # Pad with spaces and accept commas as separators, so that each flag
        # can be looked for as a whole word.
        $flags = " $flags ";
        $flags =~ s/,/ /g;
        foreach my $flag (@known_flags) {
            if ($flags =~ /\s$flag\s/) { # if flag to be set
                unless (   defined($self->{po}{$msgid}{'flags'})
                        && $self->{po}{$msgid}{'flags'} =~ /\b$flag\b/) {
                    # flag not already set
                    if (defined $self->{po}{$msgid}{'flags'}) {
                        $self->{po}{$msgid}{'flags'} .= " ".$flag;
                    } else {
                        $self->{po}{$msgid}{'flags'} = $flag;
                    }
                }
            }
        }
    }
#    print STDERR "stored ((($msgid)))=>(((".$self->{po}{$msgid}{'msgstr'}.")))\n\n";

}

=back

=head1 Miscellaneous functions

=over 4

=item count_entries()

Returns the number of entries in the catalog (without the header).

=cut

sub count_entries($) {
    my ($self) = @_;

    # 'count' is the number of distinct msgids stored so far.
    return $self->{count};
}

=item count_entries_doc()

Returns the number of entries in document. If a string appears multiple times
in the document, it will be counted multiple times

=cut

sub count_entries_doc($) {
    my ($self) = @_;

    # 'count_doc' counts every occurrence, duplicated strings included.
    return $self->{count_doc};
}

=item msgid($)

Returns the msgid of the given number.

=cut

sub msgid($$) {
    my ($self, $num) = @_;

    # Look for the entry whose catalog position is the requested one.
    for my $candidate (keys %{$self->{po}}) {
        next unless ($self->{po}{$candidate}{'pos'} eq $num);
        return $candidate;
    }

    # No entry at this position.
    return undef;
}

=item msgid_doc($)

Returns the msgid with the given position in the document.

=cut

sub msgid_doc($$) {
    my ($self, $num) = @_;

    # 'pos_doc' is a space-separated list of the document positions at which
    # the string occurs; return the msgid owning the requested position.
    for my $candidate (keys %{$self->{po}}) {
        for my $slot (split(/ /, $self->{po}{$candidate}{'pos_doc'})) {
            next unless ($slot eq $num);
            return $candidate;
        }
    }

    # No entry at this position.
    return undef;
}

=item get_charset()

Returns the character set specified in the po header. If it hasn't been
set, it will return "CHARSET".

=cut

# Returns the charset announced by the "charset=" field of the po header, or
# the "CHARSET" placeholder when the header does not provide any.
#
# Note: the sub used to carry an empty prototype, which was wrong for a
# method receiving its object. Prototypes are ignored on method calls, so
# dropping it does not affect existing callers.
sub get_charset {
    my $self=shift;

    # The capture must only be read when the match succeeded: after a failed
    # match, $1 still holds the capture of an earlier successful match
    # (possibly one done by the caller), which would be returned as charset.
    # The value ends at the first whitespace or backslash, since the header
    # is stored escaped ("...charset=UTF-8\n").
    if (    defined $self->{header}
        and $self->{header} =~ /charset=(.*?)[\s\\]/) {
        return $1;
    }

    return "CHARSET";
}

=item set_charset($)

This sets the character set of the po header to the value specified in its
first argument. If you never call this function (and no file with a specified
character set is read), the default value is left to "CHARSET". This value
doesn't change the behavior of this module, it's just used to fill that field
in the header, and to return it in get_charset().

=cut

# Sets the charset announced in the po header to the value given as first
# argument, and selects the matching encoder.
#
# Only the value of the "charset=" field is rewritten. A header without such
# a field is left untouched.
#
# Note: the sub used to carry an empty prototype although it takes an
# argument. Prototypes are ignored on method calls, so dropping it does not
# affect existing callers.
sub set_charset {
    my ($self, $newchar) = @_;
    my $oldchar = $self->get_charset();

    # The old value is quoted since it is data, not a pattern, and it is
    # anchored to its field so that the same text appearing elsewhere in the
    # header is never modified.
    $self->{header} =~ s/(charset=)\Q$oldchar\E/${1}$newchar/
        if defined $self->{header};

    $self->{encoder}=find_encoding($newchar);
}

#----[ helper functions ]---------------------------------------------------

# Transform the string from its po file representation to the form which
#   should be used to print it: escaped quotes, newlines, tabulations and
#   backslashes are turned back into the characters they stand for.
sub unescape_text {
    my $text = shift;

    print STDERR "\nunescape [$text]====" if $debug{'escape'};
    for ($text) {
        # physical line breaks are not part of the message: drop them
        $_ = join("",split(/\n/,$_));
        s/\\"/"/g;
        # unescape newlines
        #   NOTE on \G:
        #   The following regular expression introduce newlines.
        #   Thus, ^ doesn't match all beginnings of lines.
        #   \G is a zero-width assertion that matches the position
        #   of the previous substitution with s///g. As every
        #   substitution ends by a newline, it always matches a
        #   position just after a newline.
        s/(           # $1:
                (\G|[^\\]) #    beginning of the line or any char
                           #    different from '\'
                (\\\\)*    #    followed by any even number of '\'
               )\\n        # and followed by an escaped newline
              /$1\n/sgx;   # single string, match globally, allow comments
        # unescape tabulations
        s/(          # $1:
                (\G|[^\\])#    beginning of the line or any char
                          #    different from '\'
                (\\\\)*   #    followed by any even number of '\'
               )\\t       # and followed by an escaped tabulation
              /$1\t/mgx;  # multilines string, match globally, allow comments
        # and unescape the escape character
        s/\\\\/\\/g;
    }
    print STDERR ">$text<\n" if $debug{'escape'};

    return $text;
}

# Transform the string to its representation as it should be written in po
# files: backslashes, double quotes, newlines and tabulations are escaped.
sub escape_text {
    my $text = shift;

    print STDERR "\nescape [$text]====" if $debug{'escape'};
    for ($text) {
        # the escape character itself goes first, so that the backslashes
        # added below are not escaped a second time
        s/\\/\\\\/g;
        s/"/\\"/g;
        s/\n/\\n/g;
        s/\t/\\t/g;
    }
    print STDERR ">$text<\n" if $debug{'escape'};

    return $text;
}

# Put quotes around the string on each line (without escaping it).
# It also normalizes the text (i.e., makes sure its representation is
#   wrapped, but without changing the meaning of the string).
sub quote_text {
    my $string = shift;

    # nothing to quote: this is the empty po string
    return '""' unless defined($string) && length($string);

    print STDERR "\nquote [$string]====" if $debug{'quote'};
    # break lines on newlines, if any
    # see unescape_text for an explanation on \G
    $string =~ s/(           # $1:
                  (\G|[^\\]) #    beginning of the line or any char
                             #    different from '\'
                  (\\\\)*    #    followed by any even number of '\'
                 \\n)        # and followed by an escaped newline
                /$1\n/sgx;   # single string, match globally, allow comments

    # one quoted chunk per line of the wrapped text
    my @rows = split(/\n/,wrap($string));
    $string = '"' . join("\"\n\"",@rows) . '"';

    # multi-line strings start with an empty chunk, as msgcat does
    $string = "\"\"\n".$string
        if (scalar @rows > 1 && $rows[0] ne '');

    print STDERR ">$string<\n" if $debug{'quote'};
    return $string;
}

# Undo the work of the quote_text function: remove the quotes surrounding
# each chunk and glue the chunks back together.
sub unquote_text {
    my $string = shift;
    print STDERR "\nunquote [$string]====" if $debug{'quote'};
    for ($string) {
        s/^""\\n//s;
        # outer quotes
        s/^"(.*)"$/$1/s;
        # junction between two consecutive chunks
        s/"\n"//gm;
        # Note: an even number of '\' could precede \\n, but I could not build a
        # document to test this
        s/([^\\])\\n\n/$1!!DUMMYPOPM!!/gm;
        s|!!DUMMYPOPM!!|\\n|gm;
    }
    print STDERR ">$string<\n" if $debug{'quote'};
    return $string;
}

# Canonize the string: write it on only one line, changing consecutive
# whitespace to only one space (two spaces are kept after '.' and ')').
# Warning, it changes the string and should only be called if the string is
# plain text
sub canonize {
    my $text=shift;
    print STDERR "\ncanonize [$text]====" if $debug{'canonize'};
    for ($text) {
        # leading spaces of the whole string
        s/^ *//s;
        s/^[ \t]+/  /gm;
        # if ($text eq "\n"), it messed up the first string (header)
        s/\n/  /gm if ($_ ne "\n");
        s/([.)])  +/$1  /gm;
        s/([^.)])  */$1 /gm;
        # trailing spaces of the whole string
        s/ *$//s;
    }
    print STDERR ">$text<\n" if $debug{'canonize'};
    return $text;
}

# Wraps the string at the given column (76 by default). We don't use
# Text::Wrap since it mangles whitespace at the end of split lines.
# Lines are only cut after a space, never after a space following a period,
# and the trailing whitespace of the original text is preserved.
sub wrap {
    my ($text, $col) = @_;
    return "0" if ($text eq '0');
    $col ||= 76;

    my @pending = split(/\n/,"$text");
    my $out = "";
    my $at_start = 1;

    while (@pending) {
        my $line = shift @pending;

        if ($at_start && length($line) > $col - 10) {
            # long first line: it is put back and handled again as a
            # regular line
            unshift @pending,$line;
            $at_start = 0;
            next;
        }

        if (length($line) > $col) {
            # last space within the allowed width...
            my $cut = rindex($line," ",$col);
            # ... which does not follow a period
            $cut = rindex($line," ",$cut-1)
                while (substr($line,$cut-1,1) eq '.' && $cut != -1);

            # There are no spaces in the first $col chars, pick-up the
            # first space
            $cut = index($line," ")
                if ($cut == -1);

            unless ($cut == -1) {
                my $rest = substr($line,$cut+1);
                $line = substr($line,0,$cut+1);
                # the spaces following the cut stay on the current line
                $line .= $1
                    if ($rest =~ s/^( +)//);
                unshift @pending,$rest;
            }
        }

        $at_start = 0;
        $out .= "$line\n";
    }

    # Restore the original trailing spaces
    $out =~ s/\s+$//s;
    $out .= $1
        if ($text =~ m/(\s+)$/s);

    return $out;
}

# Outputs properly a '# ... ' line to be put in the po file.
#
# Arguments:
#   - the text of the comment (possibly on several lines);
#   - the characters following the '#' on each line (e.g. ". " for the
#     automatic comments, or "" for the translator comments).
#
# Returns the formatted comment, ending with a newline. Lines which would
# only contain the prefix are reduced to a lone '#'.
sub format_comment {
    my $comment=shift;
    my $char=shift;
    my $result = "#". $char . $comment;
    $result =~ s/\n/\n#$char/gs;
    # The prefix is data, not a pattern: without quoting, "| " would be read
    # as an alternation and ". " would match any character.
    $result =~ s/^#\Q$char\E$/#/gm;
    $result .= "\n";
    return $result;
}


1;
__END__

=back

=head1 AUTHORS

 Denis Barbier <barbier@linuxfr.org>
 Martin Quinson (mquinson#debian.org)

=cut
author	Dongsheng Song <dongsheng.song@gmail.com>
date	Thu, 12 Mar 2009 15:43:56 +0800
parents
children