#!/usr/bin/env perl## check_po.pl - check po file translations for likely errors## Written by David W. Pfitzner dwp@mso.anu.edu.au# This script is hereby placed in the Public Domain.## Various checks on po file translations:# - printf-style format strings;# - differences in trailing newlines;# - empty (non-fuzzy) msgid;# - likely whitespace errors on joining multi-line entries# Ignores all fuzzy entries.## Options:# -x Don't do standard checks above (eg, just check one of below).# -n Check newlines within strings; ie, that have equal numbers# of newlines in msgstr and msgid. (Optional because this may# happen legitimately.)# -w Check leading whitespace. Sometimes whitespace is simply # spacing (eg, for widget labels etc), or punctuation differences,# so this may be ok.# -W Check trailing whitespace. See -w above.# -p Check trailing punctuation.# -c Check capitalization of first non-whitespace character # (only if [a-zA-Z]).# -e Check on empty (c.q. new) msgstr## Reads stdin (or filename args, via <>), writes any problems to stdout.## Modified by Davide Pagnin nightmare@freeciv.it to support plural forms## Version: 0.41 (2002-06-06)# TODO: This script needs to be able to handle Farsi's %Id flag for# number format specifiers. More information on how it works, see# http://www.gnu.org/software/hello/manual/gettext/c_002dformat.html# It's possible someone has already made this change... look around# for an updated version of this script.use strict;use vars qw($opt_c $opt_n $opt_p $opt_w $opt_W $opt_x $opt_e);use Getopt::Std;getopts('cnpwWxe');# Globals, for current po entry:## Note that msgid and msgstr have newlines represented by the # two characters '\' and 'n' (and similarly for other escapes).my @amsgid; # lines exactly as in inputmy @amsgstr;my $entryline; # lineno where entry startsmy $msgid; # lines joined by ""my $msgstr;my $is_fuzzy;my $is_cformat;my $state; # From constant values below.my $did_print; # Whether we have printed this entry, to # print only once for multiple problems.use constant S_LOOKING_START => 0; # looking for start of entryuse constant S_DOING_MSGID => 1; # doing msgid partuse constant S_DOING_MSGSTR => 2; # doing msgstr part # Initialize or reinitalize globals to prepare for new entry:sub new_entry { @amsgid = (); @amsgstr = (); $msgid = undef; $msgstr = undef; $entryline = 0; $is_fuzzy = 0; $is_cformat = 0; $did_print = 0; $state = S_LOOKING_START;}# Nicely print either a "msgid" or "msgstr" (name is one of these) # with given array of data.sub print_one { my $name = shift; print " $name \"", join("\"\n \"", @_), "\"\n";}# Print a problem (args like print()), preceeded by entry unless# we have already printed that: label, and msgid and msgstr.#sub print_problem { unless ($did_print) { print "ENTRY:", ($ARGV eq "-" ? "" : " ($ARGV, line $entryline)"), "\n"; print_one("msgid", @amsgid); print_one("msgstr", @amsgstr); $did_print = 1; } print "*** ", @_;}# Check final newline: probably, translations should end in a newline # if and only if the original string does. # (See also check_trailing_whitespace and check_num_newlines below.)#sub check_trailing_newlines { if ($opt_x) { return; } my ($ichar, $schar); $ichar = (length($msgid)>=2) ? substr($msgid, -2, 2) : ""; $schar = (length($msgstr)>=2) ? substr($msgstr, -2, 2) : ""; if ($ichar eq "\\n" && $schar ne "\\n") { print_problem "Missing trailing newline\n"; } if ($ichar ne "\\n" && $schar eq "\\n") { print_problem "Extra trailing newline\n"; }}# Check leading whitespace. In general, any leading whitespace should # be the same in msgstr and msgid -- but not always.#sub check_leading_whitespace { unless ($opt_w) { return; } my ($id, $str); if ($msgid =~ m/^(\s+)/) { $id = $1; } else { $id = ""; } if ($msgstr =~ m/^(\s+)/) { $str = $1; } else { $str = ""; } if ($id ne $str) { print_problem "Different leading whitespace\n"; }}# Check trailing whitespace. In general, any trailing whitespace should # be the same in msgstr and msgid -- but not always.#sub check_trailing_whitespace { unless ($opt_W) { return; } my ($id, $str); if ($msgid =~ m/((?:\s|\\n)+)$/) { $id = $1; } else { $id = ""; } if ($msgstr =~ m/((?:\s|\\n)+)$/) { $str = $1; } else { $str = ""; } if ($id ne $str) { print_problem "Different trailing whitespace\n"; }}# Check equal numbers of newlines. In general ... etc.#sub check_num_newlines { unless ($opt_n) { return; } my $num_i = ($msgid =~ m(\\n)g); my $num_s = ($msgstr =~ m(\\n)g); if ($num_i != $num_s) { print_problem "Mismatch in newline count\n"; }}# Check capitalization of first non-whitespace character (for [a-zA-Z] # only). In general ... etc.#sub check_leading_capitalization { unless ($opt_c) { return; } my ($id, $str); if ($msgid =~ m/^\s*([a-zA-Z])/) { $id = $1; } if ($msgstr =~ m/^\s*([a-zA-Z])/) { $str = $1; } if (defined($id) && defined($str)) { if (($id =~ /^[a-z]$/ && $str =~ /^[A-Z]$/) || ($id =~ /^[A-Z]$/ && $str =~ /^[a-z]$/)) { print_problem "Different leading capitalization\n"; } }}# Check trailing 'punctuation' characters (ignoring trailing whitespace).# In general .. etc.#sub check_trailing_punctuation { unless ($opt_p) { return; } my ($id, $str); # Might want more characters: if ($msgid =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { $id = $1; } else { $id = ""; } if ($msgstr =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { $str = $1; } else { $str = ""; } ##print "$id $str\n"; if ($id ne $str) { print_problem "Different trailing punctuation\n"; }}# Check that multiline strings have whitespace separation, since# otherwise, eg:# msgstr "this is a multiline"# "string"# expands to:# "this is a multilinestring"#sub check_whitespace_joins { if ($opt_x) { return; } my $ok = 1; my $i = 0; foreach my $aref (\@amsgid, \@amsgstr) { my $prev = undef; LINE: foreach my $line (@$aref) { if (defined($prev) && length($prev) && $prev !~ /\s$/ && $prev !~ /\\n$/ && $line !~ /^\s/ && $line !~ /^\\n/) { $ok = 0; last LINE; } $prev = $line; } if (!$ok) { print_problem("Possible non-whitespace line-join problem in ", ($i==0 ? "msgid" : "msgstr"), " \n"); } $i++; }}# Check printf-style format entries.# Non-trivial, because translation strings may use format specifiers# out of order, or skip some specifiers etc. Also gettext marks# anything with '%' as cformat, though not all are.#sub check_cformat { unless ($is_cformat) { return; } if ($opt_x) { return; } my (@iform, @sform); @iform = ($msgid =~ m/\%[0-9\.\$]*[a-z]/g); @sform = ($msgstr =~ m/\%[0-9\.\$]*[a-z]/g); ##print join("::", @iform), "\n"; ##print join("::", @sform), "\n"; my $js; # index in sform my $j; # index into iform SFORM: for ($js=0; $js < @sform; $js++) { my $sf = $sform[$js]; my $sf_orig = $sf; if ($sf =~ s/^\%([0-9]+)\$(.*[a-z])$/\%$2/) { $j = $1-1; } else { $j = $js; } if ($j > $#iform) { print_problem("Format number mismatch for $sf_orig [msgstr:", ($js+1), "]\n"); next SFORM; } my $if = $iform[$j]; if ($sf ne $if) { print_problem("Format mismatch: $sf_orig [msgstr:", ($js+1), "]", " vs $if [msgid:", ($j+1), "]\n"); } }}# Run all individual checks on current entry, reporting any problems.sub check_entry { if ($is_fuzzy) { return; } $msgid = join("", @amsgid); $msgstr = join("", @amsgstr); unless ($opt_x) { if (length($msgid)==0) { print_problem "Zero length msgid\n"; } } if (length($msgstr)==0) { unless ($opt_e) { return; } print_problem "Untranslated msgid\n"; } check_cformat; check_whitespace_joins; check_num_newlines; check_leading_whitespace; check_trailing_newlines; check_trailing_whitespace; check_leading_capitalization; check_trailing_punctuation;}new_entry;LINE:while(<>) { if ( m(^\s*$) ) { if ($state==S_DOING_MSGSTR) { check_entry; new_entry; } next LINE; } if ( m(^\#, fuzzy) ) { $is_fuzzy = 1; } if ( m(^\#, .*c-format) ) { # .* is because can have fuzzy, c-format $is_cformat = 1; } if ( m(^\#) ) { next LINE; } if ( m(^msgid \"(.*)\"$) ) { $entryline = $.; @amsgid = ($1); $state = S_DOING_MSGID; next LINE; } if ( m(^msgid_plural \"(.*)\"$) ) { $entryline = $.; @amsgid = ($1); $state = S_DOING_MSGID; next LINE; } if ( m(^msgstr \"(.*)\"$) ) { @amsgstr = ($1); $state = S_DOING_MSGSTR; next LINE; } if ( m(^msgstr\[[0-5]\] \"(.*)\"$) ) { @amsgstr = ($1); $state = S_DOING_MSGSTR; next LINE; } if ( m(^\"(.*)\"$) ) { if ($state==S_DOING_MSGID) { push @amsgid, $1; } elsif($state==S_DOING_MSGSTR) { push @amsgstr, $1; } else { die "Looking at string $_ in bad state $state,"; } next LINE; } die "Unexpected at $.: ", $_;}