Mercurial > pidgin.yaz
annotate po/check_po.pl @ 28590:a0bff269046a
A little more detail from the AIM docs
author | Mark Doliner <mark@kingant.net> |
---|---|
date | Sun, 06 Sep 2009 21:23:53 +0000 |
parents | 2046abd60e0a |
children |
rev | line source |
---|---|
24794
a675137fc598
other small German translation update
Björn Voigt <bjoern@cs.tu-berlin.de>
parents:
22015
diff
changeset
|
1 #!/usr/bin/env perl |
6238 | 2 # |
3 # check_po.pl - check po file translations for likely errors | |
4 # | |
5 # Written by David W. Pfitzner dwp@mso.anu.edu.au | |
6 # This script is hereby placed in the Public Domain. | |
7 # | |
8 # Various checks on po file translations: | |
9 # - printf-style format strings; | |
10 # - differences in trailing newlines; | |
11 # - empty (non-fuzzy) msgid; | |
12 # - likely whitespace errors on joining multi-line entries | |
13 # Ignores all fuzzy entries. | |
14 # | |
15 # Options: | |
16 # -x Don't do standard checks above (eg, just check one of below). | |
17 # -n Check newlines within strings; ie, that have equal numbers | |
18 # of newlines in msgstr and msgid. (Optional because this may | |
19 # happen legitimately.) | |
20 # -w Check leading whitespace. Sometimes whitespace is simply | |
21 # spacing (eg, for widget labels etc), or punctuation differences, | |
22 # so this may be ok. | |
23 # -W Check trailing whitespace. See -w above. | |
24 # -p Check trailing punctuation. | |
25 # -c Check capitalization of first non-whitespace character | |
26 # (only if [a-zA-Z]). | |
27 # -e Check on empty (c.q. new) msgstr | |
28 # | |
29 # Reads stdin (or filename args, via <>), writes any problems to stdout. | |
30 # | |
31 # Modified by Davide Pagnin nightmare@freeciv.it to support plural forms | |
32 # | |
33 # Version: 0.41 (2002-06-06) | |
34 | |
28577
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
35 # TODO: This script needs to be able to handle Farsi's %Id flag for |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
36 # number format specifiers. More information on how it works, see |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
37 # http://www.gnu.org/software/hello/manual/gettext/c_002dformat.html |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
38 # It's possible someone has already made this change... look around |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
39 # for an updated version of this script. |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28556
diff
changeset
|
40 |
6238 | 41 use strict; |
42 use vars qw($opt_c $opt_n $opt_p $opt_w $opt_W $opt_x $opt_e); | |
43 use Getopt::Std; | |
44 | |
45 getopts('cnpwWxe'); | |
46 | |
47 # Globals, for current po entry: | |
48 # | |
49 # Note that msgid and msgstr have newlines represented by the | |
50 # two characters '\' and 'n' (and similarly for other escapes). | |
51 | |
52 my @amsgid; # lines exactly as in input | |
53 my @amsgstr; | |
54 my $entryline; # lineno where entry starts | |
55 my $msgid; # lines joined by "" | |
56 my $msgstr; | |
57 my $is_fuzzy; | |
58 my $is_cformat; | |
59 my $state; # From constant values below. | |
60 my $did_print; # Whether we have printed this entry, to | |
61 # print only once for multiple problems. | |
62 | |
63 use constant S_LOOKING_START => 0; # looking for start of entry | |
64 use constant S_DOING_MSGID => 1; # doing msgid part | |
65 use constant S_DOING_MSGSTR => 2; # doing msgstr part | |
66 | |
67 # Initialize or reinitalize globals to prepare for new entry: | |
68 sub new_entry { | |
69 @amsgid = (); | |
70 @amsgstr = (); | |
71 $msgid = undef; | |
72 $msgstr = undef; | |
73 $entryline = 0; | |
74 $is_fuzzy = 0; | |
75 $is_cformat = 0; | |
76 $did_print = 0; | |
77 $state = S_LOOKING_START; | |
78 } | |
79 | |
80 # Nicely print either a "msgid" or "msgstr" (name is one of these) | |
81 # with given array of data. | |
82 sub print_one { | |
83 my $name = shift; | |
84 print " $name \"", join("\"\n \"", @_), "\"\n"; | |
85 } | |
86 | |
87 # Print a problem (args like print()), preceeded by entry unless | |
88 # we have already printed that: label, and msgid and msgstr. | |
89 # | |
90 sub print_problem { | |
91 unless ($did_print) { | |
92 print "ENTRY:", ($ARGV eq "-" ? "" : " ($ARGV, line $entryline)"), "\n"; | |
93 print_one("msgid", @amsgid); | |
94 print_one("msgstr", @amsgstr); | |
95 $did_print = 1; | |
96 } | |
97 print "*** ", @_; | |
98 } | |
99 | |
100 # Check final newline: probably, translations should end in a newline | |
101 # if and only if the original string does. | |
102 # (See also check_trailing_whitespace and check_num_newlines below.) | |
103 # | |
104 sub check_trailing_newlines { | |
105 if ($opt_x) { return; } | |
106 | |
107 my ($ichar, $schar); | |
108 | |
109 $ichar = (length($msgid)>=2) ? substr($msgid, -2, 2) : ""; | |
110 $schar = (length($msgstr)>=2) ? substr($msgstr, -2, 2) : ""; | |
111 | |
112 if ($ichar eq "\\n" && $schar ne "\\n") { | |
113 print_problem "Missing trailing newline\n"; | |
114 } | |
115 if ($ichar ne "\\n" && $schar eq "\\n") { | |
116 print_problem "Extra trailing newline\n"; | |
117 } | |
118 | |
119 } | |
120 | |
121 # Check leading whitespace. In general, any leading whitespace should | |
122 # be the same in msgstr and msgid -- but not always. | |
123 # | |
124 sub check_leading_whitespace { | |
125 unless ($opt_w) { return; } | |
126 | |
127 my ($id, $str); | |
128 | |
129 if ($msgid =~ m/^(\s+)/) { | |
130 $id = $1; | |
131 } else { | |
132 $id = ""; | |
133 } | |
134 if ($msgstr =~ m/^(\s+)/) { | |
135 $str = $1; | |
136 } else { | |
137 $str = ""; | |
138 } | |
139 if ($id ne $str) { | |
140 print_problem "Different leading whitespace\n"; | |
141 } | |
142 } | |
143 | |
144 # Check trailing whitespace. In general, any trailing whitespace should | |
145 # be the same in msgstr and msgid -- but not always. | |
146 # | |
147 sub check_trailing_whitespace { | |
148 unless ($opt_W) { return; } | |
149 | |
150 my ($id, $str); | |
151 | |
152 if ($msgid =~ m/((?:\s|\\n)+)$/) { | |
153 $id = $1; | |
154 } else { | |
155 $id = ""; | |
156 } | |
157 if ($msgstr =~ m/((?:\s|\\n)+)$/) { | |
158 $str = $1; | |
159 } else { | |
160 $str = ""; | |
161 } | |
162 if ($id ne $str) { | |
163 print_problem "Different trailing whitespace\n"; | |
164 } | |
165 } | |
166 | |
167 # Check equal numbers of newlines. In general ... etc. | |
168 # | |
169 sub check_num_newlines { | |
170 unless ($opt_n) { return; } | |
171 | |
172 my $num_i = ($msgid =~ m(\\n)g); | |
173 my $num_s = ($msgstr =~ m(\\n)g); | |
174 | |
175 if ($num_i != $num_s) { | |
176 print_problem "Mismatch in newline count\n"; | |
177 } | |
178 | |
179 } | |
180 | |
181 # Check capitalization of first non-whitespace character (for [a-zA-Z] | |
182 # only). In general ... etc. | |
183 # | |
184 sub check_leading_capitalization { | |
185 unless ($opt_c) { return; } | |
186 | |
187 my ($id, $str); | |
188 | |
189 if ($msgid =~ m/^\s*([a-zA-Z])/) { | |
190 $id = $1; | |
191 } | |
192 if ($msgstr =~ m/^\s*([a-zA-Z])/) { | |
193 $str = $1; | |
194 } | |
195 if (defined($id) && defined($str)) { | |
196 if (($id =~ /^[a-z]$/ && $str =~ /^[A-Z]$/) || | |
197 ($id =~ /^[A-Z]$/ && $str =~ /^[a-z]$/)) { | |
198 print_problem "Different leading capitalization\n"; | |
199 } | |
200 } | |
201 } | |
202 | |
203 # Check trailing 'punctuation' characters (ignoring trailing whitespace). | |
204 # In general .. etc. | |
205 # | |
206 sub check_trailing_punctuation { | |
207 unless ($opt_p) { return; } | |
208 | |
209 my ($id, $str); | |
210 | |
211 # Might want more characters: | |
212 if ($msgid =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { | |
213 $id = $1; | |
214 } else { | |
215 $id = ""; | |
216 } | |
217 if ($msgstr =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { | |
218 $str = $1; | |
219 } else { | |
220 $str = ""; | |
221 } | |
222 ##print "$id $str\n"; | |
223 if ($id ne $str) { | |
224 print_problem "Different trailing punctuation\n"; | |
225 } | |
226 } | |
227 | |
228 # Check that multiline strings have whitespace separation, since | |
229 # otherwise, eg: | |
230 # msgstr "this is a multiline" | |
231 # "string" | |
232 # expands to: | |
233 # "this is a multilinestring" | |
234 # | |
235 sub check_whitespace_joins { | |
236 if ($opt_x) { return; } | |
237 | |
238 my $ok = 1; | |
239 my $i = 0; | |
240 | |
241 foreach my $aref (\@amsgid, \@amsgstr) { | |
242 my $prev = undef; | |
243 LINE: | |
244 foreach my $line (@$aref) { | |
245 if (defined($prev) | |
246 && length($prev) | |
247 && $prev !~ /\s$/ | |
248 && $prev !~ /\\n$/ | |
249 && $line !~ /^\s/ | |
250 && $line !~ /^\\n/) | |
251 { | |
252 $ok = 0; | |
253 last LINE; | |
254 } | |
255 $prev = $line; | |
256 } | |
257 if (!$ok) { | |
258 print_problem("Possible non-whitespace line-join problem in ", | |
259 ($i==0 ? "msgid" : "msgstr"), " \n"); | |
260 } | |
261 $i++; | |
262 } | |
263 } | |
264 | |
265 # Check printf-style format entries. | |
266 # Non-trivial, because translation strings may use format specifiers | |
267 # out of order, or skip some specifiers etc. Also gettext marks | |
268 # anything with '%' as cformat, though not all are. | |
269 # | |
270 sub check_cformat { | |
271 unless ($is_cformat) { return; } | |
272 if ($opt_x) { return; } | |
273 | |
274 my (@iform, @sform); | |
275 @iform = ($msgid =~ m/\%[0-9\.\$]*[a-z]/g); | |
276 @sform = ($msgstr =~ m/\%[0-9\.\$]*[a-z]/g); | |
277 | |
278 ##print join("::", @iform), "\n"; | |
279 ##print join("::", @sform), "\n"; | |
280 | |
281 my $js; # index in sform | |
282 my $j; # index into iform | |
283 SFORM: | |
284 for ($js=0; $js < @sform; $js++) { | |
285 my $sf = $sform[$js]; | |
286 my $sf_orig = $sf; | |
287 if ($sf =~ s/^\%([0-9]+)\$(.*[a-z])$/\%$2/) { | |
288 $j = $1-1; | |
289 } else { | |
290 $j = $js; | |
291 } | |
292 if ($j > $#iform) { | |
293 print_problem("Format number mismatch for $sf_orig [msgstr:", | |
294 ($js+1), "]\n"); | |
295 next SFORM; | |
296 } | |
297 my $if = $iform[$j]; | |
298 if ($sf ne $if) { | |
299 print_problem("Format mismatch: $sf_orig [msgstr:", ($js+1), "]", | |
300 " vs $if [msgid:", ($j+1), "]\n"); | |
301 } | |
302 } | |
303 } | |
304 | |
305 # Run all individual checks on current entry, reporting any problems. | |
306 sub check_entry { | |
307 if ($is_fuzzy) { | |
308 return; | |
309 } | |
310 $msgid = join("", @amsgid); | |
311 $msgstr = join("", @amsgstr); | |
312 | |
313 unless ($opt_x) { | |
314 if (length($msgid)==0) { | |
315 print_problem "Zero length msgid\n"; | |
316 } | |
317 } | |
318 if (length($msgstr)==0) { | |
319 unless ($opt_e) { return; } | |
320 print_problem "Untranslated msgid\n"; | |
321 } | |
322 check_cformat; | |
323 check_whitespace_joins; | |
324 check_num_newlines; | |
325 check_leading_whitespace; | |
326 check_trailing_newlines; | |
327 check_trailing_whitespace; | |
328 check_leading_capitalization; | |
329 check_trailing_punctuation; | |
330 } | |
331 | |
332 new_entry; | |
333 | |
334 LINE: | |
335 while(<>) { | |
336 if ( m(^\s*$) ) { | |
337 if ($state==S_DOING_MSGSTR) { | |
338 check_entry; | |
339 new_entry; | |
340 } | |
341 next LINE; | |
342 } | |
343 if ( m(^\#, fuzzy) ) { | |
344 $is_fuzzy = 1; | |
345 } | |
346 if ( m(^\#, .*c-format) ) { | |
347 # .* is because can have fuzzy, c-format | |
348 $is_cformat = 1; | |
349 } | |
350 if ( m(^\#) ) { | |
351 next LINE; | |
352 } | |
353 if ( m(^msgid \"(.*)\"$) ) { | |
354 $entryline = $.; | |
355 @amsgid = ($1); | |
356 $state = S_DOING_MSGID; | |
357 next LINE; | |
358 } | |
359 if ( m(^msgid_plural \"(.*)\"$) ) { | |
360 $entryline = $.; | |
361 @amsgid = ($1); | |
362 $state = S_DOING_MSGID; | |
363 next LINE; | |
364 } | |
365 if ( m(^msgstr \"(.*)\"$) ) { | |
366 @amsgstr = ($1); | |
367 $state = S_DOING_MSGSTR; | |
368 next LINE; | |
369 } | |
28556
0d4b9ed6665a
Some of our translations actually have 6 plural forms.
Elliott Sales de Andrade <qulogic@pidgin.im>
parents:
24794
diff
changeset
|
370 if ( m(^msgstr\[[0-5]\] \"(.*)\"$) ) { |
6238 | 371 @amsgstr = ($1); |
372 $state = S_DOING_MSGSTR; | |
373 next LINE; | |
374 } | |
375 if ( m(^\"(.*)\"$) ) { | |
376 if ($state==S_DOING_MSGID) { | |
377 push @amsgid, $1; | |
378 } elsif($state==S_DOING_MSGSTR) { | |
379 push @amsgstr, $1; | |
380 } else { | |
381 die "Looking at string $_ in bad state $state,"; | |
382 } | |
383 next LINE; | |
384 } | |
385 die "Unexpected at $.: ", $_; | |
386 } |