Mercurial > pidgin
annotate po/check_po.pl @ 30157:98de21a2aa9a
I found these by running "strings" on some MSN binary files. Doesn't seem
like that could be considered copyright infringement. They're not
used right now, but it seems like they could be useful in the future.
author | Mark Doliner <mark@kingant.net> |
---|---|
date | Thu, 03 Jun 2010 04:10:13 +0000 |
parents | 2046abd60e0a |
children |
rev | line source |
---|---|
24829
a675137fc598
other small German translation update
Björn Voigt <bjoern@cs.tu-berlin.de>
parents:
22015
diff
changeset
|
1 #!/usr/bin/env perl |
6238 | 2 # |
3 # check_po.pl - check po file translations for likely errors | |
4 # | |
5 # Written by David W. Pfitzner dwp@mso.anu.edu.au | |
6 # This script is hereby placed in the Public Domain. | |
7 # | |
8 # Various checks on po file translations: | |
9 # - printf-style format strings; | |
10 # - differences in trailing newlines; | |
11 # - empty (non-fuzzy) msgid; | |
12 # - likely whitespace errors on joining multi-line entries | |
13 # Ignores all fuzzy entries. | |
14 # | |
15 # Options: | |
16 # -x Don't do standard checks above (eg, just check one of below). | |
17 # -n Check newlines within strings; ie, that have equal numbers | |
18 # of newlines in msgstr and msgid. (Optional because this may | |
19 # happen legitimately.) | |
20 # -w Check leading whitespace. Sometimes whitespace is simply | |
21 # spacing (eg, for widget labels etc), or punctuation differences, | |
22 # so this may be ok. | |
23 # -W Check trailing whitespace. See -w above. | |
24 # -p Check trailing punctuation. | |
25 # -c Check capitalization of first non-whitespace character | |
26 # (only if [a-zA-Z]). | |
27 # -e Check on empty (c.q. new) msgstr | |
28 # | |
29 # Reads stdin (or filename args, via <>), writes any problems to stdout. | |
30 # | |
31 # Modified by Davide Pagnin nightmare@freeciv.it to support plural forms | |
32 # | |
33 # Version: 0.41 (2002-06-06) | |
34 | |
28218
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
35 # TODO: This script needs to be able to handle Farsi's %Id flag for |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
36 # number format specifiers. More information on how it works, see |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
37 # http://www.gnu.org/software/hello/manual/gettext/c_002dformat.html |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
38 # It's possible someone has already made this change... look around |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
39 # for an updated version of this script. |
2046abd60e0a
Add a note. This script shows a lot of false warnings for fa.po
Mark Doliner <mark@kingant.net>
parents:
28197
diff
changeset
|
40 |
6238 | 41 use strict; |
42 use vars qw($opt_c $opt_n $opt_p $opt_w $opt_W $opt_x $opt_e); | |
43 use Getopt::Std; | |
44 | |
45 getopts('cnpwWxe'); | |
46 | |
47 # Globals, for current po entry: | |
48 # | |
49 # Note that msgid and msgstr have newlines represented by the | |
50 # two characters '\' and 'n' (and similarly for other escapes). | |
51 | |
52 my @amsgid; # lines exactly as in input | |
53 my @amsgstr; | |
54 my $entryline; # lineno where entry starts | |
55 my $msgid; # lines joined by "" | |
56 my $msgstr; | |
57 my $is_fuzzy; | |
58 my $is_cformat; | |
59 my $state; # From constant values below. | |
60 my $did_print; # Whether we have printed this entry, to | |
61 # print only once for multiple problems. | |
62 | |
63 use constant S_LOOKING_START => 0; # looking for start of entry | |
64 use constant S_DOING_MSGID => 1; # doing msgid part | |
65 use constant S_DOING_MSGSTR => 2; # doing msgstr part | |
66 | |
67 # Initialize or reinitalize globals to prepare for new entry: | |
68 sub new_entry { | |
69 @amsgid = (); | |
70 @amsgstr = (); | |
71 $msgid = undef; | |
72 $msgstr = undef; | |
73 $entryline = 0; | |
74 $is_fuzzy = 0; | |
75 $is_cformat = 0; | |
76 $did_print = 0; | |
77 $state = S_LOOKING_START; | |
78 } | |
79 | |
80 # Nicely print either a "msgid" or "msgstr" (name is one of these) | |
81 # with given array of data. | |
82 sub print_one { | |
83 my $name = shift; | |
84 print " $name \"", join("\"\n \"", @_), "\"\n"; | |
85 } | |
86 | |
87 # Print a problem (args like print()), preceeded by entry unless | |
88 # we have already printed that: label, and msgid and msgstr. | |
89 # | |
90 sub print_problem { | |
91 unless ($did_print) { | |
92 print "ENTRY:", ($ARGV eq "-" ? "" : " ($ARGV, line $entryline)"), "\n"; | |
93 print_one("msgid", @amsgid); | |
94 print_one("msgstr", @amsgstr); | |
95 $did_print = 1; | |
96 } | |
97 print "*** ", @_; | |
98 } | |
99 | |
100 # Check final newline: probably, translations should end in a newline | |
101 # if and only if the original string does. | |
102 # (See also check_trailing_whitespace and check_num_newlines below.) | |
103 # | |
104 sub check_trailing_newlines { | |
105 if ($opt_x) { return; } | |
106 | |
107 my ($ichar, $schar); | |
108 | |
109 $ichar = (length($msgid)>=2) ? substr($msgid, -2, 2) : ""; | |
110 $schar = (length($msgstr)>=2) ? substr($msgstr, -2, 2) : ""; | |
111 | |
112 if ($ichar eq "\\n" && $schar ne "\\n") { | |
113 print_problem "Missing trailing newline\n"; | |
114 } | |
115 if ($ichar ne "\\n" && $schar eq "\\n") { | |
116 print_problem "Extra trailing newline\n"; | |
117 } | |
118 | |
119 } | |
120 | |
121 # Check leading whitespace. In general, any leading whitespace should | |
122 # be the same in msgstr and msgid -- but not always. | |
123 # | |
124 sub check_leading_whitespace { | |
125 unless ($opt_w) { return; } | |
126 | |
127 my ($id, $str); | |
128 | |
129 if ($msgid =~ m/^(\s+)/) { | |
130 $id = $1; | |
131 } else { | |
132 $id = ""; | |
133 } | |
134 if ($msgstr =~ m/^(\s+)/) { | |
135 $str = $1; | |
136 } else { | |
137 $str = ""; | |
138 } | |
139 if ($id ne $str) { | |
140 print_problem "Different leading whitespace\n"; | |
141 } | |
142 } | |
143 | |
144 # Check trailing whitespace. In general, any trailing whitespace should | |
145 # be the same in msgstr and msgid -- but not always. | |
146 # | |
147 sub check_trailing_whitespace { | |
148 unless ($opt_W) { return; } | |
149 | |
150 my ($id, $str); | |
151 | |
152 if ($msgid =~ m/((?:\s|\\n)+)$/) { | |
153 $id = $1; | |
154 } else { | |
155 $id = ""; | |
156 } | |
157 if ($msgstr =~ m/((?:\s|\\n)+)$/) { | |
158 $str = $1; | |
159 } else { | |
160 $str = ""; | |
161 } | |
162 if ($id ne $str) { | |
163 print_problem "Different trailing whitespace\n"; | |
164 } | |
165 } | |
166 | |
167 # Check equal numbers of newlines. In general ... etc. | |
168 # | |
169 sub check_num_newlines { | |
170 unless ($opt_n) { return; } | |
171 | |
172 my $num_i = ($msgid =~ m(\\n)g); | |
173 my $num_s = ($msgstr =~ m(\\n)g); | |
174 | |
175 if ($num_i != $num_s) { | |
176 print_problem "Mismatch in newline count\n"; | |
177 } | |
178 | |
179 } | |
180 | |
181 # Check capitalization of first non-whitespace character (for [a-zA-Z] | |
182 # only). In general ... etc. | |
183 # | |
184 sub check_leading_capitalization { | |
185 unless ($opt_c) { return; } | |
186 | |
187 my ($id, $str); | |
188 | |
189 if ($msgid =~ m/^\s*([a-zA-Z])/) { | |
190 $id = $1; | |
191 } | |
192 if ($msgstr =~ m/^\s*([a-zA-Z])/) { | |
193 $str = $1; | |
194 } | |
195 if (defined($id) && defined($str)) { | |
196 if (($id =~ /^[a-z]$/ && $str =~ /^[A-Z]$/) || | |
197 ($id =~ /^[A-Z]$/ && $str =~ /^[a-z]$/)) { | |
198 print_problem "Different leading capitalization\n"; | |
199 } | |
200 } | |
201 } | |
202 | |
203 # Check trailing 'punctuation' characters (ignoring trailing whitespace). | |
204 # In general .. etc. | |
205 # | |
206 sub check_trailing_punctuation { | |
207 unless ($opt_p) { return; } | |
208 | |
209 my ($id, $str); | |
210 | |
211 # Might want more characters: | |
212 if ($msgid =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { | |
213 $id = $1; | |
214 } else { | |
215 $id = ""; | |
216 } | |
217 if ($msgstr =~ m/([\\\.\/\,\!\?\"\'\:\;])+(?:\s|\\n)*$/) { | |
218 $str = $1; | |
219 } else { | |
220 $str = ""; | |
221 } | |
222 ##print "$id $str\n"; | |
223 if ($id ne $str) { | |
224 print_problem "Different trailing punctuation\n"; | |
225 } | |
226 } | |
227 | |
228 # Check that multiline strings have whitespace separation, since | |
229 # otherwise, eg: | |
230 # msgstr "this is a multiline" | |
231 # "string" | |
232 # expands to: | |
233 # "this is a multilinestring" | |
234 # | |
235 sub check_whitespace_joins { | |
236 if ($opt_x) { return; } | |
237 | |
238 my $ok = 1; | |
239 my $i = 0; | |
240 | |
241 foreach my $aref (\@amsgid, \@amsgstr) { | |
242 my $prev = undef; | |
243 LINE: | |
244 foreach my $line (@$aref) { | |
245 if (defined($prev) | |
246 && length($prev) | |
247 && $prev !~ /\s$/ | |
248 && $prev !~ /\\n$/ | |
249 && $line !~ /^\s/ | |
250 && $line !~ /^\\n/) | |
251 { | |
252 $ok = 0; | |
253 last LINE; | |
254 } | |
255 $prev = $line; | |
256 } | |
257 if (!$ok) { | |
258 print_problem("Possible non-whitespace line-join problem in ", | |
259 ($i==0 ? "msgid" : "msgstr"), " \n"); | |
260 } | |
261 $i++; | |
262 } | |
263 } | |
264 | |
265 # Check printf-style format entries. | |
266 # Non-trivial, because translation strings may use format specifiers | |
267 # out of order, or skip some specifiers etc. Also gettext marks | |
268 # anything with '%' as cformat, though not all are. | |
269 # | |
270 sub check_cformat { | |
271 unless ($is_cformat) { return; } | |
272 if ($opt_x) { return; } | |
273 | |
274 my (@iform, @sform); | |
275 @iform = ($msgid =~ m/\%[0-9\.\$]*[a-z]/g); | |
276 @sform = ($msgstr =~ m/\%[0-9\.\$]*[a-z]/g); | |
277 | |
278 ##print join("::", @iform), "\n"; | |
279 ##print join("::", @sform), "\n"; | |
280 | |
281 my $js; # index in sform | |
282 my $j; # index into iform | |
283 SFORM: | |
284 for ($js=0; $js < @sform; $js++) { | |
285 my $sf = $sform[$js]; | |
286 my $sf_orig = $sf; | |
287 if ($sf =~ s/^\%([0-9]+)\$(.*[a-z])$/\%$2/) { | |
288 $j = $1-1; | |
289 } else { | |
290 $j = $js; | |
291 } | |
292 if ($j > $#iform) { | |
293 print_problem("Format number mismatch for $sf_orig [msgstr:", | |
294 ($js+1), "]\n"); | |
295 next SFORM; | |
296 } | |
297 my $if = $iform[$j]; | |
298 if ($sf ne $if) { | |
299 print_problem("Format mismatch: $sf_orig [msgstr:", ($js+1), "]", | |
300 " vs $if [msgid:", ($j+1), "]\n"); | |
301 } | |
302 } | |
303 } | |
304 | |
305 # Run all individual checks on current entry, reporting any problems. | |
306 sub check_entry { | |
307 if ($is_fuzzy) { | |
308 return; | |
309 } | |
310 $msgid = join("", @amsgid); | |
311 $msgstr = join("", @amsgstr); | |
312 | |
313 unless ($opt_x) { | |
314 if (length($msgid)==0) { | |
315 print_problem "Zero length msgid\n"; | |
316 } | |
317 } | |
318 if (length($msgstr)==0) { | |
319 unless ($opt_e) { return; } | |
320 print_problem "Untranslated msgid\n"; | |
321 } | |
322 check_cformat; | |
323 check_whitespace_joins; | |
324 check_num_newlines; | |
325 check_leading_whitespace; | |
326 check_trailing_newlines; | |
327 check_trailing_whitespace; | |
328 check_leading_capitalization; | |
329 check_trailing_punctuation; | |
330 } | |
331 | |
332 new_entry; | |
333 | |
334 LINE: | |
335 while(<>) { | |
336 if ( m(^\s*$) ) { | |
337 if ($state==S_DOING_MSGSTR) { | |
338 check_entry; | |
339 new_entry; | |
340 } | |
341 next LINE; | |
342 } | |
343 if ( m(^\#, fuzzy) ) { | |
344 $is_fuzzy = 1; | |
345 } | |
346 if ( m(^\#, .*c-format) ) { | |
347 # .* is because can have fuzzy, c-format | |
348 $is_cformat = 1; | |
349 } | |
350 if ( m(^\#) ) { | |
351 next LINE; | |
352 } | |
353 if ( m(^msgid \"(.*)\"$) ) { | |
354 $entryline = $.; | |
355 @amsgid = ($1); | |
356 $state = S_DOING_MSGID; | |
357 next LINE; | |
358 } | |
359 if ( m(^msgid_plural \"(.*)\"$) ) { | |
360 $entryline = $.; | |
361 @amsgid = ($1); | |
362 $state = S_DOING_MSGID; | |
363 next LINE; | |
364 } | |
365 if ( m(^msgstr \"(.*)\"$) ) { | |
366 @amsgstr = ($1); | |
367 $state = S_DOING_MSGSTR; | |
368 next LINE; | |
369 } | |
28197
0d4b9ed6665a
Some of our translations actually have 6 plural forms.
Elliott Sales de Andrade <qulogic@pidgin.im>
parents:
24829
diff
changeset
|
370 if ( m(^msgstr\[[0-5]\] \"(.*)\"$) ) { |
6238 | 371 @amsgstr = ($1); |
372 $state = S_DOING_MSGSTR; | |
373 next LINE; | |
374 } | |
375 if ( m(^\"(.*)\"$) ) { | |
376 if ($state==S_DOING_MSGID) { | |
377 push @amsgid, $1; | |
378 } elsif($state==S_DOING_MSGSTR) { | |
379 push @amsgstr, $1; | |
380 } else { | |
381 die "Looking at string $_ in bad state $state,"; | |
382 } | |
383 next LINE; | |
384 } | |
385 die "Unexpected at $.: ", $_; | |
386 } |