Mercurial > hgbook
comparison tools/po4a/lib/Locale/Po4a/TransTractor.pm @ 722:082bb76417f1
Add Po4a 0.37-dev(2009-03-08)
author | Dongsheng Song <dongsheng.song@gmail.com> |
---|---|
date | Thu, 12 Mar 2009 15:43:56 +0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
721:2180358c32c4 | 722:082bb76417f1 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 require Exporter; | |
4 | |
5 package Locale::Po4a::TransTractor; | |
6 use DynaLoader; | |
7 | |
8 use 5.006; | |
9 use strict; | |
10 use warnings; | |
11 | |
12 use subs qw(makespace); | |
13 use vars qw($VERSION @ISA @EXPORT); | |
14 $VERSION="0.36"; | |
15 @ISA = qw(DynaLoader); | |
16 @EXPORT = qw(new process translate | |
17 read write readpo writepo | |
18 getpoout setpoout); | |
19 | |
20 # Try to use a C extension if present. | |
21 eval("bootstrap Locale::Po4a::TransTractor $VERSION"); | |
22 | |
23 use Carp qw(croak); | |
24 use Locale::Po4a::Po; | |
25 use Locale::Po4a::Common; | |
26 | |
27 use File::Path; # mkdir before write | |
28 | |
29 use Encode; | |
30 use Encode::Guess; | |
31 | |
32 =head1 NAME | |
33 | |
34 Locale::Po4a::TransTractor - Generic trans(lator ex)tractor. | |
35 | |
36 =head1 DESCRIPTION | |
37 | |
38 The po4a (po for anything) project goal is to ease translations (and more | |
39 interestingly, the maintenance of translations) using gettext tools on | |
40 areas where they were not expected like documentation. | |
41 | |
42 This class is the ancestor of every po4a parsers used to parse a document to | |
43 search translatable strings, extract them to a po file and replace them by | |
44 their translation in the output document. | |
45 | |
46 More formally, it takes the following arguments as input: | |
47 | |
48 =over 2 | |
49 | |
50 =item - | |
51 | |
52 a document to translate ; | |
53 | |
54 =item - | |
55 | |
56 a po file containing the translations to use. | |
57 | |
58 =back | |
59 | |
60 As output, it produces: | |
61 | |
62 =over 2 | |
63 | |
64 =item - | |
65 | |
66 another po file, resulting of the extraction of translatable strings from | |
67 the input document ; | |
68 | |
69 =item - | |
70 | |
71 a translated document, with the same structure than the one in input, but | |
72 with all translatable strings replaced with the translations found in the | |
73 po file provided in input. | |
74 | |
75 =back | |
76 | |
77 Here is a graphical representation of this: | |
78 | |
79 Input document --\ /---> Output document | |
80 \ / (translated) | |
81 +-> parse() function -----+ | |
82 / \ | |
83 Input po --------/ \---> Output po | |
84 (extracted) | |
85 | |
86 =head1 FUNCTIONS YOUR PARSER SHOULD OVERRIDE | |
87 | |
88 =over 4 | |
89 | |
90 =item parse() | |
91 | |
92 This is where all the work takes place: the parsing of input documents, the | |
93 generation of output, and the extraction of the translatable strings. This | |
94 is pretty simple using the provided functions presented in the section | |
95 "INTERNAL FUNCTIONS" below. See also the synopsis, which present an | |
96 example. | |
97 | |
98 This function is called by the process() function bellow, but if you choose | |
99 to use the new() function, and to add content manually to your document, | |
100 you will have to call this function yourself. | |
101 | |
102 =item docheader() | |
103 | |
104 This function returns the header we should add to the produced document, | |
105 quoted properly to be a comment in the target language. See the section | |
106 "Educating developers about translations", from L<po4a(7)|po4a.7>, for what | |
107 it is good for. | |
108 | |
109 =back | |
110 | |
111 =cut | |
112 | |
113 sub docheader {} | |
114 | |
115 sub parse {} | |
116 | |
117 =head1 SYNOPSIS | |
118 | |
119 The following example parses a list of paragraphs beginning with "<p>". For the sake | |
120 of simplicity, we assume that the document is well formatted, i.e. that '<p>' | |
121 tags are the only tags present, and that this tag is at the very beginning | |
122 of each paragraph. | |
123 | |
124 sub parse { | |
125 my $self = shift; | |
126 | |
127 PARAGRAPH: while (1) { | |
128 my ($paragraph,$pararef)=("",""); | |
129 my $first=1; | |
130 my ($line,$lref)=$self->shiftline(); | |
131 while (defined($line)) { | |
132 if ($line =~ m/<p>/ && !$first--; ) { | |
133 # Not the first time we see <p>. | |
134 # Reput the current line in input, | |
135 # and put the built paragraph to output | |
136 $self->unshiftline($line,$lref); | |
137 | |
138 # Now that the document is formed, translate it: | |
139 # - Remove the leading tag | |
140 $paragraph =~ s/^<p>//s; | |
141 | |
142 # - push to output the leading tag (untranslated) and the | |
143 # rest of the paragraph (translated) | |
144 $self->pushline( "<p>" | |
145 . $document->translate($paragraph,$pararef) | |
146 ); | |
147 | |
148 next PARAGRAPH; | |
149 } else { | |
150 # Append to the paragraph | |
151 $paragraph .= $line; | |
152 $pararef = $lref unless(length($pararef)); | |
153 } | |
154 | |
155 # Reinit the loop | |
156 ($line,$lref)=$self->shiftline(); | |
157 } | |
158 # Did not get a defined line? End of input file. | |
159 return; | |
160 } | |
161 } | |
162 | |
163 Once you've implemented the parse function, you can use your document | |
164 class, using the public interface presented in the next section. | |
165 | |
166 =head1 PUBLIC INTERFACE for scripts using your parser | |
167 | |
168 =head2 Constructor | |
169 | |
170 =over 4 | |
171 | |
172 =item process(%) | |
173 | |
174 This function can do all you need to do with a po4a document in one | |
175 invocation. Its arguments must be packed as a hash. ACTIONS: | |
176 | |
177 =over 3 | |
178 | |
179 =item a. | |
180 | |
181 Reads all the po files specified in po_in_name | |
182 | |
183 =item b. | |
184 | |
185 Reads all original documents specified in file_in_name | |
186 | |
187 =item c. | |
188 | |
189 Parses the document | |
190 | |
191 =item d. | |
192 | |
193 Reads and applies all the addenda specified | |
194 | |
195 =item e. | |
196 | |
197 Writes the translated document to file_out_name (if given) | |
198 | |
199 =item f. | |
200 | |
201 Writes the extracted po file to po_out_name (if given) | |
202 | |
203 =back | |
204 | |
205 ARGUMENTS, beside the ones accepted by new() (with expected type): | |
206 | |
207 =over 4 | |
208 | |
209 =item file_in_name (@) | |
210 | |
211 List of filenames where we should read the input document. | |
212 | |
213 =item file_in_charset ($) | |
214 | |
215 Charset used in the input document (if it isn't specified, it will try | |
216 to detect it from the input document). | |
217 | |
218 =item file_out_name ($) | |
219 | |
220 Filename where we should write the output document. | |
221 | |
222 =item file_out_charset ($) | |
223 | |
224 Charset used in the output document (if it isn't specified, it will use | |
225 the po file charset). | |
226 | |
227 =item po_in_name (@) | |
228 | |
229 List of filenames where we should read the input po files from, containing | |
230 the translation which will be used to translate the document. | |
231 | |
232 =item po_out_name ($) | |
233 | |
234 Filename where we should write the output po file, containing the strings | |
235 extracted from the input document. | |
236 | |
237 =item addendum (@) | |
238 | |
239 List of filenames where we should read the addenda from. | |
240 | |
241 =item addendum_charset ($) | |
242 | |
243 Charset for the addenda. | |
244 | |
245 =back | |
246 | |
247 =item new(%) | |
248 | |
249 Create a new Po4a document. Accepted options (but be in a hash): | |
250 | |
251 =over 4 | |
252 | |
253 =item verbose ($) | |
254 | |
255 Sets the verbosity. | |
256 | |
257 =item debug ($) | |
258 | |
259 Sets the debugging. | |
260 | |
261 =back | |
262 | |
263 =cut | |
264 | |
265 sub process { | |
266 ## Determine if we were called via an object-ref or a classname | |
267 my $self = shift; | |
268 | |
269 ## Any remaining arguments are treated as initial values for the | |
270 ## hash that is used to represent this object. | |
271 my %params = @_; | |
272 | |
273 # Build the args for new() | |
274 my %newparams = (); | |
275 foreach (keys %params) { | |
276 next if ($_ eq 'po_in_name' || | |
277 $_ eq 'po_out_name' || | |
278 $_ eq 'file_in_name' || | |
279 $_ eq 'file_in_charset' || | |
280 $_ eq 'file_out_name' || | |
281 $_ eq 'file_out_charset' || | |
282 $_ eq 'addendum' || | |
283 $_ eq 'addendum_charset'); | |
284 $newparams{$_}=$params{$_}; | |
285 } | |
286 | |
287 $self->detected_charset($params{'file_in_charset'}); | |
288 $self->{TT}{'file_out_charset'}=$params{'file_out_charset'}; | |
289 if (defined($self->{TT}{'file_out_charset'}) and | |
290 length($self->{TT}{'file_out_charset'})) { | |
291 $self->{TT}{'file_out_encoder'} = find_encoding($self->{TT}{'file_out_charset'}); | |
292 } | |
293 $self->{TT}{'addendum_charset'}=$params{'addendum_charset'}; | |
294 | |
295 foreach my $file (@{$params{'po_in_name'}}) { | |
296 print STDERR "readpo($file)... " if $self->debug(); | |
297 $self->readpo($file); | |
298 print STDERR "done.\n" if $self->debug() | |
299 } | |
300 foreach my $file (@{$params{'file_in_name'}}) { | |
301 print STDERR "read($file)..." if $self->debug(); | |
302 $self->read($file); | |
303 print STDERR "done.\n" if $self->debug(); | |
304 } | |
305 print STDERR "parse..." if $self->debug(); | |
306 $self->parse(); | |
307 print STDERR "done.\n" if $self->debug(); | |
308 foreach my $file (@{$params{'addendum'}}) { | |
309 print STDERR "addendum($file)..." if $self->debug(); | |
310 $self->addendum($file) || die "An addendum failed\n"; | |
311 print STDERR "done.\n" if $self->debug(); | |
312 } | |
313 if (defined $params{'file_out_name'}) { | |
314 print STDERR "write(".$params{'file_out_name'}.")... " | |
315 if $self->debug(); | |
316 $self->write($params{'file_out_name'}); | |
317 print STDERR "done.\n" if $self->debug(); | |
318 } | |
319 if (defined $params{'po_out_name'}) { | |
320 print STDERR "writepo(".$params{'po_out_name'}.")... " | |
321 if $self->debug(); | |
322 $self->writepo($params{'po_out_name'}); | |
323 print STDERR "done.\n" if $self->debug(); | |
324 } | |
325 return $self; | |
326 } | |
327 | |
328 sub new { | |
329 ## Determine if we were called via an object-ref or a classname | |
330 my $this = shift; | |
331 my $class = ref($this) || $this; | |
332 my $self = { }; | |
333 my %options=@_; | |
334 ## Bless ourselves into the desired class and perform any initialization | |
335 bless $self, $class; | |
336 | |
337 ## initialize the plugin | |
338 # prevent the plugin from croaking on the options intended for Po.pm | |
339 $self->{options}{'porefs'} = ''; | |
340 # let the plugin parse the options and such | |
341 $self->initialize(%options); | |
342 | |
343 ## Create our private data | |
344 my %po_options; | |
345 $po_options{'porefs'} = $self->{options}{'porefs'}; | |
346 | |
347 # private data | |
348 $self->{TT}=(); | |
349 $self->{TT}{po_in}=Locale::Po4a::Po->new(); | |
350 $self->{TT}{po_out}=Locale::Po4a::Po->new(\%po_options); | |
351 # Warning, this is an array of array: | |
352 # The document is splited on lines, and for each | |
353 # [0] is the line content, [1] is the reference [2] the type | |
354 $self->{TT}{doc_in}=(); | |
355 $self->{TT}{doc_out}=(); | |
356 if (defined $options{'verbose'}) { | |
357 $self->{TT}{verbose} = $options{'verbose'}; | |
358 } | |
359 if (defined $options{'debug'}) { | |
360 $self->{TT}{debug} = $options{'debug'}; | |
361 } | |
362 # Input document is in ascii until we prove the opposite (in read()) | |
363 $self->{TT}{ascii_input}=1; | |
364 # We try not to use utf unless it's forced from the outside (in case the | |
365 # document isn't in ascii) | |
366 $self->{TT}{utf_mode}=0; | |
367 | |
368 | |
369 return $self; | |
370 } | |
371 | |
372 =back | |
373 | |
374 =head2 Manipulating document files | |
375 | |
376 =over 4 | |
377 | |
378 =item read($) | |
379 | |
380 Add another input document at the end of the existing one. The argument is | |
381 the filename to read. | |
382 | |
383 Please note that it does not parse anything. You should use the parse() | |
384 function when you're done with packing input files into the document. | |
385 | |
386 =cut | |
387 | |
388 #' | |
389 sub read() { | |
390 my $self=shift; | |
391 my $filename=shift | |
392 or croak wrap_msg(dgettext("po4a", "Can't read from file without having a filename")); | |
393 my $linenum=0; | |
394 | |
395 open INPUT,"<$filename" | |
396 or croak wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!); | |
397 while (defined (my $textline = <INPUT>)) { | |
398 $linenum++; | |
399 my $ref="$filename:$linenum"; | |
400 my @entry=($textline,$ref); | |
401 push @{$self->{TT}{doc_in}}, @entry; | |
402 | |
403 if (!defined($self->{TT}{'file_in_charset'})) { | |
404 # Detect if this file has non-ascii characters | |
405 if($self->{TT}{ascii_input}) { | |
406 my $decoder = guess_encoding($textline); | |
407 if (!ref($decoder) or $decoder !~ /Encode::XS=/) { | |
408 # We have detected a non-ascii line | |
409 $self->{TT}{ascii_input} = 0; | |
410 # Save the reference for future error message | |
411 $self->{TT}{non_ascii_ref} ||= $ref; | |
412 } | |
413 } | |
414 } | |
415 } | |
416 close INPUT | |
417 or croak wrap_msg(dgettext("po4a", "Can't close %s after reading: %s"), $filename, $!); | |
418 | |
419 } | |
420 | |
421 =item write($) | |
422 | |
423 Write the translated document to the given filename. | |
424 | |
425 =cut | |
426 | |
427 sub write { | |
428 my $self=shift; | |
429 my $filename=shift | |
430 or croak wrap_msg(dgettext("po4a", "Can't write to a file without filename")); | |
431 | |
432 my $fh; | |
433 if ($filename eq '-') { | |
434 $fh=\*STDOUT; | |
435 } else { | |
436 # make sure the directory in which we should write the localized file exists | |
437 my $dir = $filename; | |
438 if ($dir =~ m|/|) { | |
439 $dir =~ s|/[^/]*$||; | |
440 | |
441 File::Path::mkpath($dir, 0, 0755) # Croaks on error | |
442 if (length ($dir) && ! -e $dir); | |
443 } | |
444 open $fh,">$filename" | |
445 or croak wrap_msg(dgettext("po4a", "Can't write to %s: %s"), $filename, $!); | |
446 } | |
447 | |
448 map { print $fh $_ } $self->docheader(); | |
449 map { print $fh $_ } @{$self->{TT}{doc_out}}; | |
450 | |
451 if ($filename ne '-') { | |
452 close $fh or croak wrap_msg(dgettext("po4a", "Can't close %s after writing: %s"), $filename, $!); | |
453 } | |
454 | |
455 } | |
456 | |
457 =back | |
458 | |
459 =head2 Manipulating po files | |
460 | |
461 =over 4 | |
462 | |
463 =item readpo($) | |
464 | |
465 Add the content of a file (which name is passed in argument) to the | |
466 existing input po. The old content is not discarded. | |
467 | |
468 =item writepo($) | |
469 | |
470 Write the extracted po file to the given filename. | |
471 | |
472 =item stats() | |
473 | |
474 Returns some statistics about the translation done so far. Please note that | |
475 it's not the same statistics than the one printed by msgfmt | |
476 --statistic. Here, it's stats about recent usage of the po file, while | |
477 msgfmt reports the status of the file. It is a wrapper to the | |
478 Locale::Po4a::Po::stats_get function applied to the input po file. Example | |
479 of use: | |
480 | |
481 [normal use of the po4a document...] | |
482 | |
483 ($percent,$hit,$queries) = $document->stats(); | |
484 print "We found translations for $percent\% ($hit from $queries) of strings.\n"; | |
485 | |
486 =back | |
487 | |
488 =cut | |
489 | |
490 sub getpoout { | |
491 return $_[0]->{TT}{po_out}; | |
492 } | |
493 sub setpoout { | |
494 $_[0]->{TT}{po_out} = $_[1]; | |
495 } | |
496 sub readpo { | |
497 $_[0]->{TT}{po_in}->read($_[1]); | |
498 } | |
499 sub writepo { | |
500 $_[0]->{TT}{po_out}->write( $_[1] ); | |
501 } | |
502 sub stats { | |
503 return $_[0]->{TT}{po_in}->stats_get(); | |
504 } | |
505 | |
506 =head2 Manipulating addenda | |
507 | |
508 =over 4 | |
509 | |
510 =item addendum($) | |
511 | |
512 Please refer to L<po4a(7)|po4a.7> for more information on what addenda are, | |
513 and how translators should write them. To apply an addendum to the translated | |
514 document, simply pass its filename to this function and you are done ;) | |
515 | |
516 This function returns a non-null integer on error. | |
517 | |
518 =cut | |
519 | |
520 # Internal function to read the header. | |
521 sub addendum_parse { | |
522 my ($filename,$header)=shift; | |
523 | |
524 my ($errcode,$mode,$position,$boundary,$bmode,$content)= | |
525 (1,"","","","",""); | |
526 | |
527 unless (open (INS, "<$filename")) { | |
528 warn wrap_msg(dgettext("po4a", "Can't read from %s: %s"), $filename, $!); | |
529 goto END_PARSE_ADDFILE; | |
530 } | |
531 | |
532 unless (defined ($header=<INS>) && $header) { | |
533 warn wrap_msg(dgettext("po4a", "Can't read Po4a header from %s."), $filename); | |
534 goto END_PARSE_ADDFILE; | |
535 } | |
536 | |
537 unless ($header =~ s/PO4A-HEADER://i) { | |
538 warn wrap_msg(dgettext("po4a", "First line of %s does not look like a Po4a header."), $filename); | |
539 goto END_PARSE_ADDFILE; | |
540 } | |
541 foreach my $part (split(/;/,$header)) { | |
542 unless ($part =~ m/^\s*([^=]*)=(.*)$/) { | |
543 warn wrap_msg(dgettext("po4a", "Syntax error in Po4a header of %s, near \"%s\""), $filename, $part); | |
544 goto END_PARSE_ADDFILE; | |
545 } | |
546 my ($key,$value)=($1,$2); | |
547 $key=lc($key); | |
548 if ($key eq 'mode') { $mode=lc($value); | |
549 } elsif ($key eq 'position') { $position=$value; | |
550 } elsif ($key eq 'endboundary') { | |
551 $boundary=$value; | |
552 $bmode='after'; | |
553 } elsif ($key eq 'beginboundary') { | |
554 $boundary=$value; | |
555 $bmode='before'; | |
556 } else { | |
557 warn wrap_msg(dgettext("po4a", "Invalid argument in the Po4a header of %s: %s"), $filename, $key); | |
558 goto END_PARSE_ADDFILE; | |
559 } | |
560 } | |
561 | |
562 unless (length($mode)) { | |
563 warn wrap_msg(dgettext("po4a", "The Po4a header of %s does not define the mode."), $filename); | |
564 goto END_PARSE_ADDFILE; | |
565 } | |
566 unless ($mode eq "before" || $mode eq "after") { | |
567 warn wrap_msg(dgettext("po4a", "Mode invalid in the Po4a header of %s: should be 'before' or 'after' not %s."), $filename, $mode); | |
568 goto END_PARSE_ADDFILE; | |
569 } | |
570 | |
571 unless (length($position)) { | |
572 warn wrap_msg(dgettext("po4a", "The Po4a header of %s does not define the position."), $filename); | |
573 goto END_PARSE_ADDFILE; | |
574 } | |
575 unless ($mode eq "before" || length($boundary)) { | |
576 warn wrap_msg(dgettext("po4a", "No ending boundary given in the Po4a header, but mode=after.")); | |
577 goto END_PARSE_ADDFILE; | |
578 } | |
579 | |
580 while (defined(my $line = <INS>)) { | |
581 $content .= $line; | |
582 } | |
583 close INS; | |
584 | |
585 $errcode=0; | |
586 END_PARSE_ADDFILE: | |
587 return ($errcode,$mode,$position,$boundary,$bmode,$content); | |
588 } | |
589 | |
590 sub mychomp { | |
591 my ($str) = shift; | |
592 chomp($str); | |
593 return $str; | |
594 } | |
595 | |
596 sub addendum { | |
597 my ($self,$filename) = @_; | |
598 | |
599 print STDERR "Apply addendum $filename..." if $self->debug(); | |
600 unless ($filename) { | |
601 warn wrap_msg(dgettext("po4a", | |
602 "Can't apply addendum when not given the filename")); | |
603 return 0; | |
604 } | |
605 die wrap_msg(dgettext("po4a", "Addendum %s does not exist."), $filename) | |
606 unless -e $filename; | |
607 | |
608 my ($errcode,$mode,$position,$boundary,$bmode,$content)= | |
609 addendum_parse($filename); | |
610 return 0 if ($errcode); | |
611 | |
612 print STDERR "mode=$mode;pos=$position;bound=$boundary;bmode=$bmode;ctn=$content\n" | |
613 if $self->debug(); | |
614 | |
615 # We only recode the addendum if an origin charset is specified, else we | |
616 # suppose it's already in the output document's charset | |
617 if (defined($self->{TT}{'addendum_charset'}) && | |
618 length($self->{TT}{'addendum_charset'})) { | |
619 Encode::from_to($content,$self->{TT}{'addendum_charset'}, | |
620 $self->get_out_charset); | |
621 } | |
622 | |
623 my $found = scalar grep { /$position/ } @{$self->{TT}{doc_out}}; | |
624 if ($found == 0) { | |
625 warn wrap_msg(dgettext("po4a", | |
626 "No candidate position for the addendum %s."), $filename); | |
627 return 0; | |
628 } | |
629 if ($found > 1) { | |
630 warn wrap_msg(dgettext("po4a", | |
631 "More than one candidate position found for the addendum %s."), $filename); | |
632 return 0; | |
633 } | |
634 | |
635 if ($mode eq "before") { | |
636 if ($self->verbose() > 1 || $self->debug() ) { | |
637 map { print STDERR wrap_msg(dgettext("po4a", "Addendum '%s' applied before this line: %s"), $filename, $_) if (/$position/); | |
638 } @{$self->{TT}{doc_out}}; | |
639 } | |
640 @{$self->{TT}{doc_out}} = map { /$position/ ? ($content,$_) : $_ | |
641 } @{$self->{TT}{doc_out}}; | |
642 } else { | |
643 my @newres=(); | |
644 | |
645 do { | |
646 # make sure it doesnt whine on empty document | |
647 my $line = scalar @{$self->{TT}{doc_out}} ? shift @{$self->{TT}{doc_out}} : ""; | |
648 push @newres,$line; | |
649 my $outline=mychomp($line); | |
650 $outline =~ s/^[ \t]*//; | |
651 | |
652 if ($line =~ m/$position/) { | |
653 while ($line=shift @{$self->{TT}{doc_out}}) { | |
654 last if ($line=~/$boundary/); | |
655 push @newres,$line; | |
656 } | |
657 if (defined $line) { | |
658 if ($bmode eq 'before') { | |
659 print wrap_msg(dgettext("po4a", | |
660 "Addendum '%s' applied before this line: %s"), | |
661 $filename, $outline) | |
662 if ($self->verbose() > 1 || $self->debug()); | |
663 push @newres,$content; | |
664 push @newres,$line; | |
665 } else { | |
666 print wrap_msg(dgettext("po4a", | |
667 "Addendum '%s' applied after the line: %s."), | |
668 $filename, $outline) | |
669 if ($self->verbose() > 1 || $self->debug()); | |
670 push @newres,$line; | |
671 push @newres,$content; | |
672 } | |
673 } else { | |
674 print wrap_msg(dgettext("po4a", "Addendum '%s' applied at the end of the file."), $filename) | |
675 if ($self->verbose() > 1 || $self->debug()); | |
676 push @newres,$content; | |
677 } | |
678 } | |
679 } while (scalar @{$self->{TT}{doc_out}}); | |
680 @{$self->{TT}{doc_out}} = @newres; | |
681 } | |
682 print STDERR "done.\n" if $self->debug(); | |
683 return 1; | |
684 } | |
685 | |
686 =back | |
687 | |
688 =head1 INTERNAL FUNCTIONS used to write derivated parsers | |
689 | |
690 =head2 Getting input, providing output | |
691 | |
692 Four functions are provided to get input and return output. They are very | |
693 similar to shift/unshift and push/pop. The first pair is about input, while | |
694 the second is about output. Mnemonic: in input, you are interested in the | |
695 first line, what shift gives, and in output you want to add your result at | |
696 the end, like push does. | |
697 | |
698 =over 4 | |
699 | |
700 =item shiftline() | |
701 | |
702 This function returns the next line of the doc_in to be parsed and its | |
703 reference (packed as an array). | |
704 | |
705 =item unshiftline($$) | |
706 | |
707 Unshifts a line of the input document and its reference. | |
708 | |
709 =item pushline($) | |
710 | |
711 Push a new line to the doc_out. | |
712 | |
713 =item popline() | |
714 | |
715 Pop the last pushed line from the doc_out. | |
716 | |
717 =back | |
718 | |
719 =cut | |
720 | |
721 sub shiftline { | |
722 my ($line,$ref)=(shift @{$_[0]->{TT}{doc_in}}, | |
723 shift @{$_[0]->{TT}{doc_in}}); | |
724 return ($line,$ref); | |
725 } | |
726 sub unshiftline { | |
727 my $self = shift; | |
728 unshift @{$self->{TT}{doc_in}},@_; | |
729 } | |
730 | |
731 sub pushline { push @{$_[0]->{TT}{doc_out}}, $_[1] if defined $_[1]; } | |
732 sub popline { return pop @{$_[0]->{TT}{doc_out}}; } | |
733 | |
734 =head2 Marking strings as translatable | |
735 | |
736 One function is provided to handle the text which should be translated. | |
737 | |
738 =over 4 | |
739 | |
740 =item translate($$$) | |
741 | |
742 Mandatory arguments: | |
743 | |
744 =over 2 | |
745 | |
746 =item - | |
747 | |
748 A string to translate | |
749 | |
750 =item - | |
751 | |
752 The reference of this string (ie, position in inputfile) | |
753 | |
754 =item - | |
755 | |
756 The type of this string (ie, the textual description of its structural role | |
757 ; used in Locale::Po4a::Po::gettextization() ; see also L<po4a(7)|po4a.7>, | |
758 section I<Gettextization: how does it work?>) | |
759 | |
760 =back | |
761 | |
762 This function can also take some extra arguments. They must be organized as | |
763 a hash. For example: | |
764 | |
765 $self->translate("string","ref","type", | |
766 'wrap' => 1); | |
767 | |
768 =over | |
769 | |
770 =item wrap | |
771 | |
772 boolean indicating whether we can consider that whitespaces in string are | |
773 not important. If yes, the function canonizes the string before looking for | |
774 a translation or extracting it, and wraps the translation. | |
775 | |
776 =item wrapcol | |
777 | |
778 The column at which we should wrap (default: 76). | |
779 | |
780 =item comment | |
781 | |
782 An extra comment to add to the entry. | |
783 | |
784 =back | |
785 | |
786 Actions: | |
787 | |
788 =over 2 | |
789 | |
790 =item - | |
791 | |
792 Pushes the string, reference and type to po_out. | |
793 | |
794 =item - | |
795 | |
796 Returns the translation of the string (as found in po_in) so that the | |
797 parser can build the doc_out. | |
798 | |
799 =item - | |
800 | |
801 Handles the charsets to recode the strings before sending them to | |
802 po_out and before returning the translations. | |
803 | |
804 =back | |
805 | |
806 =back | |
807 | |
808 =cut | |
809 | |
810 sub translate { | |
811 my $self=shift; | |
812 my ($string,$ref,$type)=(shift,shift,shift); | |
813 my (%options)=@_; | |
814 | |
815 # my $validoption="wrap wrapcol"; | |
816 # my %validoption; | |
817 | |
818 return "" unless defined($string) && length($string); | |
819 | |
820 # map { $validoption{$_}=1 } (split(/ /,$validoption)); | |
821 # foreach (keys %options) { | |
822 # Carp::confess "internal error: translate() called with unknown arg $_. Valid options: $validoption" | |
823 # unless $validoption{$_}; | |
824 # } | |
825 | |
826 my $in_charset; | |
827 if ($self->{TT}{ascii_input}) { | |
828 $in_charset = "ascii"; | |
829 } else { | |
830 if (defined($self->{TT}{'file_in_charset'}) and | |
831 length($self->{TT}{'file_in_charset'}) and | |
832 $self->{TT}{'file_in_charset'} !~ m/ascii/i) { | |
833 $in_charset=$self->{TT}{'file_in_charset'}; | |
834 } else { | |
835 # FYI, the document charset have to be determined *before* we see the first | |
836 # string to recode. | |
837 die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref}) | |
838 } | |
839 } | |
840 | |
841 if ($self->{TT}{po_in}->get_charset ne "CHARSET") { | |
842 $string = encode_from_to($string, | |
843 $self->{TT}{'file_in_encoder'}, | |
844 $self->{TT}{po_in}{encoder}); | |
845 } | |
846 | |
847 if (defined $options{'wrapcol'} && $options{'wrapcol'} < 0) { | |
848 # FIXME: should be the parameter given with --width | |
849 $options{'wrapcol'} = 76 + $options{'wrapcol'}; | |
850 } | |
851 my $transstring = $self->{TT}{po_in}->gettext($string, | |
852 'wrap' => $options{'wrap'}||0, | |
853 'wrapcol' => $options{'wrapcol'}); | |
854 | |
855 if ($self->{TT}{po_in}->get_charset ne "CHARSET") { | |
856 my $out_encoder = $self->{TT}{'file_out_encoder'}; | |
857 unless (defined $out_encoder) { | |
858 $out_encoder = find_encoding($self->get_out_charset) | |
859 } | |
860 $transstring = encode_from_to($transstring, | |
861 $self->{TT}{po_in}{encoder}, | |
862 $out_encoder); | |
863 } | |
864 | |
865 # If the input document isn't completely in ascii, we should see what to | |
866 # do with the current string | |
867 unless ($self->{TT}{ascii_input}) { | |
868 my $out_charset = $self->{TT}{po_out}->get_charset; | |
869 # We set the output po charset | |
870 if ($out_charset eq "CHARSET") { | |
871 if ($self->{TT}{utf_mode}) { | |
872 $out_charset="utf-8"; | |
873 } else { | |
874 $out_charset=$in_charset; | |
875 } | |
876 $self->{TT}{po_out}->set_charset($out_charset); | |
877 } | |
878 if ( $in_charset !~ /^$out_charset$/i ) { | |
879 Encode::from_to($string,$in_charset,$out_charset); | |
880 if (defined($options{'comment'}) and length($options{'comment'})) { | |
881 Encode::from_to($options{'comment'},$in_charset,$out_charset); | |
882 } | |
883 } | |
884 } | |
885 | |
886 # the comments provided by the modules are automatic comments from the PO point of view | |
887 $self->{TT}{po_out}->push('msgid' => $string, | |
888 'reference' => $ref, | |
889 'type' => $type, | |
890 'automatic' => $options{'comment'}, | |
891 'wrap' => $options{'wrap'}||0, | |
892 'wrapcol' => $options{'wrapcol'}); | |
893 | |
894 # if ($self->{TT}{po_in}->get_charset ne "CHARSET") { | |
895 # Encode::from_to($transstring,$self->{TT}{po_in}->get_charset, | |
896 # $self->get_out_charset); | |
897 # } | |
898 | |
899 if ($options{'wrap'}||0) { | |
900 $transstring =~ s/( *)$//s; | |
901 my $trailing_spaces = $1||""; | |
902 $transstring =~ s/ *$//gm; | |
903 $transstring .= $trailing_spaces; | |
904 } | |
905 | |
906 return $transstring; | |
907 } | |
908 | |
909 =head2 Misc functions | |
910 | |
911 =over 4 | |
912 | |
913 =item verbose() | |
914 | |
915 Returns if the verbose option was passed during the creation of the | |
916 TransTractor. | |
917 | |
918 =cut | |
919 | |
920 sub verbose { | |
921 if (defined $_[1]) { | |
922 $_[0]->{TT}{verbose} = $_[1]; | |
923 } else { | |
924 return $_[0]->{TT}{verbose} || 0; # undef and 0 have the same meaning, but one generates warnings | |
925 } | |
926 } | |
927 | |
928 =item debug() | |
929 | |
930 Returns if the debug option was passed during the creation of the | |
931 TransTractor. | |
932 | |
933 =cut | |
934 | |
935 sub debug { | |
936 return $_[0]->{TT}{debug}; | |
937 } | |
938 | |
939 =item detected_charset($) | |
940 | |
941 This tells TransTractor that a new charset (the first argument) has been | |
942 detected from the input document. It can usually be read from the document | |
943 header. Only the first charset will remain, coming either from the | |
944 process() arguments or detected from the document. | |
945 | |
946 =cut | |
947 | |
948 sub detected_charset { | |
949 my ($self,$charset)=(shift,shift); | |
950 unless (defined($self->{TT}{'file_in_charset'}) and | |
951 length($self->{TT}{'file_in_charset'}) ) { | |
952 $self->{TT}{'file_in_charset'}=$charset; | |
953 if (defined $charset) { | |
954 $self->{TT}{'file_in_encoder'}=find_encoding($charset); | |
955 } | |
956 } | |
957 | |
958 if (defined $self->{TT}{'file_in_charset'} and | |
959 length $self->{TT}{'file_in_charset'} and | |
960 $self->{TT}{'file_in_charset'} !~ m/ascii/i) { | |
961 $self->{TT}{ascii_input}=0; | |
962 } | |
963 } | |
964 | |
965 =item get_out_charset() | |
966 | |
967 This function will return the charset that should be used in the output | |
968 document (usually useful to substitute the input document's detected charset | |
969 where it has been found). | |
970 | |
971 It will use the output charset specified in the command line. If it wasn't | |
972 specified, it will use the input po's charset, and if the input po has the | |
973 default "CHARSET", it will return the input document's charset, so that no | |
974 encoding is performed. | |
975 | |
976 =cut | |
977 | |
978 sub get_out_charset { | |
979 my $self=shift; | |
980 my $charset; | |
981 | |
982 # Use the value specified at the command line | |
983 if (defined($self->{TT}{'file_out_charset'}) and | |
984 length($self->{TT}{'file_out_charset'})) { | |
985 $charset=$self->{TT}{'file_out_charset'}; | |
986 } else { | |
987 if ($self->{TT}{utf_mode} && $self->{TT}{ascii_input}) { | |
988 $charset="utf-8"; | |
989 } else { | |
990 $charset=$self->{TT}{po_in}->get_charset; | |
991 $charset=$self->{TT}{'file_in_charset'} | |
992 if $charset eq "CHARSET" and | |
993 defined($self->{TT}{'file_in_charset'}) and | |
994 length($self->{TT}{'file_in_charset'}); | |
995 $charset="ascii" | |
996 if $charset eq "CHARSET"; | |
997 } | |
998 } | |
999 return $charset; | |
1000 } | |
1001 | |
1002 =item recode_skipped_text($) | |
1003 | |
1004 This function returns the recoded text passed as argument, from the input | |
1005 document's charset to the output document's one. This isn't needed when | |
1006 translating a string (translate() recodes everything itself), but it is when | |
1007 you skip a string from the input document and you want the output document to | |
1008 be consistent with the global encoding. | |
1009 | |
1010 =cut | |
1011 | |
1012 sub recode_skipped_text { | |
1013 my ($self,$text)=(shift,shift); | |
1014 unless ($self->{TT}{'ascii_input'}) { | |
1015 if(defined($self->{TT}{'file_in_charset'}) and | |
1016 length($self->{TT}{'file_in_charset'}) ) { | |
1017 $text = encode_from_to($text, | |
1018 $self->{TT}{'file_in_encoder'}, | |
1019 find_encoding($self->get_out_charset)); | |
1020 } else { | |
1021 die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref}) | |
1022 } | |
1023 } | |
1024 return $text; | |
1025 } | |
1026 | |
1027 | |
1028 # encode_from_to($,$,$) | |
1029 # | |
1030 # Encode the given text from one encoding to another one. | |
1031 # It differs from Encode::from_to because it does not take the name of the | |
1032 # encoding in argument, but the encoders (as returned by the | |
1033 # Encode::find_encoding(<name>) method). Thus it permits to save a bunch | |
1034 # of call to find_encoding. | |
1035 # | |
1036 # If the "from" encoding is undefined, it is considered as UTF-8 (or | |
1037 # ascii). | |
1038 # If the "to" encoding is undefined, it is considered as UTF-8. | |
1039 # | |
1040 sub encode_from_to { | |
1041 my ($text,$from,$to) = (shift,shift,shift); | |
1042 | |
1043 if (not defined $from) { | |
1044 # for ascii and UTF-8, no conversion needed to get an utf-8 | |
1045 # string. | |
1046 } else { | |
1047 $text = $from->decode($text, 0); | |
1048 } | |
1049 | |
1050 if (not defined $to) { | |
1051 # Already in UTF-8, no conversion needed | |
1052 } else { | |
1053 $text = $to->encode($text, 0); | |
1054 } | |
1055 | |
1056 return $text; | |
1057 } | |
1058 | |
1059 =back | |
1060 | |
1061 =head1 FUTURE DIRECTIONS | |
1062 | |
1063 One shortcoming of the current TransTractor is that it can't handle | |
1064 translated document containing all languages, like debconf templates, or | |
1065 .desktop files. | |
1066 | |
1067 To address this problem, the only interface changes needed are: | |
1068 | |
1069 =over 2 | |
1070 | |
1071 =item - | |
1072 | |
1073 take a hash as po_in_name (a list per language) | |
1074 | |
1075 =item - | |
1076 | |
1077 add an argument to translate to indicate the target language | |
1078 | |
1079 =item - | |
1080 | |
1081 make a pushline_all function, which would make pushline of its content for | |
1082 all language, using a map-like syntax: | |
1083 | |
1084 $self->pushline_all({ "Description[".$langcode."]=". | |
1085 $self->translate($line,$ref,$langcode) | |
1086 }); | |
1087 | |
1088 =back | |
1089 | |
1090 Will see if it's enough ;) | |
1091 | |
1092 =head1 AUTHORS | |
1093 | |
1094 Denis Barbier <barbier@linuxfr.org> | |
1095 Martin Quinson (mquinson#debian.org) | |
1096 Jordi Vilalta <jvprat@gmail.com> | |
1097 | |
1098 =cut | |
1099 | |
1100 1; |