comparison tools/po4a/lib/Locale/Po4a/Xml.pm @ 722:082bb76417f1

Add Po4a 0.37-dev(2009-03-08)
author Dongsheng Song <dongsheng.song@gmail.com>
date Thu, 12 Mar 2009 15:43:56 +0800
parents
children
comparison
equal deleted inserted replaced
721:2180358c32c4 722:082bb76417f1
1 #!/usr/bin/perl
2
3 # Po4a::Xml.pm
4 #
5 # extract and translate translatable strings from XML documents.
6 #
7 # This code extracts plain text from tags and attributes from generic
8 # XML documents, and it can be used as a base to build modules for
9 # XML-based documents.
10 #
11 # Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
12 # Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
13 #
14 # This program is free software; you can redistribute it and/or modify
15 # it under the terms of the GNU General Public License as published by
16 # the Free Software Foundation; either version 2 of the License, or
17 # (at your option) any later version.
18 #
19 # This program is distributed in the hope that it will be useful,
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 # GNU General Public License for more details.
23 #
24 # You should have received a copy of the GNU General Public License
25 # along with this program; if not, write to the Free Software
26 # Foundation, Inc.,
27 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 #
29 ########################################################################
30
31 =head1 NAME
32
33 Locale::Po4a::Xml - Convert XML documents and derivates from/to PO files
34
35 =head1 DESCRIPTION
36
37 The po4a (po for anything) project goal is to ease translations (and more
38 interestingly, the maintenance of translations) using gettext tools on
39 areas where they were not expected like documentation.
40
41 Locale::Po4a::Xml is a module to help the translation of XML documents into
42 other [human] languages. It can also be used as a base to build modules for
43 XML-based documents.
44
45 =cut
46
47 package Locale::Po4a::Xml;
48
49 use 5.006;
50 use strict;
51 use warnings;
52
53 require Exporter;
54 use vars qw(@ISA @EXPORT);
55 @ISA = qw(Locale::Po4a::TransTractor);
56 @EXPORT = qw(new initialize @tag_types);
57
58 use Locale::Po4a::TransTractor;
59 use Locale::Po4a::Common;
60 use Carp qw(croak);
61 use File::Basename;
62 use File::Spec;
63
64 #It will mantain the path from the root tag to the current one
65 my @path;
66
67 #It will contain a list of external entities and their attached paths
68 my %entities;
69
70 my @comments;
71
72 sub shiftline {
73 my $self = shift;
74 # call Transtractor's shiftline
75 my ($line,$ref) = $self->SUPER::shiftline();
76 return ($line,$ref) if (not defined $line);
77
78 for my $k (keys %entities) {
79 if ($line =~ m/^(.*?)&$k;(.*)$/s) {
80 my ($before, $after) = ($1, $2);
81 my $linenum=0;
82 my @textentries;
83
84 open (my $in, $entities{$k})
85 or croak wrap_mod("po4a::xml",
86 dgettext("po4a", "Can't read from %s: %s"),
87 $entities{$k}, $!);
88 while (defined (my $textline = <$in>)) {
89 $linenum++;
90 my $textref=$entities{$k}.":$linenum";
91 push @textentries, ($textline,$textref);
92 }
93 close $in
94 or croak wrap_mod("po4a::xml",
95 dgettext("po4a", "Can't close %s after reading: %s"),
96 $entities{$k}, $!);
97
98 push @textentries, ($after, $ref);
99 $line = $before.(shift @textentries);
100 $ref .= " ".(shift @textentries);
101 $self->unshiftline(@textentries);
102 }
103 }
104
105 return ($line,$ref);
106 }
107
108 sub read {
109 my ($self,$filename)=@_;
110 push @{$self->{DOCPOD}{infile}}, $filename;
111 $self->Locale::Po4a::TransTractor::read($filename);
112 }
113
114 sub parse {
115 my $self=shift;
116 map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
117 }
118
119 # @save_holders is a stack of references to ('paragraph', 'translation',
120 # 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
121 # paragraph is a reference to an array (see paragraph in the
122 # treat_content() subroutine) of strings followed by
123 # references. It contains the @paragraph array as it was
124 # before the processing was interrupted by a tag instroducing
125 # a placeholder.
126 # translation is the translation of this level up to now
127 # sub_translations is a reference to an array of strings containing the
128 # translations which must replace the placeholders.
129 # open is the tag which opened the placeholder.
130 # close is the tag which closed the placeholder.
131 # folded_attributes is an hash of tags with their attributes (<tag attrs=...>
132 # strings), referenced by the folded tag id, which should
133 # replace the <tag po4a-id=id> strings in the current
134 # translation.
135 #
136 # If @save_holders only has 1 holder, then we are not processing the
137 # content of an holder, we are translating the document.
138 my @save_holders;
139
140
141 # If we are at the bottom of the stack and there is no <placeholder ...> in
142 # the current translation, we can push the translation in the translated
143 # document.
144 # Otherwise, we keep the translation in the current holder.
145 sub pushline {
146 my ($self, $line) = (shift, shift);
147
148 my $holder = $save_holders[$#save_holders];
149 my $translation = $holder->{'translation'};
150 $translation .= $line;
151
152 while ( %{$holder->{folded_attributes}}
153 and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
154 my $begin = $1;
155 my $tag = $2;
156 my $id = $3;
157 my $end = $4;
158 if (defined $holder->{folded_attributes}->{$id}) {
159 # TODO: check if the tag is the same
160 $translation = $begin.$holder->{folded_attributes}->{$id}.$end;
161 delete $holder->{folded_attributes}->{$id};
162 } else {
163 # TODO: It will be hard to identify the location.
164 # => find a way to retrieve the reference.
165 die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id);
166 }
167 }
168 # TODO: check that %folded_attributes is empty at some time
169 # => in translate_paragraph?
170
171 if ( ($#save_holders > 0)
172 or ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s)) {
173 $holder->{'translation'} = $translation;
174 } else {
175 $self->SUPER::pushline($translation);
176 $holder->{'translation'} = '';
177 }
178 }
179
180 =head1 TRANSLATING WITH PO4A::XML
181
182 This module can be used directly to handle generic XML documents. This will
183 extract all tag's content, and no attributes, since it's where the text is
184 written in most XML based documents.
185
186 There are some options (described in the next section) that can customize
187 this behavior. If this doesn't fit to your document format you're encouraged
188 to write your own module derived from this, to describe your format's details.
189 See the section "Writing derivate modules" below, for the process description.
190
191 =cut
192
193 #
194 # Parse file and translate it
195 #
196 sub parse_file {
197 my ($self,$filename) = @_;
198 my $eof = 0;
199
200 while (!$eof) {
201 # We get all the text until the next breaking tag (not
202 # inline) and translate it
203 $eof = $self->treat_content;
204 if (!$eof) {
205 # And then we treat the following breaking tag
206 $eof = $self->treat_tag;
207 }
208 }
209 }
210
211 =head1 OPTIONS ACCEPTED BY THIS MODULE
212
213 The global debug option causes this module to show the excluded strings, in
214 order to see if it skips something important.
215
216 These are this module's particular options:
217
218 =over 4
219
220 =item B<nostrip>
221
222 Prevents it to strip the spaces around the extracted strings.
223
224 =item B<wrap>
225
226 Canonizes the string to translate, considering that whitespaces are not
227 important, and wraps the translated document. This option can be overridden
228 by custom tag options. See the "tags" option below.
229
230 =item B<caseinsensitive>
231
232 It makes the tags and attributes searching to work in a case insensitive
233 way. If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.
234
235 =item B<includeexternal>
236
237 When defined, external entities are included in the generated (translated)
238 document, and for the extraction of strings. If it's not defined, you
239 will have to translate external entities separately as independent
240 documents.
241
242 =item B<ontagerror>
243
244 This option defines the behavior of the module when it encounter a invalid
245 Xml syntax (a closing tag which does not match the last opening tag, or a
246 tag's attribute without value).
247 It can take the following values:
248
249 =over
250
251 =item I<fail>
252
253 This is the default value.
254 The module will exit with an error.
255
256 =item I<warn>
257
258 The module will continue, and will issue a warning.
259
260 =item I<silent>
261
262 The module will continue without any warnings.
263
264 =back
265
266 Be careful when using this option.
267 It is generally recommended to fix the input file.
268
269 =item B<tagsonly>
270
271 Extracts only the specified tags in the "tags" option. Otherwise, it
272 will extract all the tags except the ones specified.
273
274 Note: This option is deprecated.
275
276 =item B<doctype>
277
278 String that will try to match with the first line of the document's doctype
279 (if defined). If it doesn't, a warning will indicate that the document
280 might be of a bad type.
281
282 =item B<tags>
283
284 Space-separated list of tags you want to translate or skip. By default,
285 the specified tags will be excluded, but if you use the "tagsonly" option,
286 the specified tags will be the only ones included. The tags must be in the
287 form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
288 the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.
289
290 You can also specify some tag options putting some characters in front of
291 the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
292 to override the default behavior specified by the global "wrap" option.
293
294 Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
295
296 Note: This option is deprecated.
297 You should use the B<translated> and B<untranslated> options instead.
298
299 =item B<attributes>
300
301 Space-separated list of tag's attributes you want to translate. You can
302 specify the attributes by their name (for example, "lang"), but you can
303 prefix it with a tag hierarchy, to specify that this attribute will only be
304 translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
305 specifies that the lang attribute will only be translated if it's into an
306 E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.
307
308 =item B<foldattributes>
309
310 Do not translate attributes in inline tags.
311 Instead, replace all attributes of a tag by po4a-id=<id>.
312
313 This is useful when attributes shall not be translated, as this simplifies the
314 strings for translators, and avoids typos.
315
316 =item B<break>
317
318 Space-separated list of tags which should break the sequence.
319 By default, all tags break the sequence.
320
321 The tags must be in the form <aaa>, but you can join some
322 (<bbb><aaa>), if a tag (<aaa>) should only be considered
323 when it's into another tag (<bbb>).
324
325 =item B<inline>
326
327 Space-separated list of tags which should be treated as inline.
328 By default, all tags break the sequence.
329
330 The tags must be in the form <aaa>, but you can join some
331 (<bbb><aaa>), if a tag (<aaa>) should only be considered
332 when it's into another tag (<bbb>).
333
334 =item B<placeholder>
335
336 Space-separated list of tags which should be treated as placeholders.
337 Placeholders do not break the sequence, but the content of placeholders is
338 translated separately.
339
340 The location of the placeholder in its blocks will be marked with a string
341 similar to:
342
343 <placeholder type=\"footnote\" id=\"0\"/>
344
345 The tags must be in the form <aaa>, but you can join some
346 (<bbb><aaa>), if a tag (<aaa>) should only be considered
347 when it's into another tag (<bbb>).
348
349 =item B<nodefault>
350
351 Space separated list of tags that the module should not try to set by
352 default in any category.
353
354 =item B<cpp>
355
356 Support C preprocessor directives.
357 When this option is set, po4a will consider preprocessor directives as
358 paragraph separators.
359 This is important if the XML file must be preprocessed because otherwise
360 the directives may be inserted in the middle of lines if po4a consider it
361 belong to the current paragraph, and they won't be recognized by the
362 preprocessor.
363 Note: the preprocessor directives must only appear between tags
364 (they must not break a tag).
365
366 =item B<translated>
367
368 Space-separated list of tags you want to translate.
369
370 The tags must be in the form <aaa>, but you can join some
371 (<bbb><aaa>), if a tag (<aaa>) should only be considered
372 when it's into another tag (<bbb>).
373
374 You can also specify some tag options putting some characters in front of
375 the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
376 to overide the default behavior specified by the global "wrap" option.
377
378 Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
379
380 =item B<untranslated>
381
382 Space-separated list of tags you do not want to translate.
383
384 The tags must be in the form <aaa>, but you can join some
385 (<bbb><aaa>), if a tag (<aaa>) should only be considered
386 when it's into another tag (<bbb>).
387
388 =item B<defaulttranslateoption>
389
390 The default categories for tags that are not in any of the translated,
391 untranslated, break, inline, or placeholder.
392
393 This is a set of letters:
394
395 =over
396
397 =item I<w>
398
399 Tags should be translated and content can be re-wrapped.
400
401 =item I<W>
402
403 Tags should be translated and content should not be re-wrapped.
404
405 =item I<i>
406
407 Tags should be translated inline.
408
409 =item I<p>
410
411 Tags should be translated as placeholders.
412
413 =back
414
415 =back
416
417 =cut
418 # TODO: defaulttranslateoption
419 # w => indicate that it is only valid for translatable tags and do not
420 # care about inline/break/placeholder?
421 # ...
422
423 sub initialize {
424 my $self = shift;
425 my %options = @_;
426
427 # Reset the path
428 @path = ();
429
430 # Initialize the stack of holders
431 my @paragraph = ();
432 my @sub_translations = ();
433 my %folded_attributes;
434 my %holder = ('paragraph' => \@paragraph,
435 'translation' => "",
436 'sub_translations' => \@sub_translations,
437 'folded_attributes' => \%folded_attributes);
438 @save_holders = (\%holder);
439
440 $self->{options}{'nostrip'}=0;
441 $self->{options}{'wrap'}=0;
442 $self->{options}{'caseinsensitive'}=0;
443 $self->{options}{'tagsonly'}=0;
444 $self->{options}{'tags'}='';
445 $self->{options}{'break'}='';
446 $self->{options}{'translated'}='';
447 $self->{options}{'untranslated'}='';
448 $self->{options}{'defaulttranslateoption'}='';
449 $self->{options}{'attributes'}='';
450 $self->{options}{'foldattributes'}=0;
451 $self->{options}{'inline'}='';
452 $self->{options}{'placeholder'}='';
453 $self->{options}{'doctype'}='';
454 $self->{options}{'nodefault'}='';
455 $self->{options}{'includeexternal'}=0;
456 $self->{options}{'ontagerror'}="fail";
457 $self->{options}{'cpp'}=0;
458
459 $self->{options}{'verbose'}='';
460 $self->{options}{'debug'}='';
461
462 foreach my $opt (keys %options) {
463 if ($options{$opt}) {
464 die wrap_mod("po4a::xml",
465 dgettext("po4a", "Unknown option: %s"), $opt)
466 unless exists $self->{options}{$opt};
467 $self->{options}{$opt} = $options{$opt};
468 }
469 }
470 # Default options set by modules. Forbidden for users.
471 $self->{options}{'_default_translated'}='';
472 $self->{options}{'_default_untranslated'}='';
473 $self->{options}{'_default_break'}='';
474 $self->{options}{'_default_inline'}='';
475 $self->{options}{'_default_placeholder'}='';
476 $self->{options}{'_default_attributes'}='';
477
478 #It will maintain the list of the translatable tags
479 $self->{tags}=();
480 $self->{translated}=();
481 $self->{untranslated}=();
482 #It will maintain the list of the translatable attributes
483 $self->{attributes}=();
484 #It will maintain the list of the breaking tags
485 $self->{break}=();
486 #It will maintain the list of the inline tags
487 $self->{inline}=();
488 #It will maintain the list of the placeholder tags
489 $self->{placeholder}=();
490 #list of the tags that must not be set in the tags or inline category
491 #by this module or sub-module (unless specified in an option)
492 $self->{nodefault}=();
493
494 $self->treat_options;
495 }
496
497 =head1 WRITING DERIVATE MODULES
498
499 =head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE
500
501 The simplest customization is to define which tags and attributes you want
502 the parser to translate. This should be done in the initialize function.
503 First you should call the main initialize, to get the command-line options,
504 and then, append your custom definitions to the options hash. If you want
505 to treat some new options from command line, you should define them before
506 calling the main initialize:
507
508 $self->{options}{'new_option'}='';
509 $self->SUPER::initialize(%options);
510 $self->{options}{'_default_translated'}.=' <p> <head><title>';
511 $self->{options}{'attributes'}.=' <p>lang id';
512 $self->{options}{'_default_inline'}.=' <br>';
513 $self->treat_options;
514
515 You should use the B<_default_inline>, B<_default_break>,
516 B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
517 and B<_default_attributes> options in derivated modules. This allow users
518 to override the default behavior defined in your module with command line
519 options.
520
521 =head2 OVERRIDING THE found_string FUNCTION
522
523 Another simple step is to override the function "found_string", which
524 receives the extracted strings from the parser, in order to translate them.
525 There you can control which strings you want to translate, and perform
526 transformations to them before or after the translation itself.
527
528 It receives the extracted text, the reference on where it was, and a hash
529 that contains extra information to control what strings to translate, how
530 to translate them and to generate the comment.
531
532 The content of these options depends on the kind of string it is (specified in an
533 entry of this hash):
534
535 =over
536
537 =item type="tag"
538
539 The found string is the content of a translatable tag. The entry "tag_options"
540 contains the option characters in front of the tag hierarchy in the module
541 "tags" option.
542
543 =item type="attribute"
544
545 Means that the found string is the value of a translatable attribute. The
546 entry "attribute" has the name of the attribute.
547
548 =back
549
550 It must return the text that will replace the original in the translated
551 document. Here's a basic example of this function:
552
553 sub found_string {
554 my ($self,$text,$ref,$options)=@_;
555 $text = $self->translate($text,$ref,"type ".$options->{'type'},
556 'wrap'=>$self->{options}{'wrap'});
557 return $text;
558 }
559
560 There's another simple example in the new Dia module, which only filters
561 some strings.
562
563 =cut
564
565 sub found_string {
566 my ($self,$text,$ref,$options)=@_;
567
568 if ($text =~ m/^\s*$/s) {
569 return $text;
570 }
571
572 my $comment;
573 my $wrap = $self->{options}{'wrap'};
574
575 if ($options->{'type'} eq "tag") {
576 $comment = "Content of: ".$self->get_path;
577
578 if($options->{'tag_options'} =~ /w/) {
579 $wrap = 1;
580 }
581 if($options->{'tag_options'} =~ /W/) {
582 $wrap = 0;
583 }
584 } elsif ($options->{'type'} eq "attribute") {
585 $comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
586 } elsif ($options->{'type'} eq "CDATA") {
587 $comment = "CDATA";
588 $wrap = 0;
589 } else {
590 die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
591 }
592 $text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
593 return $text;
594 }
595
596 =head2 MODIFYING TAG TYPES (TODO)
597
598 This is a more complex one, but it enables a (almost) total customization.
599 It's based in a list of hashes, each one defining a tag type's behavior. The
600 list should be sorted so that the most general tags are after the most
601 concrete ones (sorted first by the beginning and then by the end keys). To
602 define a tag type you'll have to make a hash with the following keys:
603
604 =over 4
605
606 =item beginning
607
608 Specifies the beginning of the tag, after the "E<lt>".
609
610 =item end
611
612 Specifies the end of the tag, before the "E<gt>".
613
614 =item breaking
615
616 It says if this is a breaking tag class. A non-breaking (inline) tag is one
617 that can be taken as part of the content of another tag. It can take the
618 values false (0), true (1) or undefined. If you leave this undefined, you'll
619 have to define the f_breaking function that will say whether a concrete tag of
620 this class is a breaking tag or not.
621
622 =item f_breaking
623
624 It's a function that will tell if the next tag is a breaking one or not. It
625 should be defined if the "breaking" option is not.
626
627 =item f_extract
628
629 If you leave this key undefined, the generic extraction function will have to
630 extract the tag itself. It's useful for tags that can have other tags or
631 special structures in them, so that the main parser doesn't get mad. This
632 function receives a boolean that says if the tag should be removed from the
633 input stream or not.
634
635 =item f_translate
636
637 This function receives the tag (in the get_string_until() format) and returns
638 the translated tag (translated attributes or all needed transformations) as a
639 single string.
640
641 =back
642
643 =cut
644
645 ##### Generic XML tag types #####'
646
647 our @tag_types = (
648 { beginning => "!--#",
649 end => "--",
650 breaking => 0,
651 f_extract => \&tag_extract_comment,
652 f_translate => \&tag_trans_comment},
653 { beginning => "!--",
654 end => "--",
655 breaking => 0,
656 f_extract => \&tag_extract_comment,
657 f_translate => \&tag_trans_comment},
658 { beginning => "?xml",
659 end => "?",
660 breaking => 1,
661 f_translate => \&tag_trans_xmlhead},
662 { beginning => "?",
663 end => "?",
664 breaking => 1,
665 f_translate => \&tag_trans_procins},
666 { beginning => "!DOCTYPE",
667 end => "",
668 breaking => 1,
669 f_extract => \&tag_extract_doctype,
670 f_translate => \&tag_trans_doctype},
671 { beginning => "![CDATA[",
672 end => "",
673 breaking => 1,
674 f_extract => \&CDATA_extract,
675 f_translate => \&CDATA_trans},
676 { beginning => "/",
677 end => "",
678 f_breaking => \&tag_break_close,
679 f_translate => \&tag_trans_close},
680 { beginning => "",
681 end => "/",
682 f_breaking => \&tag_break_alone,
683 f_translate => \&tag_trans_alone},
684 { beginning => "",
685 end => "",
686 f_breaking => \&tag_break_open,
687 f_translate => \&tag_trans_open}
688 );
689
690 sub tag_extract_comment {
691 my ($self,$remove)=(shift,shift);
692 my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove});
693 return ($eof,@tag);
694 }
695
696 sub tag_trans_comment {
697 my ($self,@tag)=@_;
698 return $self->join_lines(@tag);
699 }
700
701 sub tag_trans_xmlhead {
702 my ($self,@tag)=@_;
703
704 # We don't have to translate anything from here: throw away references
705 my $tag = $self->join_lines(@tag);
706 $tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s;
707 my $in_charset=$3;
708 $self->detected_charset($in_charset);
709 my $out_charset=$self->get_out_charset;
710
711 if (defined $in_charset) {
712 $tag =~ s/$in_charset/$out_charset/;
713 } else {
714 if ($tag =~ m/standalone/) {
715 $tag =~ s/(standalone)/encoding="$out_charset" $1/;
716 } else {
717 $tag.= " encoding=\"$out_charset\"";
718 }
719 }
720
721 return $tag;
722 }
723
724 sub tag_trans_procins {
725 my ($self,@tag)=@_;
726 return $self->join_lines(@tag);
727 }
728
729 sub tag_extract_doctype {
730 my ($self,$remove)=(shift,shift);
731
732 # Check if there is an internal subset (between []).
733 my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});
734 my $parity = 0;
735 my $paragraph = "";
736 map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag;
737 my $found = 0;
738 if ($paragraph =~ m/<.*\[.*</s) {
739 $found = 1
740 }
741
742 if (not $found) {
743 ($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
744 } else {
745 ($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
746 }
747 return ($eof,@tag);
748 }
749
750 sub tag_trans_doctype {
751 # This check is not really reliable. There are system and public
752 # identifiers. Only the public one could be checked reliably.
753 my ($self,@tag)=@_;
754 if (defined $self->{options}{'doctype'} ) {
755 my $doctype = $self->{options}{'doctype'};
756 if ( $tag[0] !~ /\Q$doctype\E/i ) {
757 warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
758 }
759 }
760 my $i = 0;
761 my $basedir = $tag[1];
762 $basedir =~ s/:[0-9]+$//;
763 $basedir = dirname($basedir);
764
765 while ( $i < $#tag ) {
766 my $t = $tag[$i];
767 my $ref = $tag[$i+1];
768 if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
769 my $part1 = $1;
770 my $part2 = $2;
771 my $includenow = 0;
772 my $file = 0;
773 my $name = "";
774 if ($part2 =~ /^(%\s+)(.*)$/s ) {
775 $part1.= $1;
776 $part2 = $2;
777 $includenow = 1;
778 }
779 $part2 =~ /^(\S+)(\s+)(.*)$/s;
780 $name = $1;
781 $part1.= $1.$2;
782 $part2 = $3;
783 if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
784 $part1.= $1;
785 $part2 = $2;
786 $file = 1;
787 if ($self->{options}{'includeexternal'}) {
788 $entities{$name} = $part2;
789 $entities{$name} =~ s/^"?(.*?)".*$/$1/s;
790 $entities{$name} = File::Spec->catfile($basedir, $entities{$name});
791 }
792 }
793 if ((not $file) and (not $includenow)) {
794 if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
795 my $comment = "Content of the $name entity";
796 my $quote = $1;
797 my $text = $2;
798 $part2 = $3;
799 $text = $self->translate($text,
800 $ref,
801 $comment,
802 'wrap'=>1);
803 $t = $part1."$quote$text$quote$part2";
804 }
805 }
806 # print $part1."\n";
807 # print $name."\n";
808 # print $part2."\n";
809 }
810 $tag[$i] = $t;
811 $i += 2;
812 }
813 return $self->join_lines(@tag);
814 }
815
816 sub tag_break_close {
817 my ($self,@tag)=@_;
818 my $struct = $self->get_path;
819 my $options = $self->get_translate_options($struct);
820 if ($options =~ m/[ip]/) {
821 return 0;
822 } else {
823 return 1;
824 }
825 }
826
827 sub tag_trans_close {
828 my ($self,@tag)=@_;
829 my $name = $self->get_tag_name(@tag);
830
831 my $test = pop @path;
832 if (!defined($test) || $test ne $name ) {
833 my $ontagerror = $self->{options}{'ontagerror'};
834 if ($ontagerror eq "warn") {
835 warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
836 } elsif ($ontagerror ne "silent") {
837 die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
838 }
839 }
840 return $self->join_lines(@tag);
841 }
842
843 sub CDATA_extract {
844 my ($self,$remove)=(shift,shift);
845 my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove});
846
847 return ($eof, @tag);
848 }
849
850 sub CDATA_trans {
851 my ($self,@tag)=@_;
852 return $self->found_string($self->join_lines(@tag),
853 $tag[1],
854 {'type' => "CDATA"});
855 }
856
857 sub tag_break_alone {
858 my ($self,@tag)=@_;
859 my $struct = $self->get_path($self->get_tag_name(@tag));
860 if ($self->get_translate_options($struct) =~ m/i/) {
861 return 0;
862 } else {
863 return 1;
864 }
865 }
866
867 sub tag_trans_alone {
868 my ($self,@tag)=@_;
869 my $name = $self->get_tag_name(@tag);
870 push @path, $name;
871
872 $name = $self->treat_attributes(@tag);
873
874 pop @path;
875 return $name;
876 }
877
878 sub tag_break_open {
879 my ($self,@tag)=@_;
880 my $struct = $self->get_path($self->get_tag_name(@tag));
881 my $options = $self->get_translate_options($struct);
882 if ($options =~ m/[ip]/) {
883 return 0;
884 } else {
885 return 1;
886 }
887 }
888
889 sub tag_trans_open {
890 my ($self,@tag)=@_;
891 my $name = $self->get_tag_name(@tag);
892 push @path, $name;
893
894 $name = $self->treat_attributes(@tag);
895
896 return $name;
897 }
898
899 ##### END of Generic XML tag types #####
900
901 =head1 INTERNAL FUNCTIONS used to write derivated parsers
902
903 =head2 WORKING WITH TAGS
904
905 =over 4
906
907 =item get_path()
908
909 This function returns the path to the current tag from the document's root,
910 in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.
911
912 An additional array of tags (without brackets) can be passed in argument.
913 These path elements are added to the end of the current path.
914
915 =cut
916
917 sub get_path {
918 my $self = shift;
919 my @add = @_;
920 if ( @path > 0 or @add > 0 ) {
921 return "<".join("><",@path,@add).">";
922 } else {
923 return "outside any tag (error?)";
924 }
925 }
926
927 =item tag_type()
928
929 This function returns the index from the tag_types list that fits to the next
930 tag in the input stream, or -1 if it's at the end of the input file.
931
932 =cut
933
934 sub tag_type {
935 my $self = shift;
936 my ($line,$ref) = $self->shiftline();
937 my ($match1,$match2);
938 my $found = 0;
939 my $i = 0;
940
941 if (!defined($line)) { return -1; }
942
943 $self->unshiftline($line,$ref);
944 my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
945 my $line2 = $self->join_lines(@lines);
946 while (!$found && $i < @tag_types) {
947 ($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
948 if ($line =~ /^<\Q$match1\E/) {
949 if (!defined($tag_types[$i]->{f_extract})) {
950 #print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n";
951 if (defined($line2) and $line2 =~ /\Q$match2\E>$/) {
952 $found = 1;
953 #print "YES: <".$match1." ".$match2.">\n";
954 } else {
955 #print "NO: <".$match1." ".$match2.">\n";
956 $i++;
957 }
958 } else {
959 $found = 1;
960 }
961 } else {
962 $i++;
963 }
964 }
965 if (!$found) {
966 #It should never enter here, unless you undefine the most
967 #general tags (as <...>)
968 die "po4a::xml: Unknown tag type: ".$line."\n";
969 } else {
970 return $i;
971 }
972 }
973
974 =item extract_tag($$)
975
976 This function returns the next tag from the input stream without the beginning
977 and end, in an array form, to maintain the references from the input file. It
978 has two parameters: the type of the tag (as returned by tag_type) and a
979 boolean, that indicates if it should be removed from the input stream.
980
981 =cut
982
983 sub extract_tag {
984 my ($self,$type,$remove) = (shift,shift,shift);
985 my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
986 my ($eof,@tag);
987 if (defined($tag_types[$type]->{f_extract})) {
988 ($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove);
989 } else {
990 ($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});
991 }
992 $tag[0] =~ /^<\Q$match1\E(.*)$/s;
993 $tag[0] = $1;
994 $tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
995 $tag[$#tag-1] = $1;
996 return ($eof,@tag);
997 }
998
999 =item get_tag_name(@)
1000
1001 This function returns the name of the tag passed as an argument, in the array
1002 form returned by extract_tag.
1003
1004 =cut
1005
1006 sub get_tag_name {
1007 my ($self,@tag)=@_;
1008 $tag[0] =~ /^(\S*)/;
1009 return $1;
1010 }
1011
1012 =item breaking_tag()
1013
1014 This function returns a boolean that says if the next tag in the input stream
1015 is a breaking tag or not (inline tag). It leaves the input stream intact.
1016
1017 =cut
1018
1019 sub breaking_tag {
1020 my $self = shift;
1021 my $break;
1022
1023 my $type = $self->tag_type;
1024 if ($type == -1) { return 0; }
1025
1026 #print "TAG TYPE = ".$type."\n";
1027 $break = $tag_types[$type]->{breaking};
1028 if (!defined($break)) {
1029 # This tag's breaking depends on its content
1030 my ($eof,@lines) = $self->extract_tag($type,0);
1031 $break = &{$tag_types[$type]->{f_breaking}}($self,@lines);
1032 }
1033 #print "break = ".$break."\n";
1034 return $break;
1035 }
1036
1037 =item treat_tag()
1038
1039 This function translates the next tag from the input stream. Using each
1040 tag type's custom translation functions.
1041
1042 =cut
1043
1044 sub treat_tag {
1045 my $self = shift;
1046 my $type = $self->tag_type;
1047
1048 my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
1049 my ($eof,@lines) = $self->extract_tag($type,1);
1050
1051 $lines[0] =~ /^(\s*)(.*)$/s;
1052 my $space1 = $1;
1053 $lines[0] = $2;
1054 $lines[$#lines-1] =~ /^(.*?)(\s*)$/s;
1055 my $space2 = $2;
1056 $lines[$#lines-1] = $1;
1057
1058 # Calling this tag type's specific handling (translation of
1059 # attributes...)
1060 my $line = &{$tag_types[$type]->{f_translate}}($self,@lines);
1061 $self->pushline("<".$match1.$space1.$line.$space2.$match2.">");
1062 return $eof;
1063 }
1064
1065 =item tag_in_list($@)
1066
1067 This function returns a string value that says if the first argument (a tag
1068 hierarchy) matches any of the tags from the second argument (a list of tags
1069 or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
1070 matched tag's options (the characters in front of the tag) or 1 (if that tag
1071 doesn't have options).
1072
1073 =back
1074
1075 =cut
1076 sub tag_in_list ($$$) {
1077 my ($self,$path,$list) = @_;
1078 if ($self->{options}{'caseinsensitive'}) {
1079 $path = lc $path;
1080 }
1081
1082 while (1) {
1083 if (defined $list->{$path}) {
1084 if (length $list->{$path}) {
1085 return $list->{$path};
1086 } else {
1087 return 1;
1088 }
1089 }
1090 last unless ($path =~ m/</);
1091 $path =~ s/^<.*?>//;
1092 }
1093
1094 return 0;
1095 }
1096
1097 =head2 WORKING WITH ATTRIBUTES
1098
1099 =over 4
1100
1101 =item treat_attributes(@)
1102
1103 This function handles the translation of the tags' attributes. It receives the tag
1104 without the beginning / end marks, and then it finds the attributes, and it
1105 translates the translatable ones (specified by the module option "attributes").
1106 This returns a plain string with the translated tag.
1107
1108 =back
1109
1110 =cut
1111
1112 sub treat_attributes {
1113 my ($self,@tag)=@_;
1114
1115 $tag[0] =~ /^(\S*)(.*)/s;
1116 my $text = $1;
1117 $tag[0] = $2;
1118
1119 while (@tag) {
1120 my $complete = 1;
1121
1122 $text .= $self->skip_spaces(\@tag);
1123 if (@tag) {
1124 # Get the attribute's name
1125 $complete = 0;
1126
1127 $tag[0] =~ /^([^\s=]+)(.*)/s;
1128 my $name = $1;
1129 my $ref = $tag[1];
1130 $tag[0] = $2;
1131 $text .= $name;
1132 $text .= $self->skip_spaces(\@tag);
1133 if (@tag) {
1134 # Get the '='
1135 if ($tag[0] =~ /^=(.*)/s) {
1136 $tag[0] = $1;
1137 $text .= "=";
1138 $text .= $self->skip_spaces(\@tag);
1139 if (@tag) {
1140 # Get the value
1141 my $value="";
1142 $ref=$tag[1];
1143 my $quot=substr($tag[0],0,1);
1144 if ($quot ne "\"" and $quot ne "'") {
1145 # Unquoted value
1146 $quot="";
1147 $tag[0] =~ /^(\S+)(.*)/s;
1148 $value = $1;
1149 $tag[0] = $2;
1150 } else {
1151 # Quoted value
1152 $text .= $quot;
1153 $tag[0] =~ /^\Q$quot\E(.*)/s;
1154 $tag[0] = $1;
1155 while ($tag[0] !~ /\Q$quot\E/) {
1156 $value .= $tag[0];
1157 shift @tag;
1158 shift @tag;
1159 }
1160 $tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
1161 $value .= $1;
1162 $tag[0] = $2;
1163 }
1164 $complete = 1;
1165 if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) {
1166 $text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
1167 } else {
1168 print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
1169 if $self->debug();
1170 $text .= $self->recode_skipped_text($value);
1171 }
1172 $text .= $quot;
1173 }
1174 }
1175 }
1176
1177 unless ($complete) {
1178 my $ontagerror = $self->{options}{'ontagerror'};
1179 if ($ontagerror eq "warn") {
1180 warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax. Continuing..."));
1181 } elsif ($ontagerror ne "silent") {
1182 die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
1183 }
1184 }
1185 }
1186 }
1187 return $text;
1188 }
1189
1190 # Returns an empty string if the content in the $path should not be
1191 # translated.
1192 #
1193 # Otherwise, returns the set of options for translation:
1194 # w: the content shall be re-wrapped
1195 # W: the content shall not be re-wrapped
1196 # i: the tag shall be inlined
1197 # p: a placeholder shall replace the tag (and its content)
1198 #
1199 # A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
1200 my %translate_options_cache;
1201 sub get_translate_options {
1202 my $self = shift;
1203 my $path = shift;
1204
1205 if (defined $translate_options_cache{$path}) {
1206 return $translate_options_cache{$path};
1207 }
1208
1209 my $options = "";
1210 my $translate = 0;
1211 my $usedefault = 1;
1212
1213 my $inlist = 0;
1214 my $tag = $self->get_tag_from_list($path, $self->{tags});
1215 if (defined $tag) {
1216 $inlist = 1;
1217 }
1218 if ($self->{options}{'tagsonly'} eq $inlist) {
1219 $usedefault = 0;
1220 if (defined $tag) {
1221 $options = $tag;
1222 $options =~ s/<.*$//;
1223 } else {
1224 if ($self->{options}{'wrap'}) {
1225 $options = "w";
1226 } else {
1227 $options = "W";
1228 }
1229 }
1230 $translate = 1;
1231 }
1232
1233 # TODO: a less precise set of tags should not override a more precise one
1234 # The tags and tagsonly options are deprecated.
1235 # The translated and untranslated options have an higher priority.
1236 $tag = $self->get_tag_from_list($path, $self->{translated});
1237 if (defined $tag) {
1238 $usedefault = 0;
1239 $options = $tag;
1240 $options =~ s/<.*$//;
1241 $translate = 1;
1242 }
1243
1244 if ($translate and $options !~ m/w/i) {
1245 $options .= ($self->{options}{'wrap'})?"w":"W";
1246 }
1247
1248 if (not defined $tag) {
1249 $tag = $self->get_tag_from_list($path, $self->{untranslated});
1250 if (defined $tag) {
1251 $usedefault = 0;
1252 $options = "";
1253 $translate = 0;
1254 }
1255 }
1256
1257 $tag = $self->get_tag_from_list($path, $self->{inline});
1258 if (defined $tag) {
1259 $usedefault = 0;
1260 $options .= "i";
1261 } else {
1262 $tag = $self->get_tag_from_list($path, $self->{placeholder});
1263 if (defined $tag) {
1264 $usedefault = 0;
1265 $options .= "p";
1266 }
1267 }
1268
1269 if ($usedefault) {
1270 $options = $self->{options}{'defaulttranslateoption'};
1271 }
1272
1273 # A translatable inline tag in an untranslated tag is treated as a
1274 # translatable breaking tag.
1275 if ($options =~ m/i/) {
1276 my $ppath = $path;
1277 $ppath =~ s/<[^>]*>$//;
1278 my $poptions = $self->get_translate_options ($ppath);
1279 if ($poptions eq "") {
1280 $options =~ s/i//;
1281 }
1282 }
1283
1284 if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
1285 $options .= "f";
1286 }
1287
1288 $translate_options_cache{$path} = $options;
1289 return $options;
1290 }
1291
1292
1293 # Return the tag (or biggest set of tags) of a list which matches with the
1294 # given path.
1295 #
1296 # The tag (or set of tags) is returned with its options.
1297 #
1298 # If no tags could match the path, undef is returned.
1299 sub get_tag_from_list ($$$) {
1300 my ($self,$path,$list) = @_;
1301 if ($self->{options}{'caseinsensitive'}) {
1302 $path = lc $path;
1303 }
1304
1305 while (1) {
1306 if (defined $list->{$path}) {
1307 return $list->{$path}.$path;
1308 }
1309 last unless ($path =~ m/</);
1310 $path =~ s/^<.*?>//;
1311 }
1312
1313 return undef;
1314 }
1315
1316
1317
1318 sub treat_content {
1319 my $self = shift;
1320 my $blank="";
1321 # Indicates if the paragraph will have to be translated
1322 my $translate = "";
1323
1324 my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});
1325
1326 while (!$eof and !$self->breaking_tag) {
1327 NEXT_TAG:
1328 my @text;
1329 my $type = $self->tag_type;
1330 my $f_extract = $tag_types[$type]->{'f_extract'};
1331 if ( defined($f_extract)
1332 and $f_extract eq \&tag_extract_comment) {
1333 # Remove the content of the comments
1334 ($eof, @text) = $self->extract_tag($type,1);
1335 $text[$#text-1] .= "\0";
1336 if ($tag_types[$type]->{'beginning'} eq "!--#") {
1337 $text[0] = "#".$text[0];
1338 }
1339 push @comments, @text;
1340 } else {
1341 my ($tmpeof, @tag) = $self->extract_tag($type,0);
1342 # Append the found inline tag
1343 ($eof,@text)=$self->get_string_until('>',
1344 {include=>1,
1345 remove=>1,
1346 unquoted=>1});
1347 # Append or remove the opening/closing tag from
1348 # the tag path
1349 if ($tag_types[$type]->{'end'} eq "") {
1350 if ($tag_types[$type]->{'beginning'} eq "") {
1351 # Opening inline tag
1352 my $cur_tag_name = $self->get_tag_name(@tag);
1353 my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
1354 if ($t_opts =~ m/p/) {
1355 # We enter a new holder.
1356 # Append a <placeholder ...> tag to the current
1357 # paragraph, and save the @paragraph in the
1358 # current holder.
1359 my $last_holder = $save_holders[$#save_holders];
1360 my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
1361 push @paragraph, ($placeholder_str, $text[1]);
1362 my @saved_paragraph = @paragraph;
1363
1364 $last_holder->{'paragraph'} = \@saved_paragraph;
1365
1366 # Then we must push a new holder
1367 my @new_paragraph = ();
1368 my @sub_translations = ();
1369 my %folded_attributes;
1370 my %new_holder = ('paragraph' => \@new_paragraph,
1371 'open' => $text[0],
1372 'translation' => "",
1373 'close' => undef,
1374 'sub_translations' => \@sub_translations,
1375 'folded_attributes' => \%folded_attributes);
1376 push @save_holders, \%new_holder;
1377 @text = ();
1378
1379 # The current @paragraph
1380 # (for the current holder)
1381 # is empty.
1382 @paragraph = ();
1383 } elsif ($t_opts =~ m/f/) {
1384 my $tag_full = $self->join_lines(@text);
1385 my $tag_ref = $text[1];
1386 if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
1387 my $holder = $save_holders[$#save_holders];
1388 my $id = 0;
1389 foreach (keys %{$holder->{folded_attributes}}) {
1390 $id = $_ + 1 if ($_ >= $id);
1391 }
1392 $holder->{folded_attributes}->{$id} = $tag_full;
1393
1394 @text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
1395 }
1396 }
1397 push @path, $cur_tag_name;
1398 } elsif ($tag_types[$type]->{'beginning'} eq "/") {
1399 # Closing inline tag
1400
1401 # Check if this is closing the
1402 # last opening tag we detected.
1403 my $test = pop @path;
1404 my $name = $self->get_tag_name(@tag);
1405 if (!defined($test) ||
1406 $test ne $name ) {
1407 my $ontagerror = $self->{options}{'ontagerror'};
1408 if ($ontagerror eq "warn") {
1409 warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong. Continuing..."), $name);
1410 } elsif ($ontagerror ne "silent") {
1411 die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
1412 }
1413 }
1414
1415 if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
1416 # This closes the current holder.
1417
1418 push @path, $self->get_tag_name(@tag);
1419 # Now translate this paragraph if needed.
1420 # This will call pushline and append the
1421 # translation to the current holder's translation.
1422 $self->translate_paragraph(@paragraph);
1423 pop @path;
1424
1425 # Now that this holder is closed, we can remove
1426 # the holder from the stack.
1427 my $holder = pop @save_holders;
1428 # We need to keep the translation of this holder
1429 my $translation = $holder->{'open'}.$holder->{'translation'}.$text[0];
1430 # FIXME: @text could be multilines.
1431
1432 @text = ();
1433
1434 # Then we store the translation in the previous
1435 # holder's sub_translations array
1436 my $previous_holder = $save_holders[$#save_holders];
1437 push @{$previous_holder->{'sub_translations'}}, $translation;
1438 # We also need to restore the @paragraph array, as
1439 # it was before we encountered the holder.
1440 @paragraph = @{$previous_holder->{'paragraph'}};
1441 }
1442 }
1443 }
1444 push @paragraph, @text;
1445 }
1446
1447 # Next tag
1448 ($eof,@text)=$self->get_string_until('<',{remove=>1});
1449 if ($#text > 0) {
1450 # Check if text (extracted after the inline tag)
1451 # has to be translated
1452 push @paragraph, @text;
1453 }
1454 }
1455
1456 # This strips the extracted strings
1457 # (only if you don't specify the 'nostrip' option, and if the
1458 # paragraph can be re-wrapped)
1459 $translate = $self->get_translate_options($self->get_path);
1460 if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
1461 my $clean = 0;
1462 # Clean the beginning
1463 while (!$clean and $#paragraph > 0) {
1464 $paragraph[0] =~ /^(\s*)(.*)/s;
1465 my $match = $1;
1466 if ($paragraph[0] eq $match) {
1467 if ($match ne "") {
1468 $self->pushline($match);
1469 }
1470 shift @paragraph;
1471 shift @paragraph;
1472 } else {
1473 $paragraph[0] = $2;
1474 if ($match ne "") {
1475 $self->pushline($match);
1476 }
1477 $clean = 1;
1478 }
1479 }
1480 $clean = 0;
1481 # Clean the end
1482 while (!$clean and $#paragraph > 0) {
1483 $paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
1484 my $match = $2;
1485 if ($paragraph[$#paragraph-1] eq $match) {
1486 if ($match ne "") {
1487 $blank = $match.$blank;
1488 }
1489 pop @paragraph;
1490 pop @paragraph;
1491 } else {
1492 $paragraph[$#paragraph-1] = $1;
1493 if ($match ne "") {
1494 $blank = $match.$blank;
1495 }
1496 $clean = 1;
1497 }
1498 }
1499 }
1500
1501 # Translate the string when needed
1502 # This will either push the translation in the translated document or
1503 # in the current holder translation.
1504 $self->translate_paragraph(@paragraph);
1505
1506 # Push the trailing blanks
1507 if ($blank ne "") {
1508 $self->pushline($blank);
1509 }
1510 return $eof;
1511 }
1512
1513 # Translate a @paragraph array of (string, reference).
1514 # The $translate argument indicates if the strings must be translated or
1515 # just pushed
1516 sub translate_paragraph {
1517 my $self = shift;
1518 my @paragraph = @_;
1519 my $translate = $self->get_translate_options($self->get_path);
1520
1521 while ( (scalar @paragraph)
1522 and ($paragraph[0] =~ m/^\s*\n/s)) {
1523 $self->pushline($paragraph[0]);
1524 shift @paragraph;
1525 shift @paragraph;
1526 }
1527
1528 my $comments;
1529 while (@comments) {
1530 my ($comment,$eoc);
1531 do {
1532 my ($t,$l) = (shift @comments, shift @comments);
1533 $t =~ s/\n?(\0)?$//;
1534 $eoc = $1;
1535 $comment .= "\n" if defined $comment;
1536 $comment .= $t;
1537 } until ($eoc);
1538 $comments .= "\n" if defined $comments;
1539 $comments .= $comment;
1540 $self->pushline("<!--".$comment."-->\n") if defined $comment;
1541 }
1542 @comments = ();
1543
1544 if ($self->{options}{'cpp'}) {
1545 my @tmp = @paragraph;
1546 @paragraph = ();
1547 while (@tmp) {
1548 my ($t,$l) = (shift @tmp, shift @tmp);
1549 # #include can be followed by a filename between
1550 # <> brackets. In that case, the argument won't be
1551 # handled in the same call to translate_paragraph.
1552 # Thus do not try to match "include ".
1553 if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
1554 if (@paragraph) {
1555 $self->translate_paragraph(@paragraph);
1556 @paragraph = ();
1557 $self->pushline("\n");
1558 }
1559 $self->pushline($t);
1560 } else {
1561 push @paragraph, ($t,$l);
1562 }
1563 }
1564 }
1565
1566 my $para = $self->join_lines(@paragraph);
1567 if ( length($para) > 0 ) {
1568 if ($translate ne "") {
1569 # This tag should be translated
1570 $self->pushline($self->found_string(
1571 $para,
1572 $paragraph[1], {
1573 type=>"tag",
1574 tag_options=>$translate,
1575 comments=>$comments
1576 }));
1577 } else {
1578 # Inform that this tag isn't translated in debug mode
1579 print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
1580 if $self->debug();
1581 $self->pushline($self->recode_skipped_text($para));
1582 }
1583 }
1584 # Now the paragraph is fully translated.
1585 # If we have all the holders' translation, we can replace the
1586 # placeholders by their translations.
1587 # We must wait to have all the translations because the holders are
1588 # numbered.
1589 {
1590 my $holder = $save_holders[$#save_holders];
1591 my $translation = $holder->{'translation'};
1592
1593 # Count the number of <placeholder ...> in $translation
1594 my $count = 0;
1595 my $str = $translation;
1596 while ( (defined $str)
1597 and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
1598 $count += 1;
1599 $str = $2;
1600 if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
1601 $count = -1;
1602 last;
1603 }
1604 }
1605
1606 if ( (defined $translation)
1607 and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
1608 # OK, all the holders of the current paragraph are
1609 # closed (and translated).
1610 # Replace them by their translation.
1611 while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
1612 # FIXME: we could also check that
1613 # * the holder exists
1614 # * all the holders are used
1615 $translation = $1.$holder->{'sub_translations'}->[$2].$3;
1616 }
1617 # We have our translation
1618 $holder->{'translation'} = $translation;
1619 # And there is no need for any holder in it.
1620 my @sub_translations = ();
1621 $holder->{'sub_translations'} = \@sub_translations;
1622 }
1623 }
1624
1625 }
1626
1627
1628
1629 =head2 WORKING WITH THE MODULE OPTIONS
1630
1631 =over 4
1632
1633 =item treat_options()
1634
1635 This function fills the internal structures that contain the tags, attributes
1636 and inline data with the options of the module (specified in the command-line
1637 or in the initialize function).
1638
1639 =back
1640
1641 =cut
1642
1643 sub treat_options {
1644 my $self = shift;
1645
1646 if ($self->{options}{'caseinsensitive'}) {
1647 $self->{options}{'nodefault'} = lc $self->{options}{'nodefault'};
1648 $self->{options}{'tags'} = lc $self->{options}{'tags'};
1649 $self->{options}{'break'} = lc $self->{options}{'break'};
1650 $self->{options}{'_default_break'} = lc $self->{options}{'_default_break'};
1651 $self->{options}{'translated'} = lc $self->{options}{'translated'};
1652 $self->{options}{'_default_translated'} = lc $self->{options}{'_default_translated'};
1653 $self->{options}{'untranslated'} = lc $self->{options}{'untranslated'};
1654 $self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'};
1655 $self->{options}{'attributes'} = lc $self->{options}{'attributes'};
1656 $self->{options}{'_default_attributes'} = lc $self->{options}{'_default_attributes'};
1657 $self->{options}{'inline'} = lc $self->{options}{'inline'};
1658 $self->{options}{'_default_inline'} = lc $self->{options}{'_default_inline'};
1659 $self->{options}{'placeholder'} = lc $self->{options}{'placeholder'};
1660 $self->{options}{'_default_placeholder'} = lc $self->{options}{'_default_placeholder'};
1661 }
1662
1663 $self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s;
1664 my %list_nodefault;
1665 foreach (split(/\s+/s,$1)) {
1666 $list_nodefault{$_} = 1;
1667 }
1668 $self->{nodefault} = \%list_nodefault;
1669
1670 $self->{options}{'tags'} =~ /^\s*(.*)\s*$/s;
1671 if (length $self->{options}{'tags'}) {
1672 warn wrap_mod("po4a::xml",
1673 dgettext("po4a",
1674 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
1675 }
1676 foreach (split(/\s+/s,$1)) {
1677 $_ =~ m/^(.*?)(<.*)$/;
1678 $self->{tags}->{$2} = $1 || "";
1679 }
1680
1681 if ($self->{options}{'tagsonly'}) {
1682 warn wrap_mod("po4a::xml",
1683 dgettext("po4a",
1684 "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
1685 }
1686
1687 $self->{options}{'break'} =~ /^\s*(.*)\s*$/s;
1688 foreach my $tag (split(/\s+/s,$1)) {
1689 $tag =~ m/^(.*?)(<.*)$/;
1690 $self->{break}->{$2} = $1 || "";
1691 }
1692 $self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s;
1693 foreach my $tag (split(/\s+/s,$1)) {
1694 $tag =~ m/^(.*?)(<.*)$/;
1695 $self->{break}->{$2} = $1 || ""
1696 unless $list_nodefault{$2}
1697 or defined $self->{break}->{$2};
1698 }
1699
1700 $self->{options}{'translated'} =~ /^\s*(.*)\s*$/s;
1701 foreach my $tag (split(/\s+/s,$1)) {
1702 $tag =~ m/^(.*?)(<.*)$/;
1703 $self->{translated}->{$2} = $1 || "";
1704 }
1705 $self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s;
1706 foreach my $tag (split(/\s+/s,$1)) {
1707 $tag =~ m/^(.*?)(<.*)$/;
1708 $self->{translated}->{$2} = $1 || ""
1709 unless $list_nodefault{$2}
1710 or defined $self->{translated}->{$2};
1711 }
1712
1713 $self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s;
1714 foreach my $tag (split(/\s+/s,$1)) {
1715 $tag =~ m/^(.*?)(<.*)$/;
1716 $self->{untranslated}->{$2} = $1 || "";
1717 }
1718 $self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s;
1719 foreach my $tag (split(/\s+/s,$1)) {
1720 $tag =~ m/^(.*?)(<.*)$/;
1721 $self->{untranslated}->{$2} = $1 || ""
1722 unless $list_nodefault{$2}
1723 or defined $self->{untranslated}->{$2};
1724 }
1725
1726 $self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s;
1727 foreach my $tag (split(/\s+/s,$1)) {
1728 if ($tag =~ m/^(.*?)(<.*)$/) {
1729 $self->{attributes}->{$2} = $1 || "";
1730 } else {
1731 $self->{attributes}->{$tag} = "";
1732 }
1733 }
1734 $self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s;
1735 foreach my $tag (split(/\s+/s,$1)) {
1736 if ($tag =~ m/^(.*?)(<.*)$/) {
1737 $self->{attributes}->{$2} = $1 || ""
1738 unless $list_nodefault{$2}
1739 or defined $self->{attributes}->{$2};
1740 } else {
1741 $self->{attributes}->{$tag} = ""
1742 unless $list_nodefault{$tag}
1743 or defined $self->{attributes}->{$tag};
1744 }
1745 }
1746
1747 my @list_inline;
1748 $self->{options}{'inline'} =~ /^\s*(.*)\s*$/s;
1749 foreach my $tag (split(/\s+/s,$1)) {
1750 $tag =~ m/^(.*?)(<.*)$/;
1751 $self->{inline}->{$2} = $1 || "";
1752 }
1753 $self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s;
1754 foreach my $tag (split(/\s+/s,$1)) {
1755 $tag =~ m/^(.*?)(<.*)$/;
1756 $self->{inline}->{$2} = $1 || ""
1757 unless $list_nodefault{$2}
1758 or defined $self->{inline}->{$2};
1759 }
1760
1761 $self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s;
1762 foreach my $tag (split(/\s+/s,$1)) {
1763 $tag =~ m/^(.*?)(<.*)$/;
1764 $self->{placeholder}->{$2} = $1 || "";
1765 }
1766 $self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s;
1767 foreach my $tag (split(/\s+/s,$1)) {
1768 $tag =~ m/^(.*?)(<.*)$/;
1769 $self->{placeholder}->{$2} = $1 || ""
1770 unless $list_nodefault{$2}
1771 or defined $self->{placeholder}->{$2};
1772 }
1773
1774 # There should be no translated and untranslated tags
1775 foreach my $tag (keys %{$self->{translated}}) {
1776 die wrap_mod("po4a::xml",
1777 dgettext("po4a",
1778 "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
1779 if defined $self->{untranslated}->{$tag};
1780 }
1781 # There should be no inline, break, and placeholder tags
1782 foreach my $tag (keys %{$self->{inline}}) {
1783 die wrap_mod("po4a::xml",
1784 dgettext("po4a",
1785 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
1786 if defined $self->{break}->{$tag};
1787 die wrap_mod("po4a::xml",
1788 dgettext("po4a",
1789 "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
1790 if defined $self->{placeholder}->{$tag};
1791 }
1792 foreach my $tag (keys %{$self->{break}}) {
1793 die wrap_mod("po4a::xml",
1794 dgettext("po4a",
1795 "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
1796 if defined $self->{placeholder}->{$tag};
1797 }
1798 }
1799
1800 =head2 GETTING TEXT FROM THE INPUT DOCUMENT
1801
1802 =over
1803
1804 =item get_string_until($%)
1805
1806 This function returns an array with the lines (and references) from the input
1807 document until it finds the first argument. The second argument is an options
1808 hash. Value 0 means disabled (the default) and 1, enabled.
1809
1810 The valid options are:
1811
1812 =over 4
1813
1814 =item include
1815
1816 This makes the returned array to contain the searched text
1817
1818 =item remove
1819
1820 This removes the returned stream from the input
1821
1822 =item unquoted
1823
1824 This ensures that the searched text is outside any quotes
1825
1826 =back
1827
1828 =cut
1829
1830 sub get_string_until {
1831 my ($self,$search) = (shift,shift);
1832 my $options = shift;
1833 my ($include,$remove,$unquoted, $regex) = (0,0,0,0);
1834
1835 if (defined($options->{include})) { $include = $options->{include}; }
1836 if (defined($options->{remove})) { $remove = $options->{remove}; }
1837 if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; }
1838 if (defined($options->{regex})) { $regex = $options->{regex}; }
1839
1840 my ($line,$ref) = $self->shiftline();
1841 my (@text,$paragraph);
1842 my ($eof,$found) = (0,0);
1843
1844 $search = "\Q$search\E" unless $regex;
1845 while (defined($line) and !$found) {
1846 push @text, ($line,$ref);
1847 $paragraph .= $line;
1848 if ($unquoted) {
1849 if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) {
1850 $found = 1;
1851 }
1852 } else {
1853 if ( $paragraph =~ /$search/s ) {
1854 $found = 1;
1855 }
1856 }
1857 if (!$found) {
1858 ($line,$ref)=$self->shiftline();
1859 }
1860 }
1861
1862 if (!defined($line)) { $eof = 1; }
1863
1864 if ( $found ) {
1865 $line = "";
1866 if($unquoted) {
1867 $paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
1868 $line = $1;
1869 $text[$#text-1] =~ s/\Q$line\E$//s;
1870 } else {
1871 $paragraph =~ /$search(.*)$/s;
1872 $line = $1;
1873 $text[$#text-1] =~ s/\Q$line\E$//s;
1874 }
1875 if(!$include) {
1876 $text[$#text-1] =~ /^(.*)($search.*)$/s;
1877 $text[$#text-1] = $1;
1878 $line = $2.$line;
1879 }
1880 if (defined($line) and ($line ne "")) {
1881 $self->unshiftline ($line,$text[$#text]);
1882 }
1883 }
1884 if (!$remove) {
1885 $self->unshiftline (@text);
1886 }
1887
1888 #If we get to the end of the file, we return the whole paragraph
1889 return ($eof,@text);
1890 }
1891
1892 =item skip_spaces(\@)
1893
1894 This function receives as argument the reference to a paragraph (in the format
1895 returned by get_string_until), skips his heading spaces and returns them as
1896 a simple string.
1897
1898 =cut
1899
1900 sub skip_spaces {
1901 my ($self,$pstring)=@_;
1902 my $space="";
1903
1904 while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) {
1905 if (@$pstring[0] ne "") {
1906 $space .= $1;
1907 @$pstring[0] = $2;
1908 }
1909
1910 if (@$pstring[0] eq "") {
1911 shift @$pstring;
1912 shift @$pstring;
1913 }
1914 }
1915 return $space;
1916 }
1917
1918 =item join_lines(@)
1919
1920 This function returns a simple string with the text from the argument array
1921 (discarding the references).
1922
1923 =cut
1924
1925 sub join_lines {
1926 my ($self,@lines)=@_;
1927 my ($line,$ref);
1928 my $text = "";
1929 while ($#lines > 0) {
1930 ($line,$ref) = (shift @lines,shift @lines);
1931 $text .= $line;
1932 }
1933 return $text;
1934 }
1935
1936 =back
1937
1938 =head1 STATUS OF THIS MODULE
1939
1940 This module can translate tags and attributes.
1941
1942 =head1 TODO LIST
1943
1944 DOCTYPE (ENTITIES)
1945
1946 There is a minimal support for the translation of entities. They are
1947 translated as a whole, and tags are not taken into account. Multilines
1948 entities are not supported and entities are always rewrapped during the
1949 translation.
1950
1951 MODIFY TAG TYPES FROM INHERITED MODULES
1952 (move the tag_types structure inside the $self hash?)
1953
1954 =head1 SEE ALSO
1955
1956 L<po4a(7)|po4a.7>, L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>.
1957
1958 =head1 AUTHORS
1959
1960 Jordi Vilalta <jvprat@gmail.com>
1961 Nicolas François <nicolas.francois@centraliens.net>
1962
1963 =head1 COPYRIGHT AND LICENSE
1964
1965 Copyright (c) 2004 by Jordi Vilalta <jvprat@gmail.com>
1966 Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
1967
1968 This program is free software; you may redistribute it and/or modify it
1969 under the terms of GPL (see the COPYING file).
1970
1971 =cut
1972
1973 1;