Mercurial > hgbook

diff tools/po4a/lib/Locale/Po4a/Xml.pm @ 722:082bb76417f1
Add Po4a 0.37-dev(2009-03-08)
author: Dongsheng Song <dongsheng.song@gmail.com>
date: Thu, 12 Mar 2009 15:43:56 +0800
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/po4a/lib/Locale/Po4a/Xml.pm	Thu Mar 12 15:43:56 2009 +0800
@@ -0,0 +1,1973 @@
+#!/usr/bin/perl
+
+# Po4a::Xml.pm 
+# 
+# extract and translate translatable strings from XML documents.
+# 
+# This code extracts plain text from tags and attributes from generic
+# XML documents, and it can be used as a base to build modules for
+# XML-based documents.
+#
+# Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
+# Copyright (c) 2008-2009 by Nicolas François  <nicolas.francois@centraliens.net>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+########################################################################
+
+=head1 NAME
+
+Locale::Po4a::Xml - Convert XML documents and derivates from/to PO files
+
+=head1 DESCRIPTION
+
+The po4a (po for anything) project goal is to ease translations (and more
+interestingly, the maintenance of translations) using gettext tools on
+areas where they were not expected like documentation.
+
+Locale::Po4a::Xml is a module to help the translation of XML documents into
+other [human] languages. It can also be used as a base to build modules for
+XML-based documents.
+
+=cut
+
+package Locale::Po4a::Xml;
+
+use 5.006;
+use strict;
+use warnings;
+
+require Exporter;
+use vars qw(@ISA @EXPORT);
+@ISA = qw(Locale::Po4a::TransTractor);
+@EXPORT = qw(new initialize @tag_types);
+
+use Locale::Po4a::TransTractor;
+use Locale::Po4a::Common;
+use Carp qw(croak);
+use File::Basename;
+use File::Spec;
+
+#It will mantain the path from the root tag to the current one
+my @path;
+
+#It will contain a list of external entities and their attached paths
+my %entities;
+
+my @comments;
+
+sub shiftline {
+    my $self = shift;
+    # call Transtractor's shiftline
+    my ($line,$ref) = $self->SUPER::shiftline();
+    return ($line,$ref) if (not defined $line);
+
+    for my $k (keys %entities) {
+        if ($line =~ m/^(.*?)&$k;(.*)$/s) {
+            my ($before, $after) = ($1, $2);
+            my $linenum=0;
+            my @textentries;
+
+            open (my $in, $entities{$k})
+                or croak wrap_mod("po4a::xml",
+                                  dgettext("po4a", "Can't read from %s: %s"),
+                                  $entities{$k}, $!);
+            while (defined (my $textline = <$in>)) {
+                $linenum++;
+                my $textref=$entities{$k}.":$linenum";
+                push @textentries, ($textline,$textref);
+            }
+            close $in
+                or croak wrap_mod("po4a::xml",
+                          dgettext("po4a", "Can't close %s after reading: %s"),
+                                  $entities{$k}, $!);
+
+            push @textentries, ($after, $ref);
+            $line = $before.(shift @textentries);
+            $ref .= " ".(shift @textentries);
+            $self->unshiftline(@textentries);
+        }
+    }
+
+    return ($line,$ref);
+}
+
+sub read {
+	my ($self,$filename)=@_;
+	push @{$self->{DOCPOD}{infile}}, $filename;
+	$self->Locale::Po4a::TransTractor::read($filename);
+}
+
+sub parse {
+	my $self=shift;
+	map {$self->parse_file($_)} @{$self->{DOCPOD}{infile}};
+}
+
+# @save_holders is a stack of references to ('paragraph', 'translation',
+# 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
+# paragraph         is a reference to an array (see paragraph in the
+#                   treat_content() subroutine) of strings followed by
+#                   references.  It contains the @paragraph array as it was
+#                   before the processing was interrupted by a tag instroducing
+#                   a placeholder.
+# translation       is the translation of this level up to now
+# sub_translations  is a reference to an array of strings containing the
+#                   translations which must replace the placeholders.
+# open              is the tag which opened the placeholder.
+# close             is the tag which closed the placeholder.
+# folded_attributes is an hash of tags with their attributes (<tag attrs=...>
+#                   strings), referenced by the folded tag id, which should
+#                   replace the <tag po4a-id=id> strings in the current
+#                   translation.
+#
+# If @save_holders only has 1 holder, then we are not processing the
+# content of an holder, we are translating the document.
+my @save_holders;
+
+
+# If we are at the bottom of the stack and there is no <placeholder ...> in
+# the current translation, we can push the translation in the translated
+# document.
+# Otherwise, we keep the translation in the current holder.
+sub pushline {
+	my ($self, $line) = (shift, shift);
+
+	my $holder = $save_holders[$#save_holders];
+	my $translation = $holder->{'translation'};
+	$translation .= $line;
+
+	while (    %{$holder->{folded_attributes}}
+	       and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
+		my $begin = $1;
+		my $tag = $2;
+		my $id = $3;
+		my $end = $4;
+		if (defined $holder->{folded_attributes}->{$id}) {
+			# TODO: check if the tag is the same
+			$translation = $begin.$holder->{folded_attributes}->{$id}.$end;
+			delete $holder->{folded_attributes}->{$id};
+		} else {
+			# TODO: It will be hard to identify the location.
+			#       => find a way to retrieve the reference.
+			die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id);
+		}
+	}
+# TODO: check that %folded_attributes is empty at some time
+# => in translate_paragraph?
+
+	if (   ($#save_holders > 0)
+	    or ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s)) {
+		$holder->{'translation'} = $translation;
+	} else {
+		$self->SUPER::pushline($translation);
+		$holder->{'translation'} = '';
+	}
+}
+
+=head1 TRANSLATING WITH PO4A::XML
+
+This module can be used directly to handle generic XML documents.  This will
+extract all tag's content, and no attributes, since it's where the text is
+written in most XML based documents.
+
+There are some options (described in the next section) that can customize
+this behavior.  If this doesn't fit to your document format you're encouraged
+to write your own module derived from this, to describe your format's details.
+See the section "Writing derivate modules" below, for the process description.
+
+=cut
+
+#
+# Parse file and translate it
+#
+sub parse_file {
+	my ($self,$filename) = @_;
+	my $eof = 0;
+
+	while (!$eof) {
+		# We get all the text until the next breaking tag (not
+		# inline) and translate it
+		$eof = $self->treat_content;
+		if (!$eof) {
+			# And then we treat the following breaking tag
+			$eof = $self->treat_tag;
+		}
+	}
+}
+
+=head1 OPTIONS ACCEPTED BY THIS MODULE
+
+The global debug option causes this module to show the excluded strings, in
+order to see if it skips something important.
+
+These are this module's particular options:
+
+=over 4
+
+=item B<nostrip>
+
+Prevents it to strip the spaces around the extracted strings.
+
+=item B<wrap>
+
+Canonizes the string to translate, considering that whitespaces are not
+important, and wraps the translated document. This option can be overridden
+by custom tag options. See the "tags" option below.
+
+=item B<caseinsensitive>
+
+It makes the tags and attributes searching to work in a case insensitive
+way.  If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.
+
+=item B<includeexternal>
+
+When defined, external entities are included in the generated (translated)
+document, and for the extraction of strings.  If it's not defined, you
+will have to translate external entities separately as independent
+documents.
+
+=item B<ontagerror>
+
+This option defines the behavior of the module when it encounter a invalid
+Xml syntax (a closing tag which does not match the last opening tag, or a
+tag's attribute without value).
+It can take the following values:
+
+=over
+
+=item I<fail>
+
+This is the default value.
+The module will exit with an error.
+
+=item I<warn>
+
+The module will continue, and will issue a warning.
+
+=item I<silent>
+
+The module will continue without any warnings.
+
+=back
+
+Be careful when using this option.
+It is generally recommended to fix the input file.
+
+=item B<tagsonly>
+
+Extracts only the specified tags in the "tags" option.  Otherwise, it
+will extract all the tags except the ones specified.
+
+Note: This option is deprecated.
+
+=item B<doctype>
+
+String that will try to match with the first line of the document's doctype
+(if defined). If it doesn't, a warning will indicate that the document
+might be of a bad type.
+
+=item B<tags>
+
+Space-separated list of tags you want to translate or skip.  By default,
+the specified tags will be excluded, but if you use the "tagsonly" option,
+the specified tags will be the only ones included.  The tags must be in the
+form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
+the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.
+
+You can also specify some tag options putting some characters in front of
+the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
+to override the default behavior specified by the global "wrap" option.
+
+Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
+
+Note: This option is deprecated.
+You should use the B<translated> and B<untranslated> options instead.
+
+=item B<attributes>
+
+Space-separated list of tag's attributes you want to translate.  You can
+specify the attributes by their name (for example, "lang"), but you can
+prefix it with a tag hierarchy, to specify that this attribute will only be
+translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
+specifies that the lang attribute will only be translated if it's into an
+E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.
+
+=item B<foldattributes>
+
+Do not translate attributes in inline tags.
+Instead, replace all attributes of a tag by po4a-id=<id>.
+
+This is useful when attributes shall not be translated, as this simplifies the
+strings for translators, and avoids typos.
+
+=item B<break>
+
+Space-separated list of tags which should break the sequence.
+By default, all tags break the sequence.
+
+The tags must be in the form <aaa>, but you can join some
+(<bbb><aaa>), if a tag (<aaa>) should only be considered 
+when it's into another tag (<bbb>).
+
+=item B<inline>
+
+Space-separated list of tags which should be treated as inline.
+By default, all tags break the sequence.
+
+The tags must be in the form <aaa>, but you can join some
+(<bbb><aaa>), if a tag (<aaa>) should only be considered 
+when it's into another tag (<bbb>).
+
+=item B<placeholder>
+
+Space-separated list of tags which should be treated as placeholders.
+Placeholders do not break the sequence, but the content of placeholders is
+translated separately.
+
+The location of the placeholder in its blocks will be marked with a string
+similar to:
+
+  <placeholder type=\"footnote\" id=\"0\"/>
+
+The tags must be in the form <aaa>, but you can join some
+(<bbb><aaa>), if a tag (<aaa>) should only be considered 
+when it's into another tag (<bbb>).
+
+=item B<nodefault>
+
+Space separated list of tags that the module should not try to set by
+default in any category.
+
+=item B<cpp>
+
+Support C preprocessor directives.
+When this option is set, po4a will consider preprocessor directives as
+paragraph separators.
+This is important if the XML file must be preprocessed because otherwise
+the directives may be inserted in the middle of lines if po4a consider it
+belong to the current paragraph, and they won't be recognized by the
+preprocessor.
+Note: the preprocessor directives must only appear between tags
+(they must not break a tag).
+
+=item B<translated>
+
+Space-separated list of tags you want to translate.
+
+The tags must be in the form <aaa>, but you can join some
+(<bbb><aaa>), if a tag (<aaa>) should only be considered 
+when it's into another tag (<bbb>).
+
+You can also specify some tag options putting some characters in front of
+the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
+to overide the default behavior specified by the global "wrap" option.
+
+Example: WE<lt>chapterE<gt>E<lt>titleE<gt>
+
+=item B<untranslated>
+
+Space-separated list of tags you do not want to translate.
+
+The tags must be in the form <aaa>, but you can join some
+(<bbb><aaa>), if a tag (<aaa>) should only be considered 
+when it's into another tag (<bbb>).
+
+=item B<defaulttranslateoption>
+
+The default categories for tags that are not in any of the translated,
+untranslated, break, inline, or placeholder.
+
+This is a set of letters:
+
+=over
+
+=item I<w>
+
+Tags should be translated and content can be re-wrapped.
+
+=item I<W>
+
+Tags should be translated and content should not be re-wrapped.
+
+=item I<i>
+
+Tags should be translated inline.
+
+=item I<p>
+
+Tags should be translated as placeholders.
+
+=back
+
+=back
+
+=cut
+# TODO: defaulttranslateoption
+# w => indicate that it is only valid for translatable tags and do not
+#      care about inline/break/placeholder?
+# ...
+
+sub initialize {
+	my $self = shift;
+	my %options = @_;
+
+	# Reset the path
+	@path = ();
+
+	# Initialize the stack of holders
+	my @paragraph = ();
+	my @sub_translations = ();
+	my %folded_attributes;
+	my %holder = ('paragraph' => \@paragraph,
+	              'translation' => "",
+	              'sub_translations' => \@sub_translations,
+	              'folded_attributes' => \%folded_attributes);
+	@save_holders = (\%holder);
+
+	$self->{options}{'nostrip'}=0;
+	$self->{options}{'wrap'}=0;
+	$self->{options}{'caseinsensitive'}=0;
+	$self->{options}{'tagsonly'}=0;
+	$self->{options}{'tags'}='';
+	$self->{options}{'break'}='';
+	$self->{options}{'translated'}='';
+	$self->{options}{'untranslated'}='';
+	$self->{options}{'defaulttranslateoption'}='';
+	$self->{options}{'attributes'}='';
+	$self->{options}{'foldattributes'}=0;
+	$self->{options}{'inline'}='';
+	$self->{options}{'placeholder'}='';
+	$self->{options}{'doctype'}='';
+	$self->{options}{'nodefault'}='';
+	$self->{options}{'includeexternal'}=0;
+	$self->{options}{'ontagerror'}="fail";
+	$self->{options}{'cpp'}=0;
+
+	$self->{options}{'verbose'}='';
+	$self->{options}{'debug'}='';
+
+	foreach my $opt (keys %options) {
+		if ($options{$opt}) {
+			die wrap_mod("po4a::xml",
+				dgettext("po4a", "Unknown option: %s"), $opt)
+				unless exists $self->{options}{$opt};
+			$self->{options}{$opt} = $options{$opt};
+		}
+	}
+	# Default options set by modules. Forbidden for users.
+	$self->{options}{'_default_translated'}='';
+	$self->{options}{'_default_untranslated'}='';
+	$self->{options}{'_default_break'}='';
+	$self->{options}{'_default_inline'}='';
+	$self->{options}{'_default_placeholder'}='';
+	$self->{options}{'_default_attributes'}='';
+
+	#It will maintain the list of the translatable tags
+	$self->{tags}=();
+	$self->{translated}=();
+	$self->{untranslated}=();
+	#It will maintain the list of the translatable attributes
+	$self->{attributes}=();
+	#It will maintain the list of the breaking tags
+	$self->{break}=();
+	#It will maintain the list of the inline tags
+	$self->{inline}=();
+	#It will maintain the list of the placeholder tags
+	$self->{placeholder}=();
+	#list of the tags that must not be set in the tags or inline category
+	#by this module or sub-module (unless specified in an option)
+	$self->{nodefault}=();
+
+	$self->treat_options;
+}
+
+=head1 WRITING DERIVATE MODULES
+
+=head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE
+
+The simplest customization is to define which tags and attributes you want
+the parser to translate.  This should be done in the initialize function.
+First you should call the main initialize, to get the command-line options,
+and then, append your custom definitions to the options hash.  If you want
+to treat some new options from command line, you should define them before
+calling the main initialize:
+
+  $self->{options}{'new_option'}='';
+  $self->SUPER::initialize(%options);
+  $self->{options}{'_default_translated'}.=' <p> <head><title>';
+  $self->{options}{'attributes'}.=' <p>lang id';
+  $self->{options}{'_default_inline'}.=' <br>';
+  $self->treat_options;
+
+You should use the B<_default_inline>, B<_default_break>,
+B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
+and B<_default_attributes> options in derivated modules. This allow users
+to override the default behavior defined in your module with command line
+options.
+
+=head2 OVERRIDING THE found_string FUNCTION
+
+Another simple step is to override the function "found_string", which
+receives the extracted strings from the parser, in order to translate them.
+There you can control which strings you want to translate, and perform
+transformations to them before or after the translation itself.
+
+It receives the extracted text, the reference on where it was, and a hash
+that contains extra information to control what strings to translate, how
+to translate them and to generate the comment.
+
+The content of these options depends on the kind of string it is (specified in an 
+entry of this hash):
+
+=over
+
+=item type="tag"
+
+The found string is the content of a translatable tag. The entry "tag_options"
+contains the option characters in front of the tag hierarchy in the module
+"tags" option.
+
+=item type="attribute"
+
+Means that the found string is the value of a translatable attribute. The
+entry "attribute" has the name of the attribute.
+
+=back
+
+It must return the text that will replace the original in the translated
+document. Here's a basic example of this function:
+
+  sub found_string {
+    my ($self,$text,$ref,$options)=@_;
+    $text = $self->translate($text,$ref,"type ".$options->{'type'},
+      'wrap'=>$self->{options}{'wrap'});
+    return $text;
+  }
+
+There's another simple example in the new Dia module, which only filters
+some strings.
+
+=cut
+
+sub found_string {
+	my ($self,$text,$ref,$options)=@_;
+
+	if ($text =~ m/^\s*$/s) {
+		return $text;
+	}
+
+	my $comment;
+	my $wrap = $self->{options}{'wrap'};
+
+	if ($options->{'type'} eq "tag") {
+		$comment = "Content of: ".$self->get_path;
+
+		if($options->{'tag_options'} =~ /w/) {
+			$wrap = 1;
+		}
+		if($options->{'tag_options'} =~ /W/) {
+			$wrap = 0;
+		}
+	} elsif ($options->{'type'} eq "attribute") {
+		$comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
+	} elsif ($options->{'type'} eq "CDATA") {
+		$comment = "CDATA";
+		$wrap = 0;
+	} else {
+		die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
+	}
+	$text = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
+	return $text;
+}
+
+=head2 MODIFYING TAG TYPES (TODO)
+
+This is a more complex one, but it enables a (almost) total customization.
+It's based in a list of hashes, each one defining a tag type's behavior. The
+list should be sorted so that the most general tags are after the most
+concrete ones (sorted first by the beginning and then by the end keys). To
+define a tag type you'll have to make a hash with the following keys:
+
+=over 4
+
+=item beginning
+
+Specifies the beginning of the tag, after the "E<lt>".
+
+=item end
+
+Specifies the end of the tag, before the "E<gt>".
+
+=item breaking
+
+It says if this is a breaking tag class.  A non-breaking (inline) tag is one
+that can be taken as part of the content of another tag.  It can take the
+values false (0), true (1) or undefined.  If you leave this undefined, you'll
+have to define the f_breaking function that will say whether a concrete tag of
+this class is a breaking tag or not.
+
+=item f_breaking
+
+It's a function that will tell if the next tag is a breaking one or not.  It
+should be defined if the "breaking" option is not.
+
+=item f_extract
+
+If you leave this key undefined, the generic extraction function will have to
+extract the tag itself.  It's useful for tags that can have other tags or
+special structures in them, so that the main parser doesn't get mad.  This
+function receives a boolean that says if the tag should be removed from the
+input stream or not.
+
+=item f_translate
+
+This function receives the tag (in the get_string_until() format) and returns
+the translated tag (translated attributes or all needed transformations) as a
+single string.
+
+=back
+
+=cut
+
+##### Generic XML tag types #####' 
+
+our @tag_types = ( 
+	{	beginning	=> "!--#",
+		end		=> "--",
+		breaking	=> 0,
+		f_extract	=> \&tag_extract_comment,
+		f_translate	=> \&tag_trans_comment},
+	{	beginning	=> "!--",
+		end		=> "--",
+		breaking	=> 0,
+		f_extract	=> \&tag_extract_comment,
+		f_translate	=> \&tag_trans_comment},
+	{	beginning	=> "?xml",
+		end		=> "?",
+		breaking	=> 1,
+		f_translate	=> \&tag_trans_xmlhead},
+	{	beginning	=> "?",
+		end		=> "?",
+		breaking	=> 1,
+		f_translate	=> \&tag_trans_procins},
+	{	beginning	=> "!DOCTYPE",
+		end		=> "",
+		breaking	=> 1,
+		f_extract	=> \&tag_extract_doctype,
+		f_translate	=> \&tag_trans_doctype},
+	{	beginning	=> "![CDATA[",
+		end		=> "",
+		breaking	=> 1,
+		f_extract	=> \&CDATA_extract,
+		f_translate	=> \&CDATA_trans},
+	{	beginning	=> "/",
+		end		=> "",
+		f_breaking	=> \&tag_break_close,
+		f_translate	=> \&tag_trans_close},
+	{	beginning	=> "",
+		end		=> "/",
+		f_breaking	=> \&tag_break_alone,
+		f_translate	=> \&tag_trans_alone},
+	{	beginning	=> "",
+		end		=> "",
+		f_breaking	=> \&tag_break_open,
+		f_translate	=> \&tag_trans_open}
+);
+
+sub tag_extract_comment {
+	my ($self,$remove)=(shift,shift);
+	my ($eof,@tag)=$self->get_string_until('-->',{include=>1,remove=>$remove});
+	return ($eof,@tag);
+}
+
+sub tag_trans_comment {
+	my ($self,@tag)=@_;
+	return $self->join_lines(@tag);
+}
+
+sub tag_trans_xmlhead {
+	my ($self,@tag)=@_;
+
+	# We don't have to translate anything from here: throw away references
+	my $tag = $self->join_lines(@tag);
+	$tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s;
+	my $in_charset=$3;
+	$self->detected_charset($in_charset);
+	my $out_charset=$self->get_out_charset;
+
+	if (defined $in_charset) {
+		$tag =~ s/$in_charset/$out_charset/;
+	} else {
+		if ($tag =~ m/standalone/) {
+			$tag =~ s/(standalone)/encoding="$out_charset" $1/;
+		} else {
+			$tag.= " encoding=\"$out_charset\"";
+		}
+	}
+
+	return $tag;
+}
+
+sub tag_trans_procins {
+	my ($self,@tag)=@_;
+	return $self->join_lines(@tag);
+}
+
+sub tag_extract_doctype {
+	my ($self,$remove)=(shift,shift);
+
+	# Check if there is an internal subset (between []).
+	my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});
+	my $parity = 0;
+	my $paragraph = "";
+	map { $parity = 1 - $parity; $paragraph.= $parity?$_:""; } @tag;
+	my $found = 0;
+	if ($paragraph =~ m/<.*\[.*</s) {
+		$found = 1
+	}
+
+	if (not $found) {
+		($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
+	} else {
+		($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
+	}
+	return ($eof,@tag);
+}
+
+sub tag_trans_doctype {
+# This check is not really reliable.  There are system and public
+# identifiers.  Only the public one could be checked reliably.
+	my ($self,@tag)=@_;
+	if (defined $self->{options}{'doctype'} ) {
+		my $doctype = $self->{options}{'doctype'};
+		if ( $tag[0] !~ /\Q$doctype\E/i ) {
+			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
+		}
+	}
+	my $i = 0;
+	my $basedir = $tag[1];
+	$basedir =~ s/:[0-9]+$//;
+	$basedir = dirname($basedir);
+
+	while ( $i < $#tag ) {
+		my $t = $tag[$i];
+		my $ref = $tag[$i+1];
+		if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
+			my $part1 = $1;
+			my $part2 = $2;
+			my $includenow = 0;
+			my $file = 0;
+			my $name = "";
+			if ($part2 =~ /^(%\s+)(.*)$/s ) {
+				$part1.= $1;
+				$part2 = $2;
+				$includenow = 1;
+			}
+			$part2 =~ /^(\S+)(\s+)(.*)$/s;
+			$name = $1;
+			$part1.= $1.$2;
+			$part2 = $3;
+			if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
+				$part1.= $1;
+				$part2 = $2;
+				$file = 1;
+				if ($self->{options}{'includeexternal'}) {
+					$entities{$name} = $part2;
+					$entities{$name} =~ s/^"?(.*?)".*$/$1/s;
+					$entities{$name} = File::Spec->catfile($basedir, $entities{$name});
+				}
+			}
+			if ((not $file) and (not $includenow)) {
+			    if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
+				my $comment = "Content of the $name entity";
+				my $quote = $1;
+				my $text = $2;
+				$part2 = $3;
+				$text = $self->translate($text,
+				                         $ref,
+				                         $comment,
+				                         'wrap'=>1);
+				$t = $part1."$quote$text$quote$part2";
+			    }
+			}
+#			print $part1."\n";
+#			print $name."\n";
+#			print $part2."\n";
+		}
+		$tag[$i] = $t;
+		$i += 2;
+	}
+	return $self->join_lines(@tag);
+}
+
+sub tag_break_close {
+	my ($self,@tag)=@_;
+	my $struct = $self->get_path;
+	my $options = $self->get_translate_options($struct);
+	if ($options =~ m/[ip]/) {
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
+sub tag_trans_close {
+	my ($self,@tag)=@_;
+	my $name = $self->get_tag_name(@tag);
+
+	my $test = pop @path;
+	if (!defined($test) || $test ne $name ) {
+		my $ontagerror = $self->{options}{'ontagerror'};
+		if ($ontagerror eq "warn") {
+			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
+		} elsif ($ontagerror ne "silent") {
+			die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
+		}
+	}
+	return $self->join_lines(@tag);
+}
+
+sub CDATA_extract {
+	my ($self,$remove)=(shift,shift);
+        my ($eof, @tag) = $self->get_string_until(']]>',{include=>1,unquoted=>0,remove=>$remove});
+
+	return ($eof, @tag);
+}
+
+sub CDATA_trans {
+	my ($self,@tag)=@_;
+	return $self->found_string($self->join_lines(@tag),
+	                           $tag[1],
+	                           {'type' => "CDATA"});
+}
+
+sub tag_break_alone {
+	my ($self,@tag)=@_;
+	my $struct = $self->get_path($self->get_tag_name(@tag));
+	if ($self->get_translate_options($struct) =~ m/i/) {
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
+sub tag_trans_alone {
+	my ($self,@tag)=@_;
+	my $name = $self->get_tag_name(@tag);
+	push @path, $name;
+
+	$name = $self->treat_attributes(@tag);
+
+	pop @path;
+	return $name;
+}
+
+sub tag_break_open {
+	my ($self,@tag)=@_;
+	my $struct = $self->get_path($self->get_tag_name(@tag));
+	my $options = $self->get_translate_options($struct);
+	if ($options =~ m/[ip]/) {
+		return 0;
+	} else {
+		return 1;
+	}
+}
+
+sub tag_trans_open {
+	my ($self,@tag)=@_;
+	my $name = $self->get_tag_name(@tag);
+	push @path, $name;
+
+	$name = $self->treat_attributes(@tag);
+
+	return $name;
+}
+
+##### END of Generic XML tag types #####
+
+=head1 INTERNAL FUNCTIONS used to write derivated parsers
+
+=head2 WORKING WITH TAGS
+
+=over 4
+
+=item get_path()
+
+This function returns the path to the current tag from the document's root,
+in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.
+
+An additional array of tags (without brackets) can be passed in argument.
+These path elements are added to the end of the current path.
+
+=cut
+
+sub get_path {
+	my $self = shift;
+	my @add = @_;
+	if ( @path > 0 or @add > 0 ) {
+		return "<".join("><",@path,@add).">";
+	} else {
+		return "outside any tag (error?)";
+	}
+}
+
+=item tag_type()
+
+This function returns the index from the tag_types list that fits to the next
+tag in the input stream, or -1 if it's at the end of the input file.
+
+=cut
+
+sub tag_type {
+	my $self = shift;
+	my ($line,$ref) = $self->shiftline();
+	my ($match1,$match2);
+	my $found = 0;
+	my $i = 0;
+
+	if (!defined($line)) { return -1; }
+
+	$self->unshiftline($line,$ref);
+	my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
+	my $line2 = $self->join_lines(@lines);
+	while (!$found && $i < @tag_types) {
+		($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
+		if ($line =~ /^<\Q$match1\E/) {
+			if (!defined($tag_types[$i]->{f_extract})) {
+#print substr($line2,length($line2)-1-length($match2),1+length($match2))."\n";
+				if (defined($line2) and $line2 =~ /\Q$match2\E>$/) {
+					$found = 1;
+#print "YES: <".$match1." ".$match2.">\n";
+				} else {
+#print "NO: <".$match1." ".$match2.">\n";
+					$i++;
+				}
+			} else {
+				$found = 1;
+			}
+		} else {
+			$i++;
+		}
+	}
+	if (!$found) {
+		#It should never enter here, unless you undefine the most
+		#general tags (as <...>)
+		die "po4a::xml: Unknown tag type: ".$line."\n";
+	} else {
+		return $i;
+	}
+}
+
+=item extract_tag($$)
+
+This function returns the next tag from the input stream without the beginning
+and end, in an array form, to maintain the references from the input file.  It
+has two parameters: the type of the tag (as returned by tag_type) and a
+boolean, that indicates if it should be removed from the input stream.
+
+=cut
+
+sub extract_tag {
+	my ($self,$type,$remove) = (shift,shift,shift);
+	my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
+	my ($eof,@tag);
+	if (defined($tag_types[$type]->{f_extract})) {
+		($eof,@tag) = &{$tag_types[$type]->{f_extract}}($self,$remove);
+	} else {
+		($eof,@tag) = $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});
+	}
+	$tag[0] =~ /^<\Q$match1\E(.*)$/s;
+	$tag[0] = $1;
+	$tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
+	$tag[$#tag-1] = $1;
+	return ($eof,@tag);
+}
+
+=item get_tag_name(@)
+
+This function returns the name of the tag passed as an argument, in the array
+form returned by extract_tag.
+
+=cut
+
+sub get_tag_name {
+	my ($self,@tag)=@_;
+	$tag[0] =~ /^(\S*)/;
+	return $1;
+}
+
+=item breaking_tag()
+
+This function returns a boolean that says if the next tag in the input stream
+is a breaking tag or not (inline tag).  It leaves the input stream intact.
+
+=cut
+
+sub breaking_tag {
+	my $self = shift;
+	my $break;
+
+	my $type = $self->tag_type;
+	if ($type == -1) { return 0; }
+
+#print "TAG TYPE = ".$type."\n";
+	$break = $tag_types[$type]->{breaking};
+	if (!defined($break)) {
+		# This tag's breaking depends on its content
+		my ($eof,@lines) = $self->extract_tag($type,0);
+		$break = &{$tag_types[$type]->{f_breaking}}($self,@lines);
+	}
+#print "break = ".$break."\n";
+	return $break;
+}
+
+=item treat_tag()
+
+This function translates the next tag from the input stream.  Using each
+tag type's custom translation functions.
+
+=cut
+
+sub treat_tag {
+	my $self = shift;
+	my $type = $self->tag_type;
+
+	my ($match1,$match2) = ($tag_types[$type]->{beginning},$tag_types[$type]->{end});
+	my ($eof,@lines) = $self->extract_tag($type,1);
+
+	$lines[0] =~ /^(\s*)(.*)$/s;
+	my $space1 = $1;
+	$lines[0] = $2;
+	$lines[$#lines-1] =~ /^(.*?)(\s*)$/s;
+	my $space2 = $2;
+	$lines[$#lines-1] = $1;
+
+	# Calling this tag type's specific handling (translation of
+	# attributes...)
+	my $line = &{$tag_types[$type]->{f_translate}}($self,@lines);
+	$self->pushline("<".$match1.$space1.$line.$space2.$match2.">");
+	return $eof;
+}
+
+=item tag_in_list($@)
+
+This function returns a string value that says if the first argument (a tag
+hierarchy) matches any of the tags from the second argument (a list of tags
+or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
+matched tag's options (the characters in front of the tag) or 1 (if that tag
+doesn't have options).
+
+=back
+
+=cut
+sub tag_in_list ($$$) {
+	my ($self,$path,$list) = @_;
+	if ($self->{options}{'caseinsensitive'}) {
+		$path = lc $path;
+	}
+
+	while (1) {
+		if (defined $list->{$path}) {
+			if (length $list->{$path}) {
+				return $list->{$path};
+			} else {
+				return 1;
+			}
+		}
+		last unless ($path =~ m/</);
+		$path =~ s/^<.*?>//;
+	} 
+
+	return 0;
+}
+
+=head2 WORKING WITH ATTRIBUTES
+
+=over 4
+
+=item treat_attributes(@)
+
+This function handles the translation of the tags' attributes. It receives the tag
+without the beginning / end marks, and then it finds the attributes, and it
+translates the translatable ones (specified by the module option "attributes").
+This returns a plain string with the translated tag.
+
+=back
+
+=cut
+
+sub treat_attributes {
+	my ($self,@tag)=@_;
+
+	$tag[0] =~ /^(\S*)(.*)/s;
+	my $text = $1;
+	$tag[0] = $2;
+
+	while (@tag) {
+		my $complete = 1;
+
+		$text .= $self->skip_spaces(\@tag);
+		if (@tag) {
+			# Get the attribute's name
+			$complete = 0;
+
+			$tag[0] =~ /^([^\s=]+)(.*)/s;
+			my $name = $1;
+			my $ref = $tag[1];
+			$tag[0] = $2;
+			$text .= $name;
+			$text .= $self->skip_spaces(\@tag);
+			if (@tag) {
+				# Get the '='
+				if ($tag[0] =~ /^=(.*)/s) {
+					$tag[0] = $1;
+					$text .= "=";
+					$text .= $self->skip_spaces(\@tag);
+					if (@tag) {
+						# Get the value
+						my $value="";
+						$ref=$tag[1];
+						my $quot=substr($tag[0],0,1);
+						if ($quot ne "\"" and $quot ne "'") {
+							# Unquoted value
+							$quot="";
+							$tag[0] =~ /^(\S+)(.*)/s;
+							$value = $1;
+							$tag[0] = $2;
+						} else {
+							# Quoted value
+							$text .= $quot;
+							$tag[0] =~ /^\Q$quot\E(.*)/s;
+							$tag[0] = $1;
+							while ($tag[0] !~ /\Q$quot\E/) {
+								$value .= $tag[0];
+								shift @tag;
+								shift @tag;
+							}
+							$tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
+							$value .= $1;
+							$tag[0] = $2;
+						}
+						$complete = 1;
+						if ($self->tag_in_list($self->get_path.$name,$self->{attributes})) {
+							$text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
+						} else {
+							print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
+							       if $self->debug();
+							$text .= $self->recode_skipped_text($value);
+						}
+						$text .= $quot;
+					}
+				}
+			}
+          
+			unless ($complete) {
+				my $ontagerror = $self->{options}{'ontagerror'};
+				if ($ontagerror eq "warn") {
+					warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax.  Continuing..."));
+				} elsif ($ontagerror ne "silent") {
+					die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
+				}
+			}
+		}
+	}
+	return $text;
+}
+
+# Returns an empty string if the content in the $path should not be
+# translated.
+#
+# Otherwise, returns the set of options for translation:
+#   w: the content shall be re-wrapped
+#   W: the content shall not be re-wrapped
+#   i: the tag shall be inlined
+#   p: a placeholder shall replace the tag (and its content)
+#
+# A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
+my %translate_options_cache;
+sub get_translate_options {
+	my $self = shift;
+	my $path = shift;
+
+	if (defined $translate_options_cache{$path}) {
+		return $translate_options_cache{$path};
+	}
+
+	my $options = "";
+	my $translate = 0;
+	my $usedefault = 1;
+
+	my $inlist = 0;
+	my $tag = $self->get_tag_from_list($path, $self->{tags});
+	if (defined $tag) {
+		$inlist = 1;
+	}
+	if ($self->{options}{'tagsonly'} eq $inlist) {
+		$usedefault = 0;
+		if (defined $tag) {
+			$options = $tag;
+			$options =~ s/<.*$//;
+		} else {
+			if ($self->{options}{'wrap'}) {
+				$options = "w";
+			} else {
+				$options = "W";
+			}
+		}
+		$translate = 1;
+	}
+
+# TODO: a less precise set of tags should not override a more precise one
+	# The tags and tagsonly options are deprecated.
+	# The translated and untranslated options have an higher priority.
+	$tag = $self->get_tag_from_list($path, $self->{translated});
+	if (defined $tag) {
+		$usedefault = 0;
+		$options = $tag;
+		$options =~ s/<.*$//;
+		$translate = 1;
+	}
+
+	if ($translate and $options !~ m/w/i) {
+		$options .= ($self->{options}{'wrap'})?"w":"W";
+	}
+
+	if (not defined $tag) {
+		$tag = $self->get_tag_from_list($path, $self->{untranslated});
+		if (defined $tag) {
+			$usedefault = 0;
+			$options = "";
+			$translate = 0;
+		}
+	}
+
+	$tag = $self->get_tag_from_list($path, $self->{inline});
+	if (defined $tag) {
+		$usedefault = 0;
+		$options .= "i";
+	} else {
+		$tag = $self->get_tag_from_list($path, $self->{placeholder});
+		if (defined $tag) {
+			$usedefault = 0;
+			$options .= "p";
+		}
+	}
+
+	if ($usedefault) {
+		$options = $self->{options}{'defaulttranslateoption'};
+	}
+
+	# A translatable inline tag in an untranslated tag is treated as a
+	# translatable breaking tag.
+	if ($options =~ m/i/) {
+		my $ppath = $path;
+		$ppath =~ s/<[^>]*>$//;
+		my $poptions = $self->get_translate_options ($ppath);
+		if ($poptions eq "") {
+			$options =~ s/i//;
+		}
+	}
+
+	if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
+		$options .= "f";
+	}
+
+	$translate_options_cache{$path} = $options;
+	return $options;
+}
+
+
+# Return the tag (or biggest set of tags) of a list which matches with the
+# given path.
+#
+# The tag (or set of tags) is returned with its options.
+#
+# If no tags could match the path, undef is returned.
+sub get_tag_from_list ($$$) {
+	my ($self,$path,$list) = @_;
+	if ($self->{options}{'caseinsensitive'}) {
+		$path = lc $path;
+	}
+
+	while (1) {
+		if (defined $list->{$path}) {
+			return $list->{$path}.$path;
+		}
+		last unless ($path =~ m/</);
+		$path =~ s/^<.*?>//;
+	}
+
+	return undef;
+}
+
+
+
+sub treat_content {
+	my $self = shift;
+	my $blank="";
+	# Indicates if the paragraph will have to be translated
+	my $translate = "";
+
+	my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});
+
+	while (!$eof and !$self->breaking_tag) {
+	NEXT_TAG:
+		my @text;
+		my $type = $self->tag_type;
+		my $f_extract = $tag_types[$type]->{'f_extract'};
+		if (    defined($f_extract)
+		    and $f_extract eq \&tag_extract_comment) {
+			# Remove the content of the comments
+			($eof, @text) = $self->extract_tag($type,1);
+			$text[$#text-1] .= "\0";
+			if ($tag_types[$type]->{'beginning'} eq "!--#") {
+				$text[0] = "#".$text[0];
+			}
+			push @comments, @text;
+		} else {
+			my ($tmpeof, @tag) = $self->extract_tag($type,0);
+			# Append the found inline tag
+			($eof,@text)=$self->get_string_until('>',
+			                                     {include=>1,
+			                                      remove=>1,
+			                                      unquoted=>1});
+			# Append or remove the opening/closing tag from
+			# the tag path
+			if ($tag_types[$type]->{'end'} eq "") {
+				if ($tag_types[$type]->{'beginning'} eq "") {
+					# Opening inline tag
+					my $cur_tag_name = $self->get_tag_name(@tag);
+					my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
+					if ($t_opts =~ m/p/) {
+						# We enter a new holder.
+						# Append a <placeholder ...> tag to the current
+						# paragraph, and save the @paragraph in the
+						# current holder.
+						my $last_holder = $save_holders[$#save_holders];
+						my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
+						push @paragraph, ($placeholder_str, $text[1]);
+						my @saved_paragraph = @paragraph;
+
+						$last_holder->{'paragraph'} = \@saved_paragraph;
+
+						# Then we must push a new holder
+						my @new_paragraph = ();
+						my @sub_translations = ();
+						my %folded_attributes;
+						my %new_holder = ('paragraph' => \@new_paragraph,
+						                  'open' => $text[0],
+						                  'translation' => "",
+						                  'close' => undef,
+						                  'sub_translations' => \@sub_translations,
+						                  'folded_attributes' => \%folded_attributes);
+						push @save_holders, \%new_holder;
+						@text = ();
+
+						# The current @paragraph
+						# (for the current holder)
+						# is empty.
+						@paragraph = ();
+					} elsif ($t_opts =~ m/f/) {
+						my $tag_full = $self->join_lines(@text);
+						my $tag_ref = $text[1];
+						if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
+							my $holder = $save_holders[$#save_holders];
+							my $id = 0;
+							foreach (keys %{$holder->{folded_attributes}}) {
+								$id = $_ + 1 if ($_ >= $id);
+							}
+							$holder->{folded_attributes}->{$id} = $tag_full;
+
+							@text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
+						}
+					}
+					push @path, $cur_tag_name;
+				} elsif ($tag_types[$type]->{'beginning'} eq "/") {
+					# Closing inline tag
+
+					# Check if this is closing the
+					# last opening tag we detected.
+					my $test = pop @path;
+					my $name = $self->get_tag_name(@tag);
+					if (!defined($test) ||
+					    $test ne $name ) {
+						my $ontagerror = $self->{options}{'ontagerror'};
+						if ($ontagerror eq "warn") {
+							warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
+						} elsif ($ontagerror ne "silent") {
+							die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
+						}
+					}
+
+					if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
+						# This closes the current holder.
+
+						push @path, $self->get_tag_name(@tag);
+						# Now translate this paragraph if needed.
+						# This will call pushline and append the
+						# translation to the current holder's translation.
+						$self->translate_paragraph(@paragraph);
+						pop @path;
+
+						# Now that this holder is closed, we can remove
+						# the holder from the stack.
+						my $holder = pop @save_holders;
+						# We need to keep the translation of this holder
+						my $translation = $holder->{'open'}.$holder->{'translation'}.$text[0];
+						# FIXME: @text could be multilines.
+
+						@text = ();
+
+						# Then we store the translation in the previous
+						# holder's sub_translations array
+						my $previous_holder = $save_holders[$#save_holders];
+						push @{$previous_holder->{'sub_translations'}}, $translation;
+						# We also need to restore the @paragraph array, as
+						# it was before we encountered the holder.
+						@paragraph = @{$previous_holder->{'paragraph'}};
+					}
+				}
+			}
+			push @paragraph, @text;
+		}
+
+		# Next tag
+		($eof,@text)=$self->get_string_until('<',{remove=>1});
+		if ($#text > 0) {
+			# Check if text (extracted after the inline tag)
+			# has to be translated
+			push @paragraph, @text;
+		}
+	}
+
+	# This strips the extracted strings
+	# (only if you don't specify the 'nostrip' option, and if the
+	# paragraph can be re-wrapped)
+	$translate = $self->get_translate_options($self->get_path);
+	if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
+		my $clean = 0;
+		# Clean the beginning
+		while (!$clean and $#paragraph > 0) {
+			$paragraph[0] =~ /^(\s*)(.*)/s;
+			my $match = $1;
+			if ($paragraph[0] eq $match) {
+				if ($match ne "") {
+					$self->pushline($match);
+				}
+				shift @paragraph;
+				shift @paragraph;
+			} else {
+				$paragraph[0] = $2;
+				if ($match ne "") {
+					$self->pushline($match);
+				}
+				$clean = 1;
+			}
+		}
+		$clean = 0;
+		# Clean the end
+		while (!$clean and $#paragraph > 0) {
+			$paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
+			my $match = $2;
+			if ($paragraph[$#paragraph-1] eq $match) {
+				if ($match ne "") {
+					$blank = $match.$blank;
+				}
+				pop @paragraph;
+				pop @paragraph;
+			} else {
+				$paragraph[$#paragraph-1] = $1;
+				if ($match ne "") {
+					$blank = $match.$blank;
+				}
+				$clean = 1;
+			}
+		}
+	}
+
+	# Translate the string when needed
+	# This will either push the translation in the translated document or
+	# in the current holder translation.
+	$self->translate_paragraph(@paragraph);
+
+	# Push the trailing blanks
+	if ($blank ne "") {
+		$self->pushline($blank);
+	}
+	return $eof;
+}
+
+# Translate a @paragraph array of (string, reference).
+# The $translate argument indicates if the strings must be translated or
+# just pushed
+sub translate_paragraph {
+	my $self = shift;
+	my @paragraph = @_;
+	my $translate = $self->get_translate_options($self->get_path);
+
+	while (    (scalar @paragraph)
+	       and ($paragraph[0] =~ m/^\s*\n/s)) {
+		$self->pushline($paragraph[0]);
+		shift @paragraph;
+		shift @paragraph;
+	}
+
+	my $comments;
+	while (@comments) {
+		my ($comment,$eoc);
+		do {
+			my ($t,$l) = (shift @comments, shift @comments);
+			$t =~ s/\n?(\0)?$//;
+			$eoc = $1;
+			$comment .= "\n" if defined $comment;
+			$comment .= $t;
+		} until ($eoc);
+		$comments .= "\n" if defined $comments;
+		$comments .= $comment;
+		$self->pushline("<!--".$comment."-->\n") if defined $comment;
+	}
+	@comments = ();
+
+	if ($self->{options}{'cpp'}) {
+		my @tmp = @paragraph;
+		@paragraph = ();
+		while (@tmp) {
+			my ($t,$l) = (shift @tmp, shift @tmp);
+			# #include can be followed by a filename between
+			# <> brackets. In that case, the argument won't be
+			# handled in the same call to translate_paragraph.
+			# Thus do not try to match "include ".
+			if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
+				if (@paragraph) {
+					$self->translate_paragraph(@paragraph);
+					@paragraph = ();
+					$self->pushline("\n");
+				}
+				$self->pushline($t);
+			} else {
+				push @paragraph, ($t,$l);
+			}
+		}
+	}
+
+	my $para = $self->join_lines(@paragraph);
+	if ( length($para) > 0 ) {
+		if ($translate ne "") {
+			# This tag should be translated
+			$self->pushline($self->found_string(
+				$para,
+				$paragraph[1], {
+					type=>"tag",
+					tag_options=>$translate,
+					comments=>$comments
+				}));
+		} else {
+			# Inform that this tag isn't translated in debug mode
+			print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
+			       if $self->debug();
+			$self->pushline($self->recode_skipped_text($para));
+		}
+	}
+	# Now the paragraph is fully translated.
+	# If we have all the holders' translation, we can replace the
+	# placeholders by their translations.
+	# We must wait to have all the translations because the holders are
+	# numbered.
+	{
+		my $holder = $save_holders[$#save_holders];
+		my $translation = $holder->{'translation'};
+
+		# Count the number of <placeholder ...> in $translation
+		my $count = 0;
+		my $str = $translation;
+		while (    (defined $str)
+		       and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
+			$count += 1;
+			$str = $2;
+			if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
+				$count = -1;
+				last;
+			}
+		}
+
+		if (    (defined $translation)
+		    and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
+			# OK, all the holders of the current paragraph are
+			# closed (and translated).
+			# Replace them by their translation.
+			while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
+				# FIXME: we could also check that
+				#          * the holder exists
+				#          * all the holders are used
+				$translation = $1.$holder->{'sub_translations'}->[$2].$3;
+			}
+			# We have our translation
+			$holder->{'translation'} = $translation;
+			# And there is no need for any holder in it.
+			my @sub_translations = ();
+			$holder->{'sub_translations'} = \@sub_translations;
+		}
+	}
+
+}
+
+
+
+=head2 WORKING WITH THE MODULE OPTIONS
+
+=over 4
+
+=item treat_options()
+
+This function fills the internal structures that contain the tags, attributes
+and inline data with the options of the module (specified in the command-line
+or in the initialize function).
+
+=back
+
+=cut
+
+sub treat_options {
+	my $self = shift;
+
+	if ($self->{options}{'caseinsensitive'}) {
+		$self->{options}{'nodefault'}             = lc $self->{options}{'nodefault'};
+		$self->{options}{'tags'}                  = lc $self->{options}{'tags'};
+		$self->{options}{'break'}                 = lc $self->{options}{'break'};
+		$self->{options}{'_default_break'}        = lc $self->{options}{'_default_break'};
+		$self->{options}{'translated'}            = lc $self->{options}{'translated'};
+		$self->{options}{'_default_translated'}   = lc $self->{options}{'_default_translated'};
+		$self->{options}{'untranslated'}          = lc $self->{options}{'untranslated'};
+		$self->{options}{'_default_untranslated'} = lc $self->{options}{'_default_untranslated'};
+		$self->{options}{'attributes'}            = lc $self->{options}{'attributes'};
+		$self->{options}{'_default_attributes'}   = lc $self->{options}{'_default_attributes'};
+		$self->{options}{'inline'}                = lc $self->{options}{'inline'};
+		$self->{options}{'_default_inline'}       = lc $self->{options}{'_default_inline'};
+		$self->{options}{'placeholder'}           = lc $self->{options}{'placeholder'};
+		$self->{options}{'_default_placeholder'}  = lc $self->{options}{'_default_placeholder'};
+	}
+
+	$self->{options}{'nodefault'} =~ /^\s*(.*)\s*$/s;
+	my %list_nodefault;
+	foreach (split(/\s+/s,$1)) {
+		$list_nodefault{$_} = 1;
+	}
+	$self->{nodefault} = \%list_nodefault;
+
+	$self->{options}{'tags'} =~ /^\s*(.*)\s*$/s;
+	if (length $self->{options}{'tags'}) {
+		warn wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
+	}
+	foreach (split(/\s+/s,$1)) {
+		$_ =~ m/^(.*?)(<.*)$/;
+		$self->{tags}->{$2} = $1 || "";
+	}
+
+	if ($self->{options}{'tagsonly'}) {
+		warn wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
+	}
+
+	$self->{options}{'break'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{break}->{$2} = $1 || "";
+	}
+	$self->{options}{'_default_break'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{break}->{$2} = $1 || ""
+			unless    $list_nodefault{$2}
+			       or defined $self->{break}->{$2};
+	}
+
+	$self->{options}{'translated'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{translated}->{$2} = $1 || "";
+	}
+	$self->{options}{'_default_translated'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{translated}->{$2} = $1 || ""
+			unless    $list_nodefault{$2}
+			       or defined $self->{translated}->{$2};
+	}
+
+	$self->{options}{'untranslated'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{untranslated}->{$2} = $1 || "";
+	}
+	$self->{options}{'_default_untranslated'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{untranslated}->{$2} = $1 || ""
+			unless    $list_nodefault{$2}
+			       or defined $self->{untranslated}->{$2};
+	}
+
+	$self->{options}{'attributes'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		if ($tag =~ m/^(.*?)(<.*)$/) {
+			$self->{attributes}->{$2} = $1 || "";
+		} else {
+			$self->{attributes}->{$tag} = "";
+		}
+	}
+	$self->{options}{'_default_attributes'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		if ($tag =~ m/^(.*?)(<.*)$/) {
+			$self->{attributes}->{$2} = $1 || ""
+				unless    $list_nodefault{$2}
+				       or defined $self->{attributes}->{$2};
+		} else {
+			$self->{attributes}->{$tag} = ""
+				unless    $list_nodefault{$tag}
+				       or defined $self->{attributes}->{$tag};
+		}
+	}
+
+	my @list_inline;
+	$self->{options}{'inline'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{inline}->{$2} = $1 || "";
+	}
+	$self->{options}{'_default_inline'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{inline}->{$2} = $1 || ""
+			unless    $list_nodefault{$2}
+			       or defined $self->{inline}->{$2};
+	}
+
+	$self->{options}{'placeholder'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{placeholder}->{$2} = $1 || "";
+	}
+	$self->{options}{'_default_placeholder'} =~ /^\s*(.*)\s*$/s;
+	foreach my $tag (split(/\s+/s,$1)) {
+		$tag =~ m/^(.*?)(<.*)$/;
+		$self->{placeholder}->{$2} = $1 || ""
+			unless    $list_nodefault{$2}
+			       or defined $self->{placeholder}->{$2};
+	}
+
+	# There should be no translated and untranslated tags
+	foreach my $tag (keys %{$self->{translated}}) {
+		die wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
+			if defined $self->{untranslated}->{$tag};
+	}
+	# There should be no inline, break, and placeholder tags
+	foreach my $tag (keys %{$self->{inline}}) {
+		die wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
+			if defined $self->{break}->{$tag};
+		die wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
+			if defined $self->{placeholder}->{$tag};
+	}
+	foreach my $tag (keys %{$self->{break}}) {
+		die wrap_mod("po4a::xml",
+		             dgettext("po4a",
+		                      "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
+			if defined $self->{placeholder}->{$tag};
+	}
+}
+
+=head2 GETTING TEXT FROM THE INPUT DOCUMENT
+
+=over
+
+=item get_string_until($%)
+
+This function returns an array with the lines (and references) from the input
+document until it finds the first argument.  The second argument is an options
+hash. Value 0 means disabled (the default) and 1, enabled.
+
+The valid options are:
+
+=over 4
+
+=item include
+
+This makes the returned array to contain the searched text
+
+=item remove
+
+This removes the returned stream from the input
+
+=item unquoted
+
+This ensures that the searched text is outside any quotes
+
+=back
+
+=cut
+
+sub get_string_until {
+	my ($self,$search) = (shift,shift);
+	my $options = shift;
+	my ($include,$remove,$unquoted, $regex) = (0,0,0,0);
+
+	if (defined($options->{include})) { $include = $options->{include}; }
+	if (defined($options->{remove})) { $remove = $options->{remove}; }
+	if (defined($options->{unquoted})) { $unquoted = $options->{unquoted}; }
+	if (defined($options->{regex})) { $regex = $options->{regex}; }
+
+	my ($line,$ref) = $self->shiftline();
+	my (@text,$paragraph);
+	my ($eof,$found) = (0,0);
+
+	$search = "\Q$search\E" unless $regex;
+	while (defined($line) and !$found) {
+		push @text, ($line,$ref);
+		$paragraph .= $line;
+		if ($unquoted) {
+			if ( $paragraph =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s ) {
+				$found = 1;
+			}
+		} else {
+			if ( $paragraph =~ /$search/s ) {
+				$found = 1;
+			}
+		}
+		if (!$found) {
+			($line,$ref)=$self->shiftline();
+		}
+	}
+
+	if (!defined($line)) { $eof = 1; }
+
+	if ( $found ) {
+		$line = "";
+		if($unquoted) {
+			$paragraph =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
+			$line = $1;
+			$text[$#text-1] =~ s/\Q$line\E$//s;
+		} else {
+			$paragraph =~ /$search(.*)$/s;
+			$line = $1;
+			$text[$#text-1] =~ s/\Q$line\E$//s;
+		}
+		if(!$include) {
+			$text[$#text-1] =~ /^(.*)($search.*)$/s;
+			$text[$#text-1] = $1;
+			$line = $2.$line;
+		}
+		if (defined($line) and ($line ne "")) {
+			$self->unshiftline ($line,$text[$#text]);
+		}
+	}
+	if (!$remove) {
+		$self->unshiftline (@text);
+	}
+
+	#If we get to the end of the file, we return the whole paragraph
+	return ($eof,@text);
+}
+
+=item skip_spaces(\@)
+
+This function receives as argument the reference to a paragraph (in the format
+returned by get_string_until), skips his heading spaces and returns them as
+a simple string.
+
+=cut
+
+sub skip_spaces {
+	my ($self,$pstring)=@_;
+	my $space="";
+
+	while (@$pstring and (@$pstring[0] =~ /^(\s+)(.*)$/s or @$pstring[0] eq "")) {
+		if (@$pstring[0] ne "") {
+			$space .= $1;
+			@$pstring[0] = $2;
+		}
+
+		if (@$pstring[0] eq "") {
+			shift @$pstring;
+			shift @$pstring;
+		}
+	}
+	return $space;
+}
+
+=item join_lines(@)
+
+This function returns a simple string with the text from the argument array
+(discarding the references).
+
+=cut
+
+sub join_lines {
+	my ($self,@lines)=@_;
+	my ($line,$ref);
+	my $text = "";
+	while ($#lines > 0) {
+		($line,$ref) = (shift @lines,shift @lines);
+		$text .= $line;
+	}
+	return $text;
+}
+
+=back
+
+=head1 STATUS OF THIS MODULE
+
+This module can translate tags and attributes.
+
+=head1 TODO LIST
+
+DOCTYPE (ENTITIES)
+
+There is a minimal support for the translation of entities. They are
+translated as a whole, and tags are not taken into account. Multilines
+entities are not supported and entities are always rewrapped during the
+translation.
+
+MODIFY TAG TYPES FROM INHERITED MODULES
+(move the tag_types structure inside the $self hash?)
+
+=head1 SEE ALSO
+
+L<po4a(7)|po4a.7>, L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>.
+
+=head1 AUTHORS
+
+ Jordi Vilalta <jvprat@gmail.com>
+ Nicolas François <nicolas.francois@centraliens.net>
+
+=head1 COPYRIGHT AND LICENSE
+
+ Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
+ Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>
+
+This program is free software; you may redistribute it and/or modify it
+under the terms of GPL (see the COPYING file).
+
+=cut
+
+1;
author	Dongsheng Song <dongsheng.song@gmail.com>
date	Thu, 12 Mar 2009 15:43:56 +0800
parents
children