Mercurial > hgbook

#!/usr/bin/perl

# Po4a::Xml.pm
#
# extract and translate translatable strings from XML documents.
#
# This code extracts plain text from tags and attributes from generic
# XML documents, and it can be used as a base to build modules for
# XML-based documents.
#
# Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
# Copyright (c) 2008-2009 by Nicolas François  <nicolas.francois@centraliens.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
########################################################################

=head1 NAME

Locale::Po4a::Xml - Convert XML documents and derivatives from/to PO files

=head1 DESCRIPTION

The po4a (po for anything) project goal is to ease translations (and more
interestingly, the maintenance of translations) using gettext tools on
areas where they were not expected like documentation.

Locale::Po4a::Xml is a module to help the translation of XML documents into
other [human] languages. It can also be used as a base to build modules for
XML-based documents.

=cut

package Locale::Po4a::Xml;

use 5.006;
use strict;
use warnings;

require Exporter;
use vars qw(@ISA @EXPORT);
@ISA = qw(Locale::Po4a::TransTractor);
@EXPORT = qw(new initialize @tag_types);

use Locale::Po4a::TransTractor;
use Locale::Po4a::Common;
use Carp qw(croak);
use File::Basename;
use File::Spec;

#It will maintain the path from the root tag to the current one
my @path;

#It will contain a list of external entities and their attached paths
my %entities;

my @comments;

# Return the next (line, reference) pair of the input, expanding references
# to the external entities registered in %entities.
#
# When the line contains "&name;" for a registered entity, the entity's file
# is read, its first line is appended to the text preceding the reference,
# and the remaining lines (followed by the text that came after the
# reference) are pushed back on the input so that later calls return them.
#
# Returns ($line, $ref); $line is undef at the end of the input.
# Croaks when an entity file cannot be opened or closed.
sub shiftline {
    my $self = shift;
    # call Transtractor's shiftline
    my ($line,$ref) = $self->SUPER::shiftline();
    return ($line,$ref) if (not defined $line);

    for my $k (keys %entities) {
        # The entity name comes from the document and may contain regex
        # metacharacters (e.g. '.'), so it must be matched literally.
        if ($line =~ m/^(.*?)&\Q$k\E;(.*)$/s) {
            my ($before, $after) = ($1, $2);
            my $linenum=0;
            my @textentries;

            # Three-argument open: the path is derived from the document's
            # DOCTYPE and must never be interpreted as an open mode or pipe.
            open (my $in, '<', $entities{$k})
                or croak(wrap_mod("po4a::xml",
                                  dgettext("po4a", "Can't read from %s: %s"),
                                  $entities{$k}, $!));
            while (defined (my $textline = <$in>)) {
                $linenum++;
                my $textref=$entities{$k}.":$linenum";
                push @textentries, ($textline,$textref);
            }
            close $in
                or croak(wrap_mod("po4a::xml",
                          dgettext("po4a", "Can't close %s after reading: %s"),
                                  $entities{$k}, $!));

            # The text following the reference keeps the original reference.
            push @textentries, ($after, $ref);
            # The first queued entry is merged into the current line ...
            $line = $before.(shift @textentries);
            $ref .= " ".(shift @textentries);
            # ... and the rest is given back to the input stream.
            $self->unshiftline(@textentries);
        }
    }

    return ($line,$ref);
}

# Record $filename in the list of input files used by parse(), then let
# TransTractor read it.
sub read {
	my $self = shift;
	my $filename = shift;

	push @{ $self->{DOCPOD}{infile} }, $filename;
	return $self->Locale::Po4a::TransTractor::read($filename);
}

# Run parse_file() on every input file recorded by read(), in order.
sub parse {
	my ($self) = @_;

	return map { $self->parse_file($_) } @{ $self->{DOCPOD}{infile} };
}

# @save_holders is a stack of references to ('paragraph', 'translation',
# 'sub_translations', 'open', 'close', 'folded_attributes') hashes, where:
# paragraph         is a reference to an array (see paragraph in the
#                   treat_content() subroutine) of strings followed by
#                   references.  It contains the @paragraph array as it was
#                   before the processing was interrupted by a tag introducing
#                   a placeholder.
# translation       is the translation of this level up to now
# sub_translations  is a reference to an array of strings containing the
#                   translations which must replace the placeholders.
# open              is the tag which opened the placeholder.
# close             is the tag which closed the placeholder.
# folded_attributes is a hash of tags with their attributes (<tag attrs=...>
#                   strings), referenced by the folded tag id, which should
#                   replace the <tag po4a-id=id> strings in the current
#                   translation.
#
# If @save_holders only has 1 holder, then we are not processing the
# content of a holder, we are translating the document.
my @save_holders;


# Append $line to the translation of the holder on top of @save_holders,
# restoring the folded attributes (<tag po4a-id=N> markers) it contains.
#
# The result is written to the translated document only when we are at the
# bottom of the holder stack and the accumulated translation contains no
# <placeholder .../> marker.  Otherwise it is kept in the current holder.
#
# Dies when a po4a-id found in the translation is not (or no longer) known.
sub pushline {
	my $self = shift;
	my $line = shift;

	my $holder = $save_holders[-1];
	my $translation = $holder->{'translation'};
	$translation .= $line;

	# Each iteration replaces one folded tag by its original form.
	while (    %{$holder->{folded_attributes}}
	       and $translation =~ m/^(.*)<([^>]+?)\s+po4a-id=([0-9]+)>(.*)$/s) {
		my ($begin, $id, $end) = ($1, $3, $4);
		my $folded = $holder->{folded_attributes};

		# TODO: It will be hard to identify the location.
		#       => find a way to retrieve the reference.
		die wrap_mod("po4a::xml", dgettext("po4a", "'po4a-id=%d' in the translation does not exist in the original string (or 'po4a-id=%d' used twice in the translation)."), $id, $id)
			unless defined $folded->{$id};

		# TODO: check if the tag is the same
		$translation = $begin.$folded->{$id}.$end;
		delete $folded->{$id};
	}
# TODO: check that %folded_attributes is empty at some time
# => in translate_paragraph?

	my $keep_in_holder =
		   ($#save_holders > 0)
		|| ($translation =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s);

	if ($keep_in_holder) {
		$holder->{'translation'} = $translation;
	} else {
		$self->SUPER::pushline($translation);
		$holder->{'translation'} = '';
	}
}

=head1 TRANSLATING WITH PO4A::XML

This module can be used directly to handle generic XML documents.  This will
extract all tag's content, and no attributes, since it's where the text is
written in most XML based documents.

There are some options (described in the next section) that can customize
this behavior.  If this doesn't fit to your document format you're encouraged
to write your own module derived from this, to describe your format's details.
See the section "Writing derivate modules" below, for the process description.

=cut

#
# Parse the input and translate it.
#
# Alternates between treat_content (the text up to the next breaking tag)
# and treat_tag (that breaking tag) until one of them reports the end of
# the input.
#
sub parse_file {
	my ($self,$filename) = @_;

	while (1) {
		# Translate everything up to the next breaking (not inline) tag
		last if $self->treat_content;
		# Then handle the breaking tag itself
		last if $self->treat_tag;
	}
}

=head1 OPTIONS ACCEPTED BY THIS MODULE

The global debug option causes this module to show the excluded strings, in
order to see if it skips something important.

These are this module's particular options:

=over 4

=item B<nostrip>

Prevents the stripping of the spaces around the extracted strings.

=item B<wrap>

Canonizes the string to translate, considering that whitespaces are not
important, and wraps the translated document. This option can be overridden
by custom tag options. See the "tags" option below.

=item B<caseinsensitive>

It makes the tags and attributes searching to work in a case insensitive
way.  If it's defined, it will treat E<lt>BooKE<gt>laNG and E<lt>BOOKE<gt>Lang as E<lt>bookE<gt>lang.

=item B<includeexternal>

When defined, external entities are included in the generated (translated)
document, and for the extraction of strings.  If it's not defined, you
will have to translate external entities separately as independent
documents.

=item B<ontagerror>

This option defines the behavior of the module when it encounters invalid
XML syntax (a closing tag which does not match the last opening tag, or a
tag's attribute without value).
It can take the following values:

=over

=item I<fail>

This is the default value.
The module will exit with an error.

=item I<warn>

The module will continue, and will issue a warning.

=item I<silent>

The module will continue without any warnings.

=back

Be careful when using this option.
It is generally recommended to fix the input file.

=item B<tagsonly>

Extracts only the specified tags in the "tags" option.  Otherwise, it
will extract all the tags except the ones specified.

Note: This option is deprecated.

=item B<doctype>

String that will try to match with the first line of the document's doctype
(if defined). If it doesn't, a warning will indicate that the document
might be of a bad type.

=item B<tags>

Space-separated list of tags you want to translate or skip.  By default,
the specified tags will be excluded, but if you use the "tagsonly" option,
the specified tags will be the only ones included.  The tags must be in the
form E<lt>aaaE<gt>, but you can join some (E<lt>bbbE<gt>E<lt>aaaE<gt>) to say that the content of
the tag E<lt>aaaE<gt> will only be translated when it's into a E<lt>bbbE<gt> tag.

You can also specify some tag options putting some characters in front of
the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
to override the default behavior specified by the global "wrap" option.

Example: WE<lt>chapterE<gt>E<lt>titleE<gt>

Note: This option is deprecated.
You should use the B<translated> and B<untranslated> options instead.

=item B<attributes>

Space-separated list of tag's attributes you want to translate.  You can
specify the attributes by their name (for example, "lang"), but you can
prefix it with a tag hierarchy, to specify that this attribute will only be
translated when it's into the specified tag. For example: E<lt>bbbE<gt>E<lt>aaaE<gt>lang
specifies that the lang attribute will only be translated if it's into an
E<lt>aaaE<gt> tag, and it's into a E<lt>bbbE<gt> tag.

=item B<foldattributes>

Do not translate attributes in inline tags.
Instead, replace all attributes of a tag by po4a-id=<id>.

This is useful when attributes shall not be translated, as this simplifies the
strings for translators, and avoids typos.

=item B<break>

Space-separated list of tags which should break the sequence.
By default, all tags break the sequence.

The tags must be in the form <aaa>, but you can join some
(<bbb><aaa>), if a tag (<aaa>) should only be considered
when it's into another tag (<bbb>).

=item B<inline>

Space-separated list of tags which should be treated as inline.
By default, all tags break the sequence.

The tags must be in the form <aaa>, but you can join some
(<bbb><aaa>), if a tag (<aaa>) should only be considered
when it's into another tag (<bbb>).

=item B<placeholder>

Space-separated list of tags which should be treated as placeholders.
Placeholders do not break the sequence, but the content of placeholders is
translated separately.

The location of the placeholder in its blocks will be marked with a string
similar to:

  <placeholder type=\"footnote\" id=\"0\"/>

The tags must be in the form <aaa>, but you can join some
(<bbb><aaa>), if a tag (<aaa>) should only be considered
when it's into another tag (<bbb>).

=item B<nodefault>

Space separated list of tags that the module should not try to set by
default in any category.

=item B<cpp>

Support C preprocessor directives.
When this option is set, po4a will consider preprocessor directives as
paragraph separators.
This is important if the XML file must be preprocessed because otherwise
the directives may be inserted in the middle of lines if po4a considers that
they belong to the current paragraph, and they won't be recognized by the
preprocessor.
Note: the preprocessor directives must only appear between tags
(they must not break a tag).

=item B<translated>

Space-separated list of tags you want to translate.

The tags must be in the form <aaa>, but you can join some
(<bbb><aaa>), if a tag (<aaa>) should only be considered
when it's into another tag (<bbb>).

You can also specify some tag options putting some characters in front of
the tag hierarchy. For example, you can put 'w' (wrap) or 'W' (don't wrap)
to override the default behavior specified by the global "wrap" option.

Example: WE<lt>chapterE<gt>E<lt>titleE<gt>

=item B<untranslated>

Space-separated list of tags you do not want to translate.

The tags must be in the form <aaa>, but you can join some
(<bbb><aaa>), if a tag (<aaa>) should only be considered
when it's into another tag (<bbb>).

=item B<defaulttranslateoption>

The default categories for tags that are not in any of the translated,
untranslated, break, inline, or placeholder.

This is a set of letters:

=over

=item I<w>

Tags should be translated and content can be re-wrapped.

=item I<W>

Tags should be translated and content should not be re-wrapped.

=item I<i>

Tags should be translated inline.

=item I<p>

Tags should be translated as placeholders.

=back

=back

=cut
# TODO: defaulttranslateoption
# w => indicate that it is only valid for translatable tags and do not
#      care about inline/break/placeholder?
# ...

# Reset the parser state and set up the module options.
#
# %options holds the user-supplied options.  Only entries with a true value
# are taken into account; such an entry must name an option that already
# exists in $self->{options} (either a default set here or one defined
# beforehand by a derived module), otherwise the function dies.
sub initialize {
	my ($self, %options) = @_;

	# Reset the path
	@path = ();

	# Initialize the stack of holders with the document-level holder
	@save_holders = ({
		'paragraph'         => [],
		'translation'       => "",
		'sub_translations'  => [],
		'folded_attributes' => {},
	});

	# Default values of the user-visible options
	my %defaults = (
		'nostrip'                => 0,
		'wrap'                   => 0,
		'caseinsensitive'        => 0,
		'tagsonly'               => 0,
		'tags'                   => '',
		'break'                  => '',
		'translated'             => '',
		'untranslated'           => '',
		'defaulttranslateoption' => '',
		'attributes'             => '',
		'foldattributes'         => 0,
		'inline'                 => '',
		'placeholder'            => '',
		'doctype'                => '',
		'nodefault'              => '',
		'includeexternal'        => 0,
		'ontagerror'             => "fail",
		'cpp'                    => 0,

		'verbose'                => '',
		'debug'                  => '',
	);
	for my $name (keys %defaults) {
		$self->{options}{$name} = $defaults{$name};
	}

	for my $opt (keys %options) {
		next unless $options{$opt};
		exists $self->{options}{$opt}
			or die wrap_mod("po4a::xml",
				dgettext("po4a", "Unknown option: %s"), $opt);
		$self->{options}{$opt} = $options{$opt};
	}

	# Default options set by modules. Forbidden for users.
	# (They are created after the check above, so a user cannot pass them.)
	for my $category (qw(translated untranslated break inline placeholder attributes)) {
		$self->{options}{'_default_'.$category} = '';
	}

	# Lists maintained by treat_options: translatable tags (tags,
	# translated, untranslated), translatable attributes, breaking tags,
	# inline tags, placeholder tags, and the tags that must not be set in
	# the tags or inline category by this module or sub-module (unless
	# specified in an option).  They all start out undefined.
	for my $list (qw(tags translated untranslated attributes
	                 break inline placeholder nodefault)) {
		$self->{$list} = undef;
	}

	$self->treat_options;
}

=head1 WRITING DERIVATE MODULES

=head2 DEFINE WHAT TAGS AND ATTRIBUTES TO TRANSLATE

The simplest customization is to define which tags and attributes you want
the parser to translate.  This should be done in the initialize function.
First you should call the main initialize, to get the command-line options,
and then, append your custom definitions to the options hash.  If you want
to treat some new options from command line, you should define them before
calling the main initialize:

  $self->{options}{'new_option'}='';
  $self->SUPER::initialize(%options);
  $self->{options}{'_default_translated'}.=' <p> <head><title>';
  $self->{options}{'attributes'}.=' <p>lang id';
  $self->{options}{'_default_inline'}.=' <br>';
  $self->treat_options;

You should use the B<_default_inline>, B<_default_break>,
B<_default_placeholder>, B<_default_translated>, B<_default_untranslated>,
and B<_default_attributes> options in derived modules. This allows users
to override the default behavior defined in your module with command line
options.

=head2 OVERRIDING THE found_string FUNCTION

Another simple step is to override the function "found_string", which
receives the extracted strings from the parser, in order to translate them.
There you can control which strings you want to translate, and perform
transformations to them before or after the translation itself.

It receives the extracted text, the reference on where it was, and a hash
that contains extra information to control what strings to translate, how
to translate them and to generate the comment.

The content of these options depends on the kind of string it is (specified in an
entry of this hash):

=over

=item type="tag"

The found string is the content of a translatable tag. The entry "tag_options"
contains the option characters in front of the tag hierarchy in the module
"tags" option.

=item type="attribute"

Means that the found string is the value of a translatable attribute. The
entry "attribute" has the name of the attribute.

=back

It must return the text that will replace the original in the translated
document. Here's a basic example of this function:

  sub found_string {
    my ($self,$text,$ref,$options)=@_;
    $text = $self->translate($text,$ref,"type ".$options->{'type'},
      'wrap'=>$self->{options}{'wrap'});
    return $text;
  }

There's another simple example in the new Dia module, which only filters
some strings.

=cut

# Translate a string extracted by the parser.
#
#   $text     the extracted string
#   $ref      the reference of the string in the input
#   $options  hash reference; 'type' is "tag", "attribute" or "CDATA", and
#             the entries 'tag_options', 'attribute' and 'comments' are
#             read depending on the type
#
# Strings made only of whitespace are returned unchanged.  Dies on an
# unknown type.  Returns the translated string.
sub found_string {
	my ($self,$text,$ref,$options)=@_;

	return $text if $text =~ m/^\s*$/s;

	my $type = $options->{'type'};
	my $wrap = $self->{options}{'wrap'};
	my $comment;

	if ($type eq "tag") {
		$comment = "Content of: ".$self->get_path;

		# The per-tag options override the global wrap option, and
		# 'W' (don't wrap) wins over 'w' (wrap).
		$wrap = 1 if $options->{'tag_options'} =~ /w/;
		$wrap = 0 if $options->{'tag_options'} =~ /W/;
	} elsif ($type eq "attribute") {
		$comment = "Attribute '".$options->{'attribute'}."' of: ".$self->get_path;
	} elsif ($type eq "CDATA") {
		# CDATA sections are never re-wrapped
		$comment = "CDATA";
		$wrap = 0;
	} else {
		die wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Internal error: unknown type identifier '%s'."), $options->{'type'});
	}

	my $translated = $self->translate($text,$ref,$comment,'wrap'=>$wrap, comment => $options->{'comments'});
	return $translated;
}

=head2 MODIFYING TAG TYPES (TODO)

This is a more complex one, but it enables a (almost) total customization.
It's based in a list of hashes, each one defining a tag type's behavior. The
list should be sorted so that the most general tags are after the most
concrete ones (sorted first by the beginning and then by the end keys). To
define a tag type you'll have to make a hash with the following keys:

=over 4

=item beginning

Specifies the beginning of the tag, after the "E<lt>".

=item end

Specifies the end of the tag, before the "E<gt>".

=item breaking

It says if this is a breaking tag class.  A non-breaking (inline) tag is one
that can be taken as part of the content of another tag.  It can take the
values false (0), true (1) or undefined.  If you leave this undefined, you'll
have to define the f_breaking function that will say whether a concrete tag of
this class is a breaking tag or not.

=item f_breaking

It's a function that will tell if the next tag is a breaking one or not.  It
should be defined if the "breaking" option is not.

=item f_extract

If you leave this key undefined, the generic extraction function will have to
extract the tag itself.  It's useful for tags that can have other tags or
special structures in them, so that the main parser doesn't get mad.  This
function receives a boolean that says if the tag should be removed from the
input stream or not.

=item f_translate

This function receives the tag (in the get_string_until() format) and returns
the translated tag (translated attributes or all needed transformations) as a
single string.

=back

=cut

##### Generic XML tag types #####'

# The order matters: tag_type() returns the first entry that fits, so the
# most specific tag types must come before the most general ones.
our @tag_types = (
	# Comments starting with "<!--#"
	{
		beginning   => '!--#',
		end         => '--',
		breaking    => 0,
		f_extract   => \&tag_extract_comment,
		f_translate => \&tag_trans_comment,
	},
	# Other comments
	{
		beginning   => '!--',
		end         => '--',
		breaking    => 0,
		f_extract   => \&tag_extract_comment,
		f_translate => \&tag_trans_comment,
	},
	# XML declaration
	{
		beginning   => '?xml',
		end         => '?',
		breaking    => 1,
		f_translate => \&tag_trans_xmlhead,
	},
	# Other processing instructions
	{
		beginning   => '?',
		end         => '?',
		breaking    => 1,
		f_translate => \&tag_trans_procins,
	},
	# Document type declaration
	{
		beginning   => '!DOCTYPE',
		end         => '',
		breaking    => 1,
		f_extract   => \&tag_extract_doctype,
		f_translate => \&tag_trans_doctype,
	},
	# CDATA section
	{
		beginning   => '![CDATA[',
		end         => '',
		breaking    => 1,
		f_extract   => \&CDATA_extract,
		f_translate => \&CDATA_trans,
	},
	# Closing tag
	{
		beginning   => '/',
		end         => '',
		f_breaking  => \&tag_break_close,
		f_translate => \&tag_trans_close,
	},
	# Stand-alone tag (ends with "/>")
	{
		beginning   => '',
		end         => '/',
		f_breaking  => \&tag_break_alone,
		f_translate => \&tag_trans_alone,
	},
	# Opening tag (the most general entry, hence the last one)
	{
		beginning   => '',
		end         => '',
		f_breaking  => \&tag_break_open,
		f_translate => \&tag_trans_open,
	},
);

# Extract a comment: everything up to and including the closing "-->".
# $remove is passed on to get_string_until as its 'remove' option.
# Returns what get_string_until returned: ($eof, @tag).
sub tag_extract_comment {
	my $self = shift;
	my $remove = shift;

	my %until_options = (include => 1, remove => $remove);
	my ($eof, @tag) = $self->get_string_until('-->', \%until_options);
	return ($eof, @tag);
}

# Comments are not translated: just join the extracted pieces.
sub tag_trans_comment {
	my $self = shift;
	my @tag = @_;

	return $self->join_lines(@tag);
}

# Handle the XML declaration (<?xml ... ?>).
#
# Nothing is translated here, but the declared encoding is reported through
# detected_charset() and replaced by the output charset.  When the
# declaration has no encoding, one is inserted (before "standalone" if that
# attribute is present, at the end otherwise).
#
# @tag is the declaration in the get_string_until() format.
# Returns the updated declaration as a single string.
sub tag_trans_xmlhead {
	my ($self,@tag)=@_;

	# We don't have to translate anything from here: throw away references
	my $tag = $self->join_lines(@tag);

	# Only trust the capture when the match succeeded: after a failed
	# match, $3 would still hold the value left by an unrelated earlier
	# match.
	my $in_charset;
	if ($tag =~ /encoding=(("|')|)(.*?)(\s|\2)/s) {
		$in_charset = $3;
	}
	$self->detected_charset($in_charset);
	my $out_charset=$self->get_out_charset;

	if (defined $in_charset) {
		# The charset name is a literal string, not a pattern.
		$tag =~ s/\Q$in_charset\E/$out_charset/;
	} elsif ($tag =~ m/standalone/) {
		$tag =~ s/(standalone)/encoding="$out_charset" $1/;
	} else {
		$tag.= " encoding=\"$out_charset\"";
	}

	return $tag;
}

# Processing instructions are not translated: just join the extracted
# pieces.
sub tag_trans_procins {
	my $self = shift;
	my @tag = @_;

	return $self->join_lines(@tag);
}

# Extract a <!DOCTYPE ...> declaration.
#
# The input is first read up to the next unquoted '>' without the 'remove'
# option, to find out whether the declaration has an internal subset
# (between []).  The declaration is then extracted for real, up to "]>"
# when there is an internal subset and up to '>' otherwise.
#
# $remove is passed on as the 'remove' option of the second read.
# Returns what get_string_until returned: ($eof, @tag).
sub tag_extract_doctype {
	my ($self,$remove)=(shift,shift);

	# Check if there is an internal subset (between []).
	my ($eof,@tag)=$self->get_string_until('>',{include=>1,unquoted=>1});

	# @tag alternates strings and references: only look at the strings
	my $paragraph = "";
	for (my $i = 0; $i < @tag; $i += 2) {
		$paragraph .= $tag[$i];
	}

	if ($paragraph =~ m/<.*\[.*</s) {
		($eof,@tag)=$self->get_string_until(']\s*>',{include=>1,remove=>$remove,unquoted=>1,regex=>1});
	} else {
		($eof,@tag)=$self->get_string_until('>',{include=>1,remove=>$remove,unquoted=>1});
	}
	return ($eof,@tag);
}

sub tag_trans_doctype {
# Handle a <!DOCTYPE ...> declaration (@tag is in the get_string_until()
# format: strings alternating with their references).
#
# - Warns when the first string of the declaration does not contain the
#   value of the "doctype" option (case insensitive, literal match).
# - Translates the quoted value of the internal entities declared with
#   <!ENTITY name "value"> (parameter entities, marked by '%', and SYSTEM
#   entities are not translated).
# - When the "includeexternal" option is set, records the file of each
#   SYSTEM entity in %entities, relative to the directory of the file named
#   in the first reference.
#
# Returns the declaration, with the translated entities, as a single string.
#
# This check is not really reliable.  There are system and public
# identifiers.  Only the public one could be checked reliably.
	my ($self,@tag)=@_;
	if (defined $self->{options}{'doctype'} ) {
		my $doctype = $self->{options}{'doctype'};
		if ( $tag[0] !~ /\Q$doctype\E/i ) {
			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Bad document type. '%s' expected. You can fix this warning with a -o doctype option, or ignore this check with -o doctype=\"\"."), $doctype);
		}
	}
	my $i = 0;
	# The first reference has the form "file:line": strip the line number
	# and keep the directory of the file.
	my $basedir = $tag[1];
	$basedir =~ s/:[0-9]+$//;
	$basedir = dirname($basedir);

	# Walk the (string, reference) pairs
	while ( $i < $#tag ) {
		my $t = $tag[$i];
		my $ref = $tag[$i+1];
		if ( $t =~ /^(\s*<!ENTITY\s+)(.*)$/is ) {
			# $part1 accumulates what has been parsed so far, $part2 is
			# what remains to be parsed.
			my $part1 = $1;
			my $part2 = $2;
			my $includenow = 0;
			my $file = 0;
			my $name = "";
			# A leading '%' marks a parameter entity
			if ($part2 =~ /^(%\s+)(.*)$/s ) {
				$part1.= $1;
				$part2 = $2;
				$includenow = 1;
			}
			# Entity name, then the rest of the declaration.
			# NOTE(review): this match is not checked; if it fails, $1..$3
			# keep the values of the previous successful match.
			$part2 =~ /^(\S+)(\s+)(.*)$/s;
			$name = $1;
			$part1.= $1.$2;
			$part2 = $3;
			if ( $part2 =~ /^(SYSTEM\s+)(.*)$/is ) {
				$part1.= $1;
				$part2 = $2;
				$file = 1;
				if ($self->{options}{'includeexternal'}) {
					# Keep the text up to the first double quote (after an
					# optional opening one) as the file name.
					# NOTE(review): single-quoted system identifiers are
					# not handled by this substitution.
					$entities{$name} = $part2;
					$entities{$name} =~ s/^"?(.*?)".*$/$1/s;
					$entities{$name} = File::Spec->catfile($basedir, $entities{$name});
				}
			}
			# Internal general entity: translate its quoted value
			if ((not $file) and (not $includenow)) {
			    if ($part2 =~ m/^\s*(["'])(.*)\1(\s*>.*)$/s) {
				my $comment = "Content of the $name entity";
				my $quote = $1;
				my $text = $2;
				$part2 = $3;
				$text = $self->translate($text,
				                         $ref,
				                         $comment,
				                         'wrap'=>1);
				$t = $part1."$quote$text$quote$part2";
			    }
			}
#			print $part1."\n";
#			print $name."\n";
#			print $part2."\n";
		}
		$tag[$i] = $t;
		$i += 2;
	}
	return $self->join_lines(@tag);
}

# Tell whether a closing tag is a breaking tag.
# Returns 0 when the translate options of the current path contain 'i' or
# 'p', and 1 otherwise.
sub tag_break_close {
	my ($self,@tag)=@_;

	my $struct = $self->get_path;
	my $options = $self->get_translate_options($struct);
	return ($options =~ m/[ip]/) ? 0 : 1;
}

# Handle a closing tag: remove the last element of the path and check that
# it is the tag being closed.
#
# On a mismatch the behavior depends on the "ontagerror" option: "warn"
# issues a warning, "silent" does nothing, and any other value dies.
# Returns the joined tag.
sub tag_trans_close {
	my ($self,@tag)=@_;
	my $name = $self->get_tag_name(@tag);

	my $opened = pop @path;
	unless (defined($opened) && $opened eq $name) {
		my $ontagerror = $self->{options}{'ontagerror'};
		if ($ontagerror eq "warn") {
			warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
		} elsif ($ontagerror ne "silent") {
			die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
		}
	}

	return $self->join_lines(@tag);
}

# Extract a CDATA section: everything up to and including the closing
# "]]>" (searched with the 'unquoted' option set to 0).
# $remove is passed on to get_string_until as its 'remove' option.
# Returns what get_string_until returned: ($eof, @tag).
sub CDATA_extract {
	my $self = shift;
	my $remove = shift;

	my %until_options = (include => 1, unquoted => 0, remove => $remove);
	my ($eof, @tag) = $self->get_string_until(']]>', \%until_options);

	return ($eof, @tag);
}

# Translate a CDATA section: its joined content is handed to found_string
# with the type "CDATA" and the first reference of the section.
sub CDATA_trans {
	my ($self,@tag)=@_;

	my %string_info = ('type' => "CDATA");
	return $self->found_string($self->join_lines(@tag),
	                           $tag[1],
	                           \%string_info);
}

# Tell whether a stand-alone tag (<tag/>) is a breaking tag.
# Returns 0 when the translate options of the path extended with this tag
# contain 'i', and 1 otherwise.
sub tag_break_alone {
	my ($self,@tag)=@_;

	my $struct = $self->get_path($self->get_tag_name(@tag));
	return ($self->get_translate_options($struct) =~ m/i/) ? 0 : 1;
}

# Translate a stand-alone tag (<tag/>): the tag is added to the path only
# while its attributes are handled.
# Returns the tag as returned by treat_attributes.
sub tag_trans_alone {
	my ($self,@tag)=@_;

	my $tag_name = $self->get_tag_name(@tag);
	push @path, $tag_name;

	my $translated = $self->treat_attributes(@tag);

	pop @path;
	return $translated;
}

# Tell whether an opening tag is a breaking tag.
# Returns 0 when the translate options of the path extended with this tag
# contain 'i' or 'p', and 1 otherwise.
sub tag_break_open {
	my ($self,@tag)=@_;

	my $struct = $self->get_path($self->get_tag_name(@tag));
	my $options = $self->get_translate_options($struct);
	return ($options =~ m/[ip]/) ? 0 : 1;
}

# Translate an opening tag: the tag is added to the path (tag_trans_close
# removes it again) and its attributes are handled.
# Returns the tag as returned by treat_attributes.
sub tag_trans_open {
	my ($self,@tag)=@_;

	my $tag_name = $self->get_tag_name(@tag);
	push @path, $tag_name;

	my $translated = $self->treat_attributes(@tag);
	return $translated;
}

##### END of Generic XML tag types #####

=head1 INTERNAL FUNCTIONS used to write derivated parsers

=head2 WORKING WITH TAGS

=over 4

=item get_path()

This function returns the path to the current tag from the document's root,
in the form E<lt>htmlE<gt>E<lt>bodyE<gt>E<lt>pE<gt>.

An additional array of tags (without brackets) can be passed in argument.
These path elements are added to the end of the current path.

=cut

sub get_path {
	my ($self, @add) = @_;

	# Current path, followed by the extra elements given by the caller
	my @elements = (@path, @add);

	return "outside any tag (error?)" unless @elements;
	return "<".join("><", @elements).">";
}

=item tag_type()

This function returns the index from the tag_types list that fits to the next
tag in the input stream, or -1 if it's at the end of the input file.

=cut

sub tag_type {
	my $self = shift;

	# Look at the next line, and give it back to the input stream
	my ($line,$ref) = $self->shiftline();
	return -1 unless defined($line);
	$self->unshiftline($line,$ref);

	# The input up to the next unquoted '>', as a single string
	my ($eof,@lines) = $self->get_string_until(">",{include=>1,unquoted=>1});
	my $line2 = $self->join_lines(@lines);

	# The first entry that fits wins.
	for my $i (0 .. $#tag_types) {
		my ($match1,$match2) = ($tag_types[$i]->{beginning},$tag_types[$i]->{end});
		next unless $line =~ /^<\Q$match1\E/;

		# Types with their own extraction function only need their
		# beginning to match.
		return $i if defined($tag_types[$i]->{f_extract});

		# The others must also end as expected.
		return $i if defined($line2) and $line2 =~ /\Q$match2\E>$/;
	}

	#It should never get here, unless you undefine the most
	#general tags (as <...>)
	die "po4a::xml: Unknown tag type: ".$line."\n";
}

=item extract_tag($$)

This function returns the next tag from the input stream without the beginning
and end, in an array form, to maintain the references from the input file.  It
has two parameters: the type of the tag (as returned by tag_type) and a
boolean, that indicates if it should be removed from the input stream.

=cut

sub extract_tag {
	my ($self,$type,$remove) = @_;
	my $match1 = $tag_types[$type]->{beginning};
	my $match2 = $tag_types[$type]->{end};

	# Use the tag type's own extraction function when it has one, and the
	# generic extraction (up to the end mark followed by '>') otherwise.
	my $extractor = $tag_types[$type]->{f_extract};
	my ($eof,@tag) = defined($extractor)
		? $extractor->($self,$remove)
		: $self->get_string_until($match2.">",{include=>1,remove=>$remove,unquoted=>1});

	# Strip the beginning mark from the first string ...
	$tag[0] =~ /^<\Q$match1\E(.*)$/s;
	$tag[0] = $1;
	# ... and the end mark from the last string (the very last element of
	# @tag is a reference).
	$tag[$#tag-1] =~ /^(.*)\Q$match2\E>$/s;
	$tag[$#tag-1] = $1;

	return ($eof,@tag);
}

=item get_tag_name(@)

This function returns the name of the tag passed as an argument, in the array
form returned by extract_tag.

=cut

sub get_tag_name {
	my ($self,@tag)=@_;

	# The name is the leading run of non-space characters of the first
	# string (it is empty when that string starts with a space).
	my ($name) = $tag[0] =~ /^(\S*)/;
	return $name;
}

=item breaking_tag()

This function returns a boolean that says if the next tag in the input stream
is a breaking tag or not (inline tag).  It leaves the input stream intact.

=cut

sub breaking_tag {
	my $self = shift;

	my $type = $self->tag_type;
	# End of the input: nothing can break
	return 0 if $type == -1;

	my $break = $tag_types[$type]->{breaking};
	return $break if defined($break);

	# This tag's breaking depends on its content: look at the tag
	# (without removing it from the input) and ask its type.
	my ($eof,@lines) = $self->extract_tag($type,0);
	$break = $tag_types[$type]->{f_breaking}->($self,@lines);
	return $break;
}

=item treat_tag()

This function translates the next tag from the input stream, using each
tag type's custom translation functions.

=cut

sub treat_tag {
	my $self = shift;
	my $type = $self->tag_type;
	my $tag_type = $tag_types[$type];

	# Consume the tag; the opening and closing marks are already stripped.
	my ($eof, @lines) = $self->extract_tag($type, 1);

	# Set aside the blanks at the very beginning of the tag content...
	my ($leading, $first) = $lines[0] =~ /^(\s*)(.*)$/s;
	$lines[0] = $first;

	# ... and those at the very end (the last text element is the one
	# before the last reference).
	my $last = $#lines - 1;
	my ($body, $trailing) = $lines[$last] =~ /^(.*?)(\s*)$/s;
	$lines[$last] = $body;

	# Let the tag type translate what needs to be (attributes...), then
	# rebuild the tag around the result with its original blanks.
	my $translated = $tag_type->{f_translate}->($self, @lines);
	$self->pushline(  "<" . $tag_type->{beginning}
	                . $leading . $translated . $trailing
	                . $tag_type->{end} . ">");
	return $eof;
}

=item tag_in_list($@)

This function returns a string value that says if the first argument (a tag
hierarchy) matches any of the tags from the second argument (a list of tags
or tag hierarchies). If it doesn't match, it returns 0. Else, it returns the
matched tag's options (the characters in front of the tag) or 1 (if that tag
doesn't have options).

=back

=cut
sub tag_in_list ($$$) {
	# $path: tag hierarchy such as "<a><b>", optionally followed by an
	#        attribute name.
	# $list: reference to a hash whose keys are tags (or hierarchies) and
	#        whose values are their options.
	my ($self,$path,$list) = @_;
	if ($self->{options}{'caseinsensitive'}) {
		$path = lc $path;
	}

	while (1) {
		if (defined $list->{$path}) {
			# An entry without options is reported as 1.
			return length($list->{$path}) ? $list->{$path} : 1;
		}
		# Try again without the outermost tag.  Stop as soon as no
		# leading "<...>" can be removed: only testing for the presence
		# of "<" would loop forever on paths such as "name<x", where
		# there is nothing to strip at the beginning.
		last unless ($path =~ s/^<.*?>//);
	}

	return 0;
}

=head2 WORKING WITH ATTRIBUTES

=over 4

=item treat_attributes(@)

This function handles the translation of the tags' attributes. It receives the tag
without the beginning / end marks, and then it finds the attributes, and it
translates the translatable ones (specified by the module option "attributes").
This returns a plain string with the translated tag.

=back

=cut

sub treat_attributes {
	# @tag is the tag content as (text, reference) pairs, without the
	# opening and closing marks.
	my ($self,@tag)=@_;

	# The tag name (everything up to the first blank) is copied as is.
	$tag[0] =~ /^(\S*)(.*)/s;
	my $text = $1;
	$tag[0] = $2;

	while (@tag) {
		my $complete = 1;

		$text .= $self->skip_spaces(\@tag);
		if (@tag) {
			# Get the attribute's name
			$complete = 0;

			my $ref = $tag[1];
			# The name may be missing (the text starts with '=').  In
			# that case nothing is consumed here and the attribute is
			# reported as malformed below; reading $1/$2 after a failed
			# match would otherwise reuse the captures of an older match.
			my $name = "";
			my $named = 0;
			if ($tag[0] =~ /^([^\s=]+)(.*)/s) {
				$name = $1;
				$tag[0] = $2;
				$named = 1;
			}
			$text .= $name;
			$text .= $self->skip_spaces(\@tag);
			if (@tag) {
				# Get the '='
				if ($tag[0] =~ /^=(.*)/s) {
					$tag[0] = $1;
					$text .= "=";
					$text .= $self->skip_spaces(\@tag);
					if (@tag) {
						# Get the value
						my $value="";
						my $closed = 1;
						$ref=$tag[1];
						my $quot=substr($tag[0],0,1);
						if ($quot ne "\"" and $quot ne "'") {
							# Unquoted value
							$quot="";
							$tag[0] =~ /^(\S+)(.*)/s;
							$value = $1;
							$tag[0] = $2;
						} else {
							# Quoted value
							$text .= $quot;
							$tag[0] =~ /^\Q$quot\E(.*)/s;
							$tag[0] = $1;
							# The value may span several lines.  Stop when
							# the input is exhausted: without this check
							# an unterminated value loops forever.
							while (@tag and $tag[0] !~ /\Q$quot\E/) {
								$value .= $tag[0];
								shift @tag;
								shift @tag;
							}
							if (@tag) {
								$tag[0] =~ /^(.*?)\Q$quot\E(.*)/s;
								$value .= $1;
								$tag[0] = $2;
							} else {
								# Closing quote never found
								$closed = 0;
							}
						}
						$complete = ($named and $closed) ? 1 : 0;
						# Only well-formed attributes are candidates
						# for translation.
						if (    $complete
						    and $self->tag_in_list($self->get_path.$name,$self->{attributes})) {
							$text .= $self->found_string($value, $ref, { type=>"attribute", attribute=>$name });
						} else {
							print wrap_ref_mod($ref, "po4a::xml", dgettext("po4a", "Content of attribute %s excluded: %s"), $self->get_path.$name, $value)
							       if $self->debug();
							$text .= $self->recode_skipped_text($value);
						}
						# Do not invent a closing quote the input lacks.
						$text .= $quot if $closed;
					}
				}
			}

			unless ($complete) {
				my $ontagerror = $self->{options}{'ontagerror'};
				if ($ontagerror eq "warn") {
					warn wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax.  Continuing..."));
				} elsif ($ontagerror ne "silent") {
					die wrap_ref_mod($ref, "po4a::xml", dgettext ("po4a", "Bad attribute syntax"));
				}
			}
		}
	}
	return $text;
}

# Returns an empty string if the content in the $path should not be
# translated.
#
# Otherwise, returns the set of options for translation:
#   w: the content shall be re-wrapped
#   W: the content shall not be re-wrapped
#   i: the tag shall be inlined
#   p: a placeholder shall replace the tag (and its content)
#   f: the attributes of the (inline) tag shall be folded; added when the
#      'foldattributes' option is set
#
# A translatable inline tag in an untranslated tag is treated as a translatable breaking tag.
#
# The result is cached per path in %translate_options_cache.
my %translate_options_cache;
sub get_translate_options {
	my $self = shift;
	my $path = shift;

	if (defined $translate_options_cache{$path}) {
		return $translate_options_cache{$path};
	}

	my $options = "";
	my $translate = 0;
	# Stays true as long as no category says anything about this path.
	my $usedefault = 1;

	my $inlist = 0;
	my $tag = $self->get_tag_from_list($path, $self->{tags});
	if (defined $tag) {
		$inlist = 1;
	}
	# Translate when 'tagsonly' and the membership in the 'tags' list
	# agree (both set or both unset).
	if ($self->{options}{'tagsonly'} eq $inlist) {
		$usedefault = 0;
		if (defined $tag) {
			# Keep only the options in front of the tag
			$options = $tag;
			$options =~ s/<.*$//;
		} else {
			if ($self->{options}{'wrap'}) {
				$options = "w";
			} else {
				$options = "W";
			}
		}
		$translate = 1;
	}

# TODO: a less precise set of tags should not override a more precise one
	# The tags and tagsonly options are deprecated.
	# The translated and untranslated options have a higher priority.
	$tag = $self->get_tag_from_list($path, $self->{translated});
	if (defined $tag) {
		$usedefault = 0;
		$options = $tag;
		$options =~ s/<.*$//;
		$translate = 1;
	}

	# Fall back to the 'wrap' option when no wrapping was requested
	# explicitly.
	if ($translate and $options !~ m/w/i) {
		$options .= ($self->{options}{'wrap'})?"w":"W";
	}

	if (not defined $tag) {
		$tag = $self->get_tag_from_list($path, $self->{untranslated});
		if (defined $tag) {
			$usedefault = 0;
			$options = "";
			$translate = 0;
		}
	}

	$tag = $self->get_tag_from_list($path, $self->{inline});
	if (defined $tag) {
		$usedefault = 0;
		$options .= "i";
	} else {
		$tag = $self->get_tag_from_list($path, $self->{placeholder});
		if (defined $tag) {
			$usedefault = 0;
			$options .= "p";
		}
	}

	if ($usedefault) {
		$options = $self->{options}{'defaulttranslateoption'};
	}

	# A translatable inline tag in an untranslated tag is treated as a
	# translatable breaking tag.
	if ($options =~ m/i/) {
		my $ppath = $path;
		# Only look at the parent when the path really ends with a tag
		# that can be removed.  Otherwise the "parent" would be the path
		# itself, which is not cached yet, and the call below would
		# recurse without end (for instance when the default options
		# contain "i" and the empty path is reached).
		if ($ppath =~ s/<[^>]*>$//) {
			my $poptions = $self->get_translate_options ($ppath);
			if ($poptions eq "") {
				$options =~ s/i//;
			}
		}
	}

	if ($options =~ m/i/ and $self->{options}{'foldattributes'}) {
		$options .= "f";
	}

	$translate_options_cache{$path} = $options;
	return $options;
}


# Return the tag (or biggest set of tags) of a list which matches with the
# given path.
#
# $path is a tag hierarchy such as "<a><b>"; $list is a reference to a hash
# whose keys are tags (or hierarchies) and whose values are their options.
#
# The tag (or set of tags) is returned with its options in front of it.
#
# If no tags could match the path, undef is returned.
sub get_tag_from_list ($$$) {
	my ($self,$path,$list) = @_;
	if ($self->{options}{'caseinsensitive'}) {
		$path = lc $path;
	}

	while (1) {
		if (defined $list->{$path}) {
			return $list->{$path}.$path;
		}
		# Try again without the outermost tag.  Stop as soon as no
		# leading "<...>" can be removed: only testing for the presence
		# of "<" would loop forever on a path that contains "<" but
		# has nothing to strip at its beginning.
		last unless ($path =~ s/^<.*?>//);
	}

	# Explicit undef is kept on purpose: it is what callers of the
	# original implementation receive, whatever the calling context.
	return undef;
}


# Handle the content found up to the next breaking tag.
#
# Text and inline tags are accumulated in @paragraph, an array of
# (string, reference) pairs, until a breaking tag or the end of the input is
# reached.  Comments met on the way are stored in @comments.  Placeholder
# tags open and close holders on the @save_holders stack.  The accumulated
# paragraph is finally handed to translate_paragraph().
#
# Returns the end-of-input flag reported by the last read.
sub treat_content {
	my $self = shift;
	# Trailing blanks removed from the paragraph, pushed after it
	my $blank="";
	# Indicates if the paragraph will have to be translated
	my $translate = "";

	# Text up to the first tag
	my ($eof,@paragraph)=$self->get_string_until('<',{remove=>1});

	while (!$eof and !$self->breaking_tag) {
	NEXT_TAG:
		my @text;
		my $type = $self->tag_type;
		my $f_extract = $tag_types[$type]->{'f_extract'};
		if (    defined($f_extract)
		    and $f_extract eq \&tag_extract_comment) {
			# Remove the content of the comments
			($eof, @text) = $self->extract_tag($type,1);
			# Mark the end of this comment with a NUL character, so
			# that translate_paragraph can tell the comments apart.
			$text[$#text-1] .= "\0";
			if ($tag_types[$type]->{'beginning'} eq "!--#") {
				# extract_tag removed the whole "!--#" mark: restore
				# the "#".
				$text[0] = "#".$text[0];
			}
			push @comments, @text;
		} else {
			# Peek at the tag (it stays in the input)...
			my ($tmpeof, @tag) = $self->extract_tag($type,0);
			# Append the found inline tag
			# (... then really consume it, marks included)
			($eof,@text)=$self->get_string_until('>',
			                                     {include=>1,
			                                      remove=>1,
			                                      unquoted=>1});
			# Append or remove the opening/closing tag from
			# the tag path
			if ($tag_types[$type]->{'end'} eq "") {
				if ($tag_types[$type]->{'beginning'} eq "") {
					# Opening inline tag
					my $cur_tag_name = $self->get_tag_name(@tag);
					my $t_opts = $self->get_translate_options($self->get_path($cur_tag_name));
					if ($t_opts =~ m/p/) {
						# We enter a new holder.
						# Append a <placeholder ...> tag to the current
						# paragraph, and save the @paragraph in the
						# current holder.
						my $last_holder = $save_holders[$#save_holders];
						# The id is the index the translation of this
						# holder will get in sub_translations.
						my $placeholder_str = "<placeholder type=\"".$cur_tag_name."\" id=\"".($#{$last_holder->{'sub_translations'}}+1)."\"/>";
						push @paragraph, ($placeholder_str, $text[1]);
						my @saved_paragraph = @paragraph;

						$last_holder->{'paragraph'} = \@saved_paragraph;

						# Then we must push a new holder
						my @new_paragraph = ();
						my @sub_translations = ();
						my %folded_attributes;
						my %new_holder = ('paragraph' => \@new_paragraph,
						                  'open' => $text[0],
						                  'translation' => "",
						                  'close' => undef,
						                  'sub_translations' => \@sub_translations,
						                  'folded_attributes' => \%folded_attributes);
						push @save_holders, \%new_holder;
						@text = ();

						# The current @paragraph
						# (for the current holder)
						# is empty.
						@paragraph = ();
					} elsif ($t_opts =~ m/f/) {
						# Fold the attributes: the full tag is stored in
						# the current holder and replaced in the
						# paragraph by a short tag carrying only an id.
						my $tag_full = $self->join_lines(@text);
						my $tag_ref = $text[1];
						# Only tags which have something after their
						# name are folded
						if ($tag_full =~ m/^<\s*\S+\s+\S.*>$/s) {
							my $holder = $save_holders[$#save_holders];
							# Use the first id above all those in use
							my $id = 0;
							foreach (keys %{$holder->{folded_attributes}}) {
								$id = $_ + 1 if ($_ >= $id);
							}
							$holder->{folded_attributes}->{$id} = $tag_full;

							@text = ("<$cur_tag_name po4a-id=$id>", $tag_ref);
						}
					}
					push @path, $cur_tag_name;
				} elsif ($tag_types[$type]->{'beginning'} eq "/") {
					# Closing inline tag

					# Check if this is closing the
					# last opening tag we detected.
					my $test = pop @path;
					my $name = $self->get_tag_name(@tag);
					if (!defined($test) ||
					    $test ne $name ) {
						# What happens then depends on the
						# 'ontagerror' option.
						my $ontagerror = $self->{options}{'ontagerror'};
						if ($ontagerror eq "warn") {
							warn wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong.  Continuing..."), $name);
						} elsif ($ontagerror ne "silent") {
							die wrap_ref_mod($tag[1], "po4a::xml", dgettext("po4a", "Unexpected closing tag </%s> found. The main document may be wrong."), $name);
						}
					}

					if ($self->get_translate_options($self->get_path($self->get_tag_name(@tag))) =~ m/p/) {
						# This closes the current holder.

						# The tag is put back in the path while its
						# content is translated.
						push @path, $self->get_tag_name(@tag);
						# Now translate this paragraph if needed.
						# This will call pushline and append the
						# translation to the current holder's translation.
						$self->translate_paragraph(@paragraph);
						pop @path;

						# Now that this holder is closed, we can remove
						# the holder from the stack.
						my $holder = pop @save_holders;
						# We need to keep the translation of this holder
						my $translation = $holder->{'open'}.$holder->{'translation'}.$text[0];
						# FIXME: @text could be multilines.

						@text = ();

						# Then we store the translation in the previous
						# holder's sub_translations array
						my $previous_holder = $save_holders[$#save_holders];
						push @{$previous_holder->{'sub_translations'}}, $translation;
						# We also need to restore the @paragraph array, as
						# it was before we encountered the holder.
						@paragraph = @{$previous_holder->{'paragraph'}};
					}
				}
			}
			push @paragraph, @text;
		}

		# Next tag
		($eof,@text)=$self->get_string_until('<',{remove=>1});
		if ($#text > 0) {
			# Check if text (extracted after the inline tag)
			# has to be translated
			push @paragraph, @text;
		}
	}

	# This strips the extracted strings
	# (only if you don't specify the 'nostrip' option, and if the
	# paragraph can be re-wrapped)
	$translate = $self->get_translate_options($self->get_path);
	if (!$self->{options}{'nostrip'} and $translate !~ m/W/) {
		my $clean = 0;
		# Clean the beginning
		# (the leading blanks are pushed as is, ahead of the paragraph)
		while (!$clean and $#paragraph > 0) {
			$paragraph[0] =~ /^(\s*)(.*)/s;
			my $match = $1;
			if ($paragraph[0] eq $match) {
				# Nothing but blanks: drop the whole element
				if ($match ne "") {
					$self->pushline($match);
				}
				shift @paragraph;
				shift @paragraph;
			} else {
				$paragraph[0] = $2;
				if ($match ne "") {
					$self->pushline($match);
				}
				$clean = 1;
			}
		}
		$clean = 0;
		# Clean the end
		# (the trailing blanks are kept in $blank, to be pushed after
		# the paragraph)
		while (!$clean and $#paragraph > 0) {
			$paragraph[$#paragraph-1] =~ /^(.*?)(\s*)$/s;
			my $match = $2;
			if ($paragraph[$#paragraph-1] eq $match) {
				# Nothing but blanks: drop the whole element
				if ($match ne "") {
					$blank = $match.$blank;
				}
				pop @paragraph;
				pop @paragraph;
			} else {
				$paragraph[$#paragraph-1] = $1;
				if ($match ne "") {
					$blank = $match.$blank;
				}
				$clean = 1;
			}
		}
	}

	# Translate the string when needed
	# This will either push the translation in the translated document or
	# in the current holder translation.
	$self->translate_paragraph(@paragraph);

	# Push the trailing blanks
	if ($blank ne "") {
		$self->pushline($blank);
	}
	return $eof;
}

# Translate a @paragraph array of (string, reference).
# Whether the strings are translated or just pushed is decided by the
# translation options of the current path (see get_translate_options).
#
# The pending comments of @comments are written out and passed along with
# the paragraph.  Once the paragraph is handled, the placeholders of the
# current holder are replaced by their translations when all of them are
# available.
sub translate_paragraph {
	my $self = shift;
	my @paragraph = @_;
	my $translate = $self->get_translate_options($self->get_path);

	# Leading elements which start with a blank line are pushed as is.
	while (    (scalar @paragraph)
	       and ($paragraph[0] =~ m/^\s*\n/s)) {
		$self->pushline($paragraph[0]);
		shift @paragraph;
		shift @paragraph;
	}

	# Text of all the pending comments, one per line
	my $comments;
	while (@comments) {
		my ($comment,$eoc);
		# Join the lines of one comment; its last line carries the
		# NUL end marker.
		do {
			my ($t,$l) = (shift @comments, shift @comments);
			$t =~ s/\n?(\0)?$//;
			$eoc = $1;
			$comment .= "\n" if defined $comment;
			$comment .= $t;
		} until ($eoc);
		$comments .= "\n" if defined $comments;
		$comments .= $comment;
		$self->pushline("<!--".$comment."-->\n") if defined $comment;
	}
	@comments = ();

	if ($self->{options}{'cpp'}) {
		# Preprocessor directives are pushed as is; the text found
		# before each of them is translated separately.
		my @tmp = @paragraph;
		@paragraph = ();
		while (@tmp) {
			my ($t,$l) = (shift @tmp, shift @tmp);
			# #include can be followed by a filename between
			# <> brackets. In that case, the argument won't be
			# handled in the same call to translate_paragraph.
			# Thus do not try to match "include ".
			if ($t =~ m/^#[ \t]*(if |endif|undef |include|else|ifdef |ifndef |define )/si) {
				if (@paragraph) {
					$self->translate_paragraph(@paragraph);
					@paragraph = ();
					$self->pushline("\n");
				}
				$self->pushline($t);
			} else {
				push @paragraph, ($t,$l);
			}
		}
	}

	my $para = $self->join_lines(@paragraph);
	if ( length($para) > 0 ) {
		if ($translate ne "") {
			# This tag should be translated
			$self->pushline($self->found_string(
				$para,
				$paragraph[1], {
					type=>"tag",
					tag_options=>$translate,
					comments=>$comments
				}));
		} else {
			# Inform that this tag isn't translated in debug mode
			print wrap_ref_mod($paragraph[1], "po4a::xml", dgettext ("po4a", "Content of tag %s excluded: %s"), $self->get_path, $para)
			       if $self->debug();
			$self->pushline($self->recode_skipped_text($para));
		}
	}
	# Now the paragraph is fully translated.
	# If we have all the holders' translation, we can replace the
	# placeholders by their translations.
	# We must wait to have all the translations because the holders are
	# numbered.
	{
		my $holder = $save_holders[$#save_holders];
		my $translation = $holder->{'translation'};

		# Count the number of <placeholder ...> in $translation
		# ($count is set to -1 when the sub-translation of one of them
		# still contains a placeholder: no replacement is done then)
		my $count = 0;
		my $str = $translation;
		while (    (defined $str)
		       and ($str =~ m/^.*?<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s)) {
			$count += 1;
			$str = $2;
			if ($holder->{'sub_translations'}->[$1] =~ m/<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>/s) {
				$count = -1;
				last;
			}
		}

		if (    (defined $translation)
		    and (scalar(@{$holder->{'sub_translations'}}) == $count)) {
			# OK, all the holders of the current paragraph are
			# closed (and translated).
			# Replace them by their translation.
			while ($translation =~ m/^(.*?)<placeholder\s+type="[^"]+"\s+id="(\d+)"\s*\/>(.*)$/s) {
				# FIXME: we could also check that
				#          * the holder exists
				#          * all the holders are used
				$translation = $1.$holder->{'sub_translations'}->[$2].$3;
			}
			# We have our translation
			$holder->{'translation'} = $translation;
			# And there is no need for any holder in it.
			my @sub_translations = ();
			$holder->{'sub_translations'} = \@sub_translations;
		}
	}

}


=head2 WORKING WITH THE MODULE OPTIONS

=over 4

=item treat_options()

This function fills the internal structures that contain the tags, attributes
and inline data with the options of the module (specified in the command-line
or in the initialize function).

=back

=cut

# Return the non-empty, whitespace-separated entries of an option value.
# An undefined value yields an empty list.
sub _xml_option_entries {
	my $value = shift;
	return () unless defined $value;
	return grep { length } split(/\s+/s, $value);
}

# Register the entries of the option $option in the hash $self->{$category}.
#
# An entry has the form "<options><tag>...": the key is the part starting at
# the first "<", the value is what precedes it.  When $bare_ok is true, an
# entry without any "<" (a bare attribute name) is also accepted and used
# as the key.  Other entries without "<" are ignored with a warning: using
# the captures of a failed match would register a bogus entry instead.
#
# When $nodefault (a hash reference) is given, the option is a list of
# defaults: entries listed in $nodefault or already registered are skipped.
sub _xml_register_entries {
	my ($self, $category, $option, $nodefault, $bare_ok) = @_;

	foreach my $entry (_xml_option_entries($self->{options}{$option})) {
		my ($opts, $key);
		if ($entry =~ m/^(.*?)(<.*)$/) {
			($opts, $key) = ($1, $2);
		} elsif ($bare_ok) {
			($opts, $key) = ("", $entry);
		} else {
			warn wrap_mod("po4a::xml",
			             dgettext("po4a",
			                      "Ignoring malformed entry '%s' of the '%s' option: tags must be written as <tag>."), $entry, $option);
			next;
		}
		next if     defined $nodefault
		        and (   $nodefault->{$key}
		             or defined $self->{$category}->{$key});
		$self->{$category}->{$key} = $opts || "";
	}
	return;
}

sub treat_options {
	my $self = shift;

	# Categories which have both a user option and a "_default_" option
	# (the defaults are, presumably, provided by the derived modules).
	my @categories = qw(break translated untranslated attributes inline placeholder);

	if ($self->{options}{'caseinsensitive'}) {
		foreach my $option ('nodefault', 'tags',
		                    map { ($_, "_default_".$_) } @categories) {
			$self->{options}{$option} = lc $self->{options}{$option};
		}
	}

	# Tags for which the default settings must be ignored
	my %list_nodefault = map { $_ => 1 } _xml_option_entries($self->{options}{'nodefault'});
	$self->{nodefault} = \%list_nodefault;

	if (length $self->{options}{'tags'}) {
		warn wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tags");
	}
	_xml_register_entries($self, 'tags', 'tags');

	if ($self->{options}{'tagsonly'}) {
		warn wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "The '%s' option is deprecated. Please use the translated/untranslated and/or break/inline/placeholder categories."), "tagsonly");
	}

	# For each category, the user's entries are registered first, so
	# that the defaults never override them.
	foreach my $category (@categories) {
		# Only attributes may be given without any tag
		my $bare_ok = ($category eq 'attributes') ? 1 : 0;
		_xml_register_entries($self, $category, $category, undef, $bare_ok);
		_xml_register_entries($self, $category, "_default_".$category, \%list_nodefault, $bare_ok);
	}

	# There should be no translated and untranslated tags
	foreach my $tag (keys %{$self->{translated}}) {
		die wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "Tag '%s' both in the %s and %s categories."), $tag, "translated", "untranslated")
			if defined $self->{untranslated}->{$tag};
	}
	# There should be no inline, break, and placeholder tags
	foreach my $tag (keys %{$self->{inline}}) {
		die wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "break")
			if defined $self->{break}->{$tag};
		die wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "Tag '%s' both in the %s and %s categories."), $tag, "inline", "placeholder")
			if defined $self->{placeholder}->{$tag};
	}
	foreach my $tag (keys %{$self->{break}}) {
		die wrap_mod("po4a::xml",
		             dgettext("po4a",
		                      "Tag '%s' both in the %s and %s categories."), $tag, "break", "placeholder")
			if defined $self->{placeholder}->{$tag};
	}
}

=head2 GETTING TEXT FROM THE INPUT DOCUMENT

=over

=item get_string_until($%)

This function returns an array with the lines (and references) from the input
document until it finds the first argument.  The second argument is an options
hash. Value 0 means disabled (the default) and 1, enabled.

The valid options are:

=over 4

=item include

This makes the returned array contain the searched text

=item remove

This removes the returned stream from the input

=item unquoted

This ensures that the searched text is outside any quotes

=back

=cut

sub get_string_until {
	my $self    = shift;
	my $search  = shift;
	my $options = shift;

	# Every option is disabled (0) unless the caller defines it.
	my $include  = defined($options->{include})  ? $options->{include}  : 0;
	my $remove   = defined($options->{remove})   ? $options->{remove}   : 0;
	my $unquoted = defined($options->{unquoted}) ? $options->{unquoted} : 0;
	my $regex    = defined($options->{regex})    ? $options->{regex}    : 0;

	# Unless it is given as a regular expression, the searched text is
	# matched literally.
	$search = quotemeta($search) unless $regex;

	# Read the input line by line, until the accumulated text contains
	# the searched text (outside any quotes, when requested).
	my @text;
	my $buffer;
	my $found = 0;
	my ($chunk, $ref) = $self->shiftline();
	while (defined $chunk) {
		push @text, $chunk, $ref;
		$buffer .= $chunk;
		if ($unquoted) {
			$found = 1 if $buffer =~ /^((\".*?\")|(\'.*?\')|[^\"\'])*$search/s;
		} else {
			$found = 1 if $buffer =~ /$search/s;
		}
		last if $found;
		($chunk, $ref) = $self->shiftline();
	}

	# The input ran out before the searched text was found
	my $eof = defined($chunk) ? 0 : 1;

	if ($found) {
		# Index of the last text element (references come after texts)
		my $last = $#text - 1;

		# Cut what follows the first occurrence off the last line
		my $rest = "";
		if ($unquoted) {
			$buffer =~ /^(?:(?:\".*?\")|(?:\'.*?\')|[^\"\'])*?$search(.*)$/s;
			$rest = $1;
			$text[$last] =~ s/\Q$rest\E$//s;
		} else {
			$buffer =~ /$search(.*)$/s;
			$rest = $1;
			$text[$last] =~ s/\Q$rest\E$//s;
		}
		unless ($include) {
			# The searched text itself goes back to the input too
			$text[$last] =~ /^(.*)($search.*)$/s;
			$text[$last] = $1;
			$rest = $2.$rest;
		}
		# Give the unused end of the line back to the input
		$self->unshiftline ($rest,$text[$#text])
			if (defined($rest) and ($rest ne ""));
	}

	# When nothing has to be removed, the input gets everything back
	$self->unshiftline (@text) unless $remove;

	#If we get to the end of the file, we return the whole paragraph
	return ($eof,@text);
}

=item skip_spaces(\@)

This function receives as argument the reference to a paragraph (in the format
returned by get_string_until), skips its leading spaces and returns them as
a simple string.

=cut

sub skip_spaces {
	my ($self, $pstring) = @_;
	my $skipped = "";

	while (@$pstring) {
		if ($pstring->[0] =~ /^(\s+)(.*)$/s) {
			# Move the leading blanks of this element to the result
			$skipped .= $1;
			$pstring->[0] = $2;
		} elsif ($pstring->[0] ne "") {
			# The element starts with something else: done
			last;
		}

		# An element with nothing left is dropped, with its reference
		if ($pstring->[0] eq "") {
			splice(@$pstring, 0, 2);
		}
	}
	return $skipped;
}

=item join_lines(@)

This function returns a simple string with the text from the argument array
(discarding the references).

=cut

sub join_lines {
	my ($self, @lines) = @_;
	my $joined = "";
	# The array holds (text, reference) pairs: keep the texts only.  An
	# unpaired last element is ignored.
	for (my $i = 0; $i < $#lines; $i += 2) {
		$joined .= $lines[$i];
	}
	return $joined;
}

=back

=head1 STATUS OF THIS MODULE

This module can translate tags and attributes.

=head1 TODO LIST

DOCTYPE (ENTITIES)

There is minimal support for the translation of entities. They are
translated as a whole, and tags are not taken into account. Multiline
entities are not supported and entities are always rewrapped during the
translation.

MODIFY TAG TYPES FROM INHERITED MODULES
(move the tag_types structure inside the $self hash?)

=head1 SEE ALSO

L<po4a(7)|po4a.7>, L<Locale::Po4a::TransTractor(3pm)|Locale::Po4a::TransTractor>.

=head1 AUTHORS

 Jordi Vilalta <jvprat@gmail.com>
 Nicolas François <nicolas.francois@centraliens.net>

=head1 COPYRIGHT AND LICENSE

 Copyright (c) 2004 by Jordi Vilalta  <jvprat@gmail.com>
 Copyright (c) 2008-2009 by Nicolas François <nicolas.francois@centraliens.net>

This program is free software; you may redistribute it and/or modify it
under the terms of GPL (see the COPYING file).

=cut

1;
author	Dongsheng Song <dongsheng.song@gmail.com>
date	Thu, 12 Mar 2009 15:43:56 +0800
parents
children