Mercurial > mplayer.hg
changeset 12686:cc5aa27a50ff
simple subtitle editor by Michael Klepikov
author | alex |
---|---|
date | Sat, 26 Jun 2004 12:35:35 +0000 |
parents | b4587790a399 |
children | cb35163ef0a1 |
files | TOOLS/subedit.pl |
diffstat | 1 files changed, 445 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl -w

# A script for pipelined editing of subtitle files.
# Copyright (C) 2004 Michael Klepikov <mike72@mail.ru>
#
# Version 1.0  initial release  28-Mar-04
#
# Comments, suggestions -- send me an e-mail, but the recommended way is
# to enhance/fix on your own and submit to the distribution;)
# If you like, I can review the fixes.
#
# This script is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# Retain original credits when modifying.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

use strict;
use warnings;

use Math::BigInt;

# Constants
my $FMT_UNKNOWN = 0;
my $FMT_SRT     = 1;

# Argument values
my $DEBUG = 0;
my $inFormat;
my $outFormat;
my $shiftMilli;
my $scaleMilli;
my $splitFromMilli;
my $splitToMilli;

## Process command line
while (defined (my $argVal = shift)) {
    if ($argVal eq "-d" || $argVal eq "--debug") {
        $DEBUG = 1;
    } elsif ($argVal eq "-if" || $argVal eq "--input-format") {
        $inFormat = nextFormatArg ("input");
    } elsif ($argVal eq "-of" || $argVal eq "--output-format") {
        $outFormat = nextFormatArg ("output");
    } elsif ($argVal eq "-s" || $argVal eq "--shift") {
        $shiftMilli = nextTimeArg ("shift");
    } elsif ($argVal eq "-c" || $argVal eq "--scale") {
        $scaleMilli = nextTimeArg ("scale");
    } elsif ($argVal eq "-f" || $argVal eq "--split-from") {
        $splitFromMilli = nextTimeArg ("split start");
    } elsif ($argVal eq "-t" || $argVal eq "--split-to") {
        $splitToMilli = nextTimeArg ("split end");
    } elsif ($argVal eq "-r" || $argVal eq "--renumber") {
        # Advertised in the usage text; SRT output is always renumbered,
        # so the switch is accepted and needs no further action.
    } elsif ($argVal eq "-h" || $argVal eq "--help") {
        usage ();
    } else {
        usage ("Unrecognized argument $argVal");
    }
}

# Input format defaults to SRT
$inFormat = $FMT_SRT if (! defined $inFormat);
# Output format defaults to the same as input
$outFormat = $inFormat if (! defined $outFormat);

## Read

my $subs = [];
if ($inFormat == $FMT_SRT) {
    $subs = readSRT (*STDIN);
    printf STDERR ("Read %d SRT subs\n", scalar @{$subs}) if $DEBUG;
    # Sort by start time (both sides of the comparison use the start time)
    @{$subs} = sort { $a->{srtStartTime} <=> $b->{srtStartTime} } @{$subs};
}

## Transform

if (defined $shiftMilli && 0 != $shiftMilli) {
    printf STDERR ("Shift: %d milliseconds\n", $shiftMilli) if $DEBUG;
    shiftSRT ($subs, $shiftMilli);
}

if (defined $splitFromMilli || defined $splitToMilli) {
    if ($DEBUG) {
        my $printFrom = (defined $splitFromMilli) ? $splitFromMilli : "-";
        my $printTo   = (defined $splitToMilli)   ? $splitToMilli   : "-";
        print STDERR "Split: from $printFrom to $printTo\n";
    }
    splitSRT ($subs, $splitFromMilli, $splitToMilli);
}

if (defined $scaleMilli && 0 != $scaleMilli) {
    my $lastSubIdx = scalar @{$subs} - 1;
    if ($lastSubIdx >= 0) {
        # The end of the last subtitle is taken as the overall duration
        my $lastTimeOrig = $subs->[$lastSubIdx]->{srtEndTime};
        if ($lastTimeOrig == 0) {
            die "Cannot scale when last subtitle ends at 00:00:00,000";
        }
        my $lastTimeScaled = $lastTimeOrig + $scaleMilli;
        printf STDERR ("Scale: %d/%d\n", $lastTimeScaled, $lastTimeOrig) if $DEBUG;
        scaleSRT ($subs, $lastTimeScaled, $lastTimeOrig);
    }
}

## Write
if ($outFormat == $FMT_SRT) {
    writeSRT (*STDOUT, $subs);
}

# Close STDOUT, as recommended by Perl manual
# (allows diagnostics on disc overflow, etc.)
close (STDOUT) or die "Cannot close output stream: $!";

exit 0;

## Subroutines

# Take the next command line word as a subtitle format name.
# Argument: "input" or "output", used only in the error messages.
# Returns the format constant; exits through usage() if the word is
# missing or does not name a supported format.
sub nextFormatArg {
    my $direction = shift;
    my $fmt = shift @ARGV;

    usage ("Must specify $direction format") if ! $fmt;
    usage ("Invalid $direction format") if $fmt !~ /^srt/i;
    return $FMT_SRT;
}

# Take the next command line word as a time value.
# Argument: name of the value ("shift", "scale", ...) for the error message.
# Returns the time in milliseconds; exits through usage() if the word is
# missing or cannot be parsed.
sub nextTimeArg {
    my $what = shift;
    my $argTime = shift @ARGV;

    my $millis = defined $argTime ? getTimeMillis ($argTime) : undef;
    usage ("Invalid $what time value") if ! defined $millis;
    return $millis;
}

# Convert string time format to milliseconds
# SRT style: "01:20:03.251", and "," is allowed instead of "."
# Hours, minutes and the fraction are optional; a leading "-" negates.
# Surrounding whitespace is ignored; fraction digits beyond the third
# are dropped, so the result is always an integer.
# Return undef in case of format error (including input without digits)
sub getTimeMillis {
    my $text = shift;
    return if ! defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    # Without this check "", "-" and "." would silently parse as zero
    return if $text !~ /[0-9]/;

    my $millis = 0;

    # The string is consumed from the right, one component at a time
    if ($text =~ s/[.,]([0-9]*)$//) {       # Fraction
        # Pad/cut to exactly three digits: pure integer arithmetic,
        # no floating point rounding surprises
        $millis += substr ($1 . "000", 0, 3);
    }
    if ($text =~ s/([0-9]+)$//) {           # Seconds
        $millis += $1 * 1000;
    }
    if ($text =~ s/([0-9]+):$//) {          # Minutes
        $millis += $1 * 60000;
    }
    if ($text =~ s/([0-9]+):$//) {          # Hours
        $millis += $1 * 3600000;
    }
    if ($text =~ s/-$//) {                  # Minus sign
        $millis = -$millis;
    }

    # Make sure we ate everything up
    return if $text ne "";
    return $millis;
}

# Convert milliseconds to SRT formatted string ("[-]hh:mm:ss,mmm")
sub getTimeSRT {
    my $t = shift;
    my $tMinus = "";
    if ($t < 0) {
        $t = -$t;
        $tMinus = "-";
    }
    my $tMilli = $t % 1000;
    my $tSec   = int ($t / 1000) % 60;
    my $tMin   = int ($t / 60000) % 60;
    my $tHr    = int ($t / 3600000);
    return sprintf ("%s%02d:%02d:%02d,%03d",
                    $tMinus, $tHr, $tMin, $tSec, $tMilli);
}

# Read SRT subtitles
# Argument: input file handle (glob).
# Returns a reference to a list of subtitles in input order; each
# subtitle is a hash with keys srtNumber, srtStartTime, srtEndTime
# (milliseconds) and lines (reference to a list of text lines without
# trailing whitespace).
# Dies on malformed input.
sub readSRT {
    my $in = shift;
    my $subs = [];

    my $line = <$in>;
    print STDERR "Undefined first line\n" if ! defined $line && $DEBUG;
    my $lineNo = 1;
    READ_SUBS:
    while (defined $line) {
        # Each loop iteration reads one subtitle from the input
        my $sub = {};

        # Skip empty lines
        while ($line =~ /^\s*$/) {
            $line = <$in>;
            last READ_SUBS if ! defined $line;
            ++$lineNo;
        }

        # Subtitle number
        if ($line =~ /^\s*([0-9]+)\s*$/) {
            $sub->{srtNumber} = $1;
        } else {
            die "Invalid SRT format at line $lineNo";
        }

        # Timing
        $line = <$in>;
        die "Unexpected end of SRT stream at line $lineNo" if ! defined $line;
        ++$lineNo;
        if ($line =~ /^\s*(\S+)\s*--\>\s*(\S+)\s*$/) {
            my ($startText, $endText) = ($1, $2);
            my $startMillis = getTimeMillis ($startText);
            my $endMillis = getTimeMillis ($endText);
            die "Invalid SRT timing format at line $lineNo: $line"
                if ! defined $startMillis || ! defined $endMillis;
            $sub->{srtStartTime} = $startMillis;
            $sub->{srtEndTime} = $endMillis;
        } else {
            die "Invalid SRT timing format at line $lineNo: $line";
        }

        # Text lines; EOF or an empty line ends the subtitle.
        # The test is for definedness so that a final unterminated
        # line "0" is not mistaken for end of input.
        my $subLines = [];
        while (defined ($line = <$in>)) {
            ++$lineNo;
            last if $line =~ /^\s*$/;
            $line =~ s/\s+$//;                  # Strip trailing spaces
            push @{$subLines}, $line;
        }
        die "No text in SRT subtitle at line $lineNo" if 0 == scalar @{$subLines};
        $sub->{lines} = $subLines;

        # Append subtitle to the list
        push @{$subs}, $sub;
    }
    print STDERR "SRT read ok, $lineNo lines\n" if $DEBUG;

    return $subs;
}

# Write SRT subtitles
# Arguments: output file handle (glob), reference to the subtitle list.
# Subtitles are numbered sequentially from 1; the numbers read from
# the input are not used.
sub writeSRT {
    my $out = shift;
    my $subs = shift;

    my $subNum = 0;
    foreach my $sub (@{$subs}) {
        ++$subNum;

        my $sTimeSRT = getTimeSRT ($sub->{srtStartTime});
        my $eTimeSRT = getTimeSRT ($sub->{srtEndTime});
        printf {$out} "%d\n%s --> %s\n", $subNum, $sTimeSRT, $eTimeSRT;
        foreach my $text (@{$sub->{lines}}) {
            print {$out} "$text\n";
        }
        print {$out} "\n";
    }
    printf STDERR ("Wrote %d SRT subs\n", $subNum) if $DEBUG;
}

# Shift SRT subtitles by a given number of milliseconds.
# The number may be negative.
sub shiftSRT {
    use integer; # $shiftMilli could be passed as float
    my $subs = shift;
    my $shiftMilli = shift;

    foreach my $sub (@{$subs}) {
        $sub->{srtStartTime} += $shiftMilli;
        $sub->{srtEndTime} += $shiftMilli;
    }
}

# Multiply each subtitle timing by a dividend and divide by divisor.
# The idea is that the dividend is usually the new total number of
# milliseconds in the subtitle file, and the divisor is the old
# total number of milliseconds in the subtitle file.
# We could simply use a double precision real coefficient instead of
# integer dividend and divisor, and that could be good enough, but
# using integer arithmetics *guarantees* precision up to the last
# digit, so why settle for good enough when we can have a guarantee.
#
# Uses Math::BigInt arithmetics, because it works with numbers
# up to (total number of milliseconds for a subtitle timing)^2,
# which could be on the order of approximately 1e+13, which is
# larger than maximum 32-bit integer.
# There is a performance loss when using BigInt vs. regular floating
# point arithmetics, but the actual performance is quite acceptable
# on files with a few thousand subtitles.
sub scaleSRT {
    my $subs = shift;
    # Dividend and divisor could be passed as floats, truncate:
    # Math::BigInt does not accept fractional operands
    my $scaleDividend = int (shift);
    my $scaleDivisor = int (shift);

    die "Cannot scale with a zero divisor" if $scaleDivisor == 0;

    foreach my $sub (@{$subs}) {
        foreach my $key ("srtStartTime", "srtEndTime") {
            # Scalar assignment makes bdiv return just the quotient
            my $scaled = Math::BigInt->new ($sub->{$key})
                -> bmul ($scaleDividend)
                -> bdiv ($scaleDivisor);
            # Only the intermediate product is large; store the
            # quotient back as a plain decimal number
            $sub->{$key} = $scaled->bstr ();
        }
    }
}

# Extract a fragment within a given time interval
# Either "from" or "to" may be undefined
# Subtitles that end before "from" or start after "to" are removed
# from the list; timings that overlap a boundary are cut to it.
sub splitSRT {
    use integer; # fromMilli and toMilli could be passed as floats, truncate
    my $subs = shift;
    my $fromMilli = shift;
    my $toMilli = shift;

    my @kept;
    foreach my $sub (@{$subs}) {
        # Drop the subtitle if it ends earlier than the start boundary
        next if defined $fromMilli && $sub->{srtEndTime} < $fromMilli;

        # Fix overlapping start timing,
        # but only if the start boundary is not infinite (undef)
        if (defined $fromMilli && $sub->{srtStartTime} < $fromMilli) {
            $sub->{srtStartTime} = $fromMilli;
        }

        # Drop the subtitle if it begins later than the end boundary
        next if defined $toMilli && $sub->{srtStartTime} > $toMilli;

        # Fix overlapping end timing,
        # but only if the end boundary is not infinite (undef)
        if (defined $toMilli && $sub->{srtEndTime} > $toMilli) {
            $sub->{srtEndTime} = $toMilli;
        }

        # All conditions met, all fixes done
        push @kept, $sub;
    }
    # Update the caller's list in place
    @{$subs} = @kept;
}

# Print brief usage help and exit
# Accepts an optional error message, e.g. for errors parsing command line;
# exit code is 2 when a message is given, 0 otherwise
sub usage {
    my $msg = shift;
    my $exitCode = 0;

    if (defined $msg) {
        $exitCode = 2;
        print STDERR "$msg\n";
    }

    print STDERR <<USAGE;
Usage: $0 [switches]
  -if,--input-format <fmt>  input format; supported: SRT
                            default is SRT
  -of,--output-format <fmt> output format; supported: SRT
                            default is same as input format
  -s,--shift <time>         shift all subtitles by <time>
                            (format: [-]hh:mm:ss,fraction)
  -c,--scale <time>         scale by adding <time> to overall duration
  -f,--split-from <time>    Drop subtitles that end before <time>
  -t,--split-to <time>      Drop subtitles that start after <time>
                            (will truncate timing if it overlaps a boundary)
  -r,--renumber             renumber SRT subtitles in output
  -d,--debug                enable debug output
  -h,--help                 this help message

All times could be negative. Input/output may also contain negative timings,
which is sometimes useful for intermediate results.
SRT subtitles are always renumbered on output.

EXAMPLES

Split subtitle file into two disks at a boundary of one hour 15 minutes:

  subedit.pl --split-to 1:15:0 < all.srt > p1.srt
  subedit.pl -f 1:15:0 < all.srt | subedit.pl --shift -1:15:0 > p2.srt

Join the previous two disks back into one file:

  subedit.pl -s 1:15:00 < p2.srt | cat p1.srt - | subedit.pl > all.srt

Correct a situation where the first subtitle starts in sync with the video,
but the last one starts 3.5 seconds earlier than the speech in the video,
assuming the first subtitle timing is 00:01:05.030:

  subedit.pl -s -1:5.03 | subedit.pl -c 3.5 | subedit.pl -s 1:5.03
USAGE

    exit $exitCode;
}