Mercurial > mplayer.hg
annotate TOOLS/subedit.pl @ 20962:a95ed9a69caf
Put files fully owned by my under GPL v2 "or later"
author | reimar |
---|---|
date | Fri, 17 Nov 2006 10:03:33 +0000 |
parents | 401b440a6d76 |
children | 92a795af2600 |
rev | line source |
---|---|
12686 | 1 #!/usr/bin/perl -w |
2 | |
3 # A script for pipelined editing of subtitle files. | |
4 # Copyright (C) 2004 Michael Klepikov <mike72@mail.ru> | |
5 # | |
6 # Version 1.0 initial release 28-Mar-04 | |
7 # | |
8 # Comments, suggestions -- send me an e-mail, but the recommended way is | |
9 # to enhance/fix on your own and submit to the distribution;) | |
10 # If you like, I can review the fixes. | |
11 # | |
12 # This script is free software; you can redistribute it and/or | |
13 # modify it under the terms of the GNU Lesser General Public | |
14 # License as published by the Free Software Foundation; either | |
15 # version 2 of the License, or (at your option) any later version. | |
16 # Retain original credits when modifying. | |
17 # | |
18 # This script is distributed in the hope that it will be useful, | |
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 # Lesser General Public License for more details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public | |
24 # License along with this library; if not, write to the Free Software | |
17367
401b440a6d76
Update licensing information: The FSF changed postal address.
diego
parents:
12686
diff
changeset
|
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
12686 | 26 # |
27 | |
28 use Math::BigInt; | |
29 | |
# Constants
my $FMT_UNKNOWN = 0;
my $FMT_SRT = 1;

# Argument values
my $DEBUG = 0;
my $inFormat;
my $outFormat;
my $shiftMilli;
my $scaleMilli;
my $splitFromMilli;
my $splitToMilli;

## Process command line

# Take the next command line word as a time value and convert it to
# milliseconds.  A missing or malformed value ends the program through
# usage() with the supplied error message.
my $nextTimeArg = sub {
  my $errorMsg = shift;
  my $argTime = shift @ARGV;
  my $millis = defined $argTime ? getTimeMillis ($argTime) : undef;
  usage ($errorMsg) if ! defined $millis;
  return $millis;
};

while (defined (my $argVal = shift @ARGV)) {
  if ($argVal eq "-d" || $argVal eq "--debug") {
    $DEBUG = 1;
  } elsif ($argVal eq "-if" || $argVal eq "--input-format") {
    $inFormat = shift @ARGV;
    usage ("Must specify input format") if ! $inFormat;
    if ($inFormat =~ /^srt/i) {
      $inFormat = $FMT_SRT;
    } else {
      usage ("Invalid input format");
    }
  } elsif ($argVal eq "-of" || $argVal eq "--output-format") {
    $outFormat = shift @ARGV;
    # This message used to say "input format", which pointed the user at
    # the wrong switch.
    usage ("Must specify output format") if ! $outFormat;
    if ($outFormat =~ /^srt/i) {
      $outFormat = $FMT_SRT;
    } else {
      usage ("Invalid output format");
    }
  } elsif ($argVal eq "-s" || $argVal eq "--shift") {
    $shiftMilli = $nextTimeArg -> ("Invalid shift time value");
  } elsif ($argVal eq "-c" || $argVal eq "--scale") {
    $scaleMilli = $nextTimeArg -> ("Invalid scale time value");
  } elsif ($argVal eq "-f" || $argVal eq "--split-from") {
    $splitFromMilli = $nextTimeArg -> ("Invalid split start time value");
  } elsif ($argVal eq "-t" || $argVal eq "--split-to") {
    $splitToMilli = $nextTimeArg -> ("Invalid split end time value");
  } elsif ($argVal eq "-h" || $argVal eq "--help") {
    usage ();
  } else {
    usage ("Unrecognized argument $argVal");
  }
}
93 | |
# Input format defaults to SRT
$inFormat = $FMT_SRT if (! defined $inFormat);
# Output format defaults to the same as input
$outFormat = $inFormat if (! defined $outFormat);

## Read

my $subs;
if ($inFormat == $FMT_SRT) {
  $subs = readSRT (*STDIN);
  printf STDERR ("Read %d SRT subs\n", scalar @{$subs}) if $DEBUG;
  # Sort by start time.  Both sides of the comparison must use the start
  # time; comparing a start time against an end time is not a consistent
  # ordering and leaves the list effectively unsorted.
  @{$subs} = sort {$a -> {srtStartTime} <=> $b -> {srtStartTime}} @{$subs};
}
108 | |
## Transform
# The transformations run in a fixed order: shift, then split, then scale.

if (defined $shiftMilli && $shiftMilli != 0) {
  printf STDERR ("Shift: %d milliseconds\n", $shiftMilli) if $DEBUG;
  shiftSRT ($subs, $shiftMilli);
}

if (defined $splitFromMilli || defined $splitToMilli) {
  if ($DEBUG) {
    # An undefined boundary is shown as "-"
    my ($printFrom, $printTo) =
      map { defined $_ ? $_ : "-" } ($splitFromMilli, $splitToMilli);
    printf STDERR ("Split: from $printFrom to $printTo\n");
  }
  splitSRT ($subs, $splitFromMilli, $splitToMilli);
}

if (defined $scaleMilli && $scaleMilli != 0 && scalar @{$subs} > 0) {
  # The end time of the last subtitle in the list stands for the overall
  # duration; the scale factor is (duration + scaleMilli) / duration.
  my $lastTimeOrig = $subs -> [-1] -> {srtEndTime};
  die "Cannot scale when last subtitle ends at 00:00:00,000"
    if $lastTimeOrig == 0;
  my $lastTimeScaled = $lastTimeOrig + $scaleMilli;
  printf STDERR ("Scale: %d/%d\n", $lastTimeScaled, $lastTimeOrig) if $DEBUG;
  scaleSRT ($subs, $lastTimeScaled, $lastTimeOrig);
}

## Write
writeSRT (*STDOUT, $subs) if $outFormat == $FMT_SRT;

# Close STDOUT, as recommended by Perl manual
# (allows diagnostics on disc overflow, etc.)
close (STDOUT) or die "Cannot close output stream: $!";

exit 0;
148 | |
149 ## Subroutines | |
150 | |
# Convert string time format to milliseconds
# SRT style: "01:20:03.251", and "," is allowed instead of "."
# Hours and minutes are optional and a leading "-" negates the value,
# e.g. "-1:5.03" is minus 65030 milliseconds.
# Surrounding whitespace is ignored.
# The result is always a whole number of milliseconds: fraction digits
# beyond the third are dropped.
# Return undef in case of format error (including input without any digit)
sub getTimeMillis
{
  # Work on a private copy; the caller's $_ (e.g. the line currently
  # being parsed by readSRT) must not be modified.
  my $text = shift;
  return undef if ! defined $text;

  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  # A time value without a single digit ("", "-", ".") is a format error
  return undef if $text !~ /[0-9]/;

  my $millis = 0;

  # The value is consumed from the right: fraction, seconds, minutes,
  # hours and finally the sign.
  if (my ($rest, $fraction) = $text =~ /^(.*)[.,]([0-9]*)$/) { # Fraction
    $text = $rest;
    # Pad or cut to exactly three digits using string operations, so no
    # floating point rounding error can leak into the result.
    $millis += substr ($fraction . "00", 0, 3);
  }
  if (my ($rest, $seconds) = $text =~ /^(.*?)([0-9]+)$/) {     # Seconds
    $text = $rest;
    $millis += $seconds * 1000;
  }
  if (my ($rest, $minutes) = $text =~ /^(.*?)([0-9]+):$/) {    # Minutes
    $text = $rest;
    $millis += $minutes * 60000;
  }
  if (my ($rest, $hours) = $text =~ /^(.*?)([0-9]+):$/) {      # Hours
    $text = $rest;
    $millis += $hours * 3600000;
  }
  if (my ($rest) = $text =~ /^(.*?)-$/) {                      # Minus sign
    $text = $rest;
    $millis = -$millis;
  }

  # Anything left over means the input was not a valid time value.
  # undef is returned explicitly so that callers always get one value.
  return undef if $text ne "";
  return $millis;
}
193 | |
# Format a millisecond count as an SRT time stamp, "hh:mm:ss,mmm".
# A negative count is rendered as its absolute value with a leading "-".
sub getTimeSRT
{
  my $millis = shift;

  my $sign = "";
  my $magnitude = $millis;
  if ($millis < 0) {
    $magnitude = -$millis;
    $sign = "-";
  }

  # These are plain (possibly fractional) quotients; "%" and "%02d" only
  # look at their integer part.
  my $seconds = $magnitude / 1000;
  my $minutes = $seconds / 60;
  my $hours = $minutes / 60;

  return sprintf ("%s%02d:%02d:%02d,%03d",
                  $sign, $hours, $minutes % 60, $seconds % 60,
                  $magnitude % 1000);
}
213 | |
# Read SRT subtitles
# Takes a file handle (glob, glob reference or lexical handle) and returns
# a reference to an array with one hash per subtitle:
#   srtNumber     the number found in the input
#   srtStartTime  start time in milliseconds
#   srtEndTime    end time in milliseconds
#   lines         reference to an array of text lines, trailing
#                 whitespace removed
# Dies with a line number on malformed input.
sub readSRT
{
  my $in = shift;
  my $subs = [];

  # The current line is kept in a lexical variable, so neither the
  # caller's $_ nor a helper that touches $_ can interfere with parsing.
  my $line = <$in>;
  print STDERR "Undefined first line\n" if ! defined $line && $DEBUG;
  # A byte-order mark at the start of the stream (raw UTF-8 bytes or the
  # decoded character) would make the first subtitle number unparsable.
  $line =~ s/^(?:\xEF\xBB\xBF|\x{FEFF})// if defined $line;
  my $lineNo = 1;
  READ_SUBS:
  while (defined $line) {
    # Each loop iteration reads one subtitle from the input
    my $sub = {};

    # Skip empty lines
    while ($line =~ /^\s*$/) {
      $line = <$in>;
      # End of file is detected with defined(): a last line consisting
      # of "0" without a newline is false, but it is still a line.
      last READ_SUBS if ! defined $line;
      ++$lineNo;
    }

    # Subtitle number
    if ($line =~ /^\s*([0-9]+)\s*$/) {
      $sub -> {srtNumber} = $1;
    } else {
      die "Invalid SRT format at line $lineNo";
    }

    # Timing
    $line = <$in>;
    die "Unexpected end of SRT stream at line $lineNo" if ! defined $line;
    ++$lineNo;
    if ($line =~ /^\s*(\S+)\s*--\>\s*(\S+)\s*$/) {
      # Copy the captures before calling a sub that runs its own matches
      my ($startText, $endText) = ($1, $2);
      my $startMillis = getTimeMillis ($startText);
      my $endMillis = getTimeMillis ($endText);
      die "Invalid SRT timing format at line $lineNo: $line"
        if ! defined $startMillis || ! defined $endMillis;
      $sub -> {srtStartTime} = $startMillis;
      $sub -> {srtEndTime} = $endMillis;
    } else {
      die "Invalid SRT timing format at line $lineNo: $line";
    }

    # Text lines
    my $subLines = [];
    while (defined ($line = <$in>)) {   # EOF ends subtitle
      ++$lineNo;
      last if $line =~ /^\s*$/;         # Empty line ends subtitle
      $line =~ s/\s+$//;                # Strip trailing spaces
      push @{$subLines}, $line;
    }
    die "No text in SRT subtitle at line $lineNo" if 0 == scalar @{$subLines};
    $sub -> {lines} = $subLines;

    # Append subtitle to the list
    push @{$subs}, $sub;
  }
  print STDERR "SRT read ok, $lineNo lines\n" if $DEBUG;

  return $subs;
}
281 | |
# Write SRT subtitles
# Takes an output file handle and a reference to the subtitle list.
# Every subtitle is written as its running number (counted from 1, the
# number read from the input is not used), the timing line, the text
# lines and a terminating empty line.
sub writeSRT
{
  my ($out, $subs) = @_;

  my $written = 0;
  for my $sub (@{$subs}) {
    ++$written;

    my $from = getTimeSRT ($sub -> {srtStartTime});
    my $to = getTimeSRT ($sub -> {srtEndTime});
    printf {$out} ("%d\n%s --> %s\n", $written, $from, $to);
    printf {$out} ("%s\n", $_) for @{$sub -> {lines}};
    printf {$out} "\n";
  }
  printf STDERR ("Wrote %d SRT subs\n", $written) if $DEBUG;
}
304 | |
# Shift SRT subtitles by a given number of milliseconds.
# The number may be negative; integer arithmetic is used, so any
# fractional part of the operands is dropped.
sub shiftSRT
{
  use integer;  # the offset could be passed as float
  my ($subs, $offset) = @_;

  for my $sub (@{$subs}) {
    $sub -> {$_} += $offset for qw(srtStartTime srtEndTime);
  }
}
318 | |
# Multiply each subtitle timing by a dividend and divide by a divisor.
# The idea is that the dividend is usually the new total number of
# milliseconds in the subtitle file, and the divisor is the old
# total number of milliseconds in the subtitle file.
# We could simply use a double precision real coefficient instead of
# integer dividend and divisor, and that could be good enough, but
# using integer arithmetics *guarantees* precision up to the last
# digit, so why settle for good enough when we can have a guarantee.
#
# Uses Math::BigInt arithmetics, because it works with numbers
# up to (total number of milliseconds for a subtitle timing)^2,
# which could be on the order of approximately 1e+13, which is
# larger than maximum 32-bit integer.
# There is a performance loss when using BigInt vs. regular floating
# point arithmetics, but the actual performance is quite acceptable
# on files with a few thousand subtitles.
#
# Dividend, divisor and the stored timings may be passed as floats;
# they are truncated to whole milliseconds first.  The scaled timings
# are stored as plain decimal integers.
# Dies if the (truncated) divisor is zero.
sub scaleSRT
{
  my $subs = shift;
  # "use integer" does not truncate a value on assignment, and
  # Math::BigInt turns a fractional number into NaN, so truncate here.
  my $scaleDividend = int (shift);
  my $scaleDivisor = int (shift);

  die "Cannot scale with a zero divisor" if $scaleDivisor == 0;

  foreach my $sub (@{$subs}) {
    foreach my $key ("srtStartTime", "srtEndTime") {
      my $time = Math::BigInt -> new (int ($sub -> {$key}));
      # bstr() gives plain decimal digits ("1500"); bsstr() would give
      # scientific notation ("15e+2").
      $sub -> {$key} =
        $time -> bmul ($scaleDividend) -> bdiv ($scaleDivisor) -> bstr ();
    }
  }
}
351 | |
# Extract a fragment within a given time interval
# Either "from" or "to" may be undefined, which means no boundary on
# that side.
# Subtitles that end before "from" or start after "to" are removed from
# the list; subtitles that overlap a boundary are kept with their timing
# cut to the boundary.  The list is modified in place.
sub splitSRT
{
  use integer; # fromMilli and toMilli could be passed as floats;
               # comparisons are made on whole milliseconds
  my $subs = shift;
  my $fromMilli = shift;
  my $toMilli = shift;

  # Collect the survivors in one pass and replace the list contents once,
  # rather than splicing elements out of the array one at a time.
  # A named loop variable is used so the caller's $_ is left alone.
  my @kept;
  foreach my $sub (@{$subs}) {
    if (defined $fromMilli) {
      # The subtitle ends before the start boundary
      next if $sub -> {srtEndTime} < $fromMilli;
      # Fix overlapping start timing
      $sub -> {srtStartTime} = $fromMilli
        if $sub -> {srtStartTime} < $fromMilli;
    }
    if (defined $toMilli) {
      # The subtitle begins after the end boundary
      next if $sub -> {srtStartTime} > $toMilli;
      # Fix overlapping end timing
      $sub -> {srtEndTime} = $toMilli
        if $sub -> {srtEndTime} > $toMilli;
    }
    push @kept, $sub;
  }
  @{$subs} = @kept;
}
393 | |
# Print brief usage help to STDERR and exit
# Accepts an optional error message, e.g. for errors parsing command line.
# With a message the message is printed first and the exit code is 2,
# without one the exit code is 0.
# Only switches that the command line parser accepts are listed; there
# is no renumbering switch because SRT output is always renumbered.
sub usage
{
  my $msg = shift;
  my $exitCode = 0;

  if (defined $msg) {
    $exitCode = 2;
    print STDERR "$msg\n";
  }

  print STDERR <<USAGE;
Usage: $0 [switches]
  -if,--input-format <fmt>  input format; supported: SRT
                            default is SRT
  -of,--output-format <fmt> output format; supported: SRT
                            default is same as input format
  -s,--shift <time>         shift all subtitles by <time>
                            (format: [-]hh:mm:ss,fraction)
  -c,--scale <time>         scale by adding <time> to overall duration
  -f,--split-from <time>    Drop subtitles that end before <time>
  -t,--split-to <time>      Drop subtitles that start after <time>
                            (will truncate timing if it overlaps a boundary)
  -d,--debug                enable debug output
  -h,--help                 this help message

All times could be negative. Input/output may also contain negative timings,
which is sometimes useful for intermediate results.
SRT subtitles are always renumbered on output.

EXAMPLES

Split subtitle file into two disks at a boundary of one hour 15 minutes:

  subedit.pl --split-to 1:15:0 < all.srt > p1.srt
  subedit.pl -f 1:15:0 < all.srt | subedit.pl --shift -1:15:0 > p2.srt

Join the previous two disks back into one file:

  subedit.pl -s 1:15:00 < p2.srt | cat p1.srt - | subedit.pl > all.srt

Correct a situation where the first subtitle starts in sync with the video,
but the last one starts 3.5 seconds earlier than the speech in the video,
assuming the first subtitle timing is 00:01:05.030:

  subedit.pl -s -1:5.03 | subedit.pl -c 3.5 | subedit.pl -s 1:5.03
USAGE

  exit $exitCode;
}