changeset 19492:c8daf3471201

SSA/ASS parser reworked, with two main results: support for fonts embedded in the script (uuencoded directly into the script text) has been added, and the matroska interface functions have been given more sensible names.
author eugeni
date Tue, 22 Aug 2006 22:11:01 +0000
parents 10d8f2cae948
children aa58338824f8
files libass/ass.c libass/ass.h libass/ass_types.h libmpdemux/demux_mkv.c
diffstat 4 files changed, 244 insertions(+), 122 deletions(-) [+]
line wrap: on
line diff
--- a/libass/ass.c	Tue Aug 22 20:55:49 2006 +0000
+++ b/libass/ass.c	Tue Aug 22 22:11:01 2006 +0000
@@ -18,6 +18,7 @@
 #include <iconv.h>
 extern char *sub_cp;
 #endif
+extern int extract_embedded_fonts;
 
 #include "mp_msg.h"
 #include "ass.h"
@@ -26,12 +27,27 @@
 
 char *get_path(char *);
 
+struct parser_priv_s { // per-track parser state, reached through ass_track_t.parser_priv
+	enum {PST_UNKNOWN = 0, PST_INFO, PST_STYLES, PST_EVENTS, PST_FONTS} state; // script section currently being parsed
+	char* fontname; // name of the embedded font being collected, 0 when none is pending
+	char* fontdata; // encoded font text accumulated from the lines of the fonts section
+	int fontdata_size; // bytes allocated for fontdata
+	int fontdata_used; // bytes of fontdata filled so far
+};
+
 #define ASS_STYLES_ALLOC 20
 #define ASS_EVENTS_ALLOC 200
 
 void ass_free_track(ass_track_t* track) {
 	int i;
 	
+	if (track->parser_priv) {
+		if (track->parser_priv->fontname)
+			free(track->parser_priv->fontname);
+		if (track->parser_priv->fontdata)
+			free(track->parser_priv->fontdata);
+		free(track->parser_priv);
+	}
 	if (track->style_format)
 		free(track->style_format);
 	if (track->event_format)
@@ -379,46 +395,219 @@
 	
 }
 
+// Styles section: remember the "Format:" line, hand each "Style:" line to process_style(). Always returns 0.
+static int process_styles_line(ass_track_t* track, char *str) {
+	if (strncmp(str, "Format:", 7) == 0) {
+		str += 7;
+		skip_spaces(&str);
+		track->style_format = strdup(str);
+		mp_msg(MSGT_GLOBAL, MSGL_DBG2, "Style format: %s\n", track->style_format);
+	} else if (strncmp(str, "Style:", 6) == 0) {
+		str += 6;
+		skip_spaces(&str);
+		process_style(track, str);
+	}
+	return 0;
+}
+
+// Script Info section: copy the recognised numeric header fields into the track; other lines are ignored.
+static int process_info_line(ass_track_t* track, char *str) {
+	if (strncmp(str, "PlayResX:", 9) == 0) {
+		track->PlayResX = atoi(&str[9]);
+	} else if (strncmp(str, "PlayResY:", 9) == 0) {
+		track->PlayResY = atoi(&str[9]);
+	} else if (strncmp(str, "Timer:", 6) == 0) {
+		track->Timer = atof(&str[6]);
+	} else if (strncmp(str, "WrapStyle:", 10) == 0) {
+		track->WrapStyle = atoi(&str[10]);
+	}
+	return 0;
+}
+
+// Events section: remember the "Format:" line and turn each "Dialogue:" line into a new event. Always returns 0.
+static int process_events_line(ass_track_t* track, char *str) {
+	ass_event_t* event;
+	int eid;
+
+	if (strncmp(str, "Format:", 7) == 0) {
+		char* fmt = str + 7;
+		skip_spaces(&fmt);
+		track->event_format = strdup(fmt);
+		mp_msg(MSGT_GLOBAL, MSGL_DBG2, "Event format: %s\n", track->event_format);
+		return 0;
+	}
+	if (strncmp(str, "Dialogue:", 9) != 0) {
+		mp_msg(MSGT_GLOBAL, MSGL_V, "Not understood: %s  \n", str);
+		return 0;
+	}
+	// Embedded subtitles should never get this far: their events have a
+	// slightly different format and are parsed in ass_process_chunk,
+	// which the demuxer calls directly.
+	str += 9;
+	skip_spaces(&str);
+	eid = ass_alloc_event(track);
+	event = track->events + eid;
+	process_event_tail(track, event, str, 0);
+	return 0;
+}
+
+// Adapted from mkvtoolnix. Decodes one group of characters (6 bits each, stored as value + 33)
+// into cnt bytes at dst and returns the advanced dst; callers pass cnt in 1..3.
+static unsigned char* decode_chars(unsigned char c1, unsigned char c2,
+		unsigned char c3, unsigned char c4, unsigned char* dst, int cnt)
+{
+	const unsigned char in[4] = {c1, c2, c3, c4};
+	uint32_t value = 0;
+	int i;
+
+	// cnt bytes are carried by the first cnt + 1 characters; anything after that is padding
+	// and must not be added in (it used to corrupt the last byte of a short final group).
+	for (i = 0; i < 4 && i <= cnt; ++i)
+		value |= ((in[i] - 33u) & 63u) << (6 * (3 - i)); // unsigned: no negative shifts
+	for (i = 0; i < 3 && i < cnt; ++i)
+		*dst++ = (value >> (8 * (2 - i))) & 0xff;
+	return dst;
+}
+
+// Decode the font text collected in parser_priv->fontdata, hand it to ass_process_font() when
+// extract_embedded_fonts is set, then free and reset the pending font state. Always returns 0.
+static int decode_font(ass_track_t* track)
+{
+	parser_priv_t* priv = track->parser_priv;
+	int size = priv->fontdata_used; // encoded size
+	int i, dsize; // dsize: decoded size
+	unsigned char *p, *q, *buf = 0;
+
+	mp_msg(MSGT_GLOBAL, MSGL_V, "font: %d bytes encoded data \n", size);
+	if (size % 4 == 1) {
+		mp_msg(MSGT_GLOBAL, MSGL_ERR, "bad encoded data size\n");
+		goto error_decode_font;
+	}
+	buf = malloc(size / 4 * 3 + 2);
+	if (!buf) {
+		mp_msg(MSGT_GLOBAL, MSGL_ERR, "out of memory while decoding font\n");
+		goto error_decode_font;
+	}
+	q = buf;
+	for (i = 0, p = (unsigned char*)priv->fontdata; i < size / 4; i++, p+=4)
+		q = decode_chars(p[0], p[1], p[2], p[3], q, 3);
+	// a short final group is padded with 33, which decodes to zero bits and so leaves the data bytes intact
+	if (size % 4 == 2) {
+		q = decode_chars(p[0], p[1], 33, 33, q, 1);
+	} else if (size % 4 == 3) {
+		q = decode_chars(p[0], p[1], p[2], 33, q, 2);
+	}
+	dsize = q - buf;
+	assert(dsize <= size / 4 * 3 + 2);
+	if (extract_embedded_fonts)
+		ass_process_font(priv->fontname, (char*)buf, dsize);
+
+error_decode_font:
+	free(buf);
+	free(priv->fontname);
+	free(priv->fontdata);
+	priv->fontname = priv->fontdata = 0;
+	priv->fontdata_size = priv->fontdata_used = 0;
+	return 0;
+}
+
+static char* validate_fname(char* name);
+
+// Fonts section: a "fontname:" line starts a new font (finishing any pending one); every other
+// line is appended to the pending font's encoded data. Always returns 0.
+static int process_fonts_line(ass_track_t* track, char *str)
+{
+	parser_priv_t* priv = track->parser_priv;
+	int len = strlen(str);
+
+	if (!strncmp(str, "fontname:", 9)) {
+		str += 9;
+		skip_spaces(&str);
+		if (priv->fontname) // there is no end-of-font marker: the next name ends the previous font
+			decode_font(track);
+		priv->fontname = validate_fname(str);
+		mp_msg(MSGT_GLOBAL, MSGL_V, "fontname: %s\n", priv->fontname);
+	} else if (!priv->fontname) {
+		mp_msg(MSGT_GLOBAL, MSGL_V, "Not understood: %s  \n", str);
+	} else if (len > 80) {
+		mp_msg(MSGT_GLOBAL, MSGL_WARN, "Font line too long: %d, %s\n", len, str);
+	} else {
+		if (priv->fontdata_used + len > priv->fontdata_size) {
+			// grow via a temporary: on failure realloc leaves the old buffer valid and returns 0
+			char* grown = realloc(priv->fontdata, priv->fontdata_size + 100 * 1024);
+			if (!grown) {
+				mp_msg(MSGT_GLOBAL, MSGL_ERR, "out of memory, font data line dropped\n");
+				return 0;
+			}
+			priv->fontdata = grown;
+			priv->fontdata_size += 100 * 1024;
+		}
+		memcpy(priv->fontdata + priv->fontdata_used, str, len);
+		priv->fontdata_used += len;
+	}
+	return 0;
+}
+
 /**
  * \brief Parse a header line
  * \param track track
  * \param str string to parse, zero-terminated
 */ 
-static int process_header_line(ass_track_t* track, char *str)
+static int process_line(ass_track_t* track, char *str)
 {
-	static int events_section_started = 0;
-	
-	mp_msg(MSGT_GLOBAL, MSGL_DBG2, "=== Header: %s\n", str);
-	if (strncmp(str, "PlayResX:", 9)==0) {
-		track->PlayResX = atoi(str + 9);
-	} else if (strncmp(str,"PlayResY:", 9)==0) {
-		track->PlayResY = atoi(str + 9);
-	} else if (strncmp(str,"Timer:", 6)==0) {
-		track->Timer = atof(str + 6);
-	} else if (strstr(str,"Styles]")) {
-		events_section_started = 0;
-		if (strchr(str, '+'))
-			track->track_type = TRACK_TYPE_ASS;
-		else
-			track->track_type = TRACK_TYPE_SSA;
-	} else if (strncmp(str,"[Events]", 8)==0) {
-		events_section_started = 1;
-	} else if (strncmp(str,"Format:", 7)==0) {
-		char* p = str + 7;
-		skip_spaces(&p);
-		if (events_section_started) {
-			track->event_format = strdup(p);
-			mp_msg(MSGT_GLOBAL, MSGL_DBG2, "Event format: %s\n", track->event_format);
-		} else {
-			track->style_format = strdup(p);
-			mp_msg(MSGT_GLOBAL, MSGL_DBG2, "Style format: %s\n", track->style_format);
+	if (strstr(str, "[Script Info]")) { // FIXME: strstr to skip possible BOM at the beginning of the script
+		track->parser_priv->state = PST_INFO;
+	} else if (!strncmp(str, "[V4 Styles]", 11)) {
+		track->parser_priv->state = PST_STYLES;
+		track->track_type = TRACK_TYPE_SSA;
+	} else if (!strncmp(str, "[V4+ Styles]", 12)) {
+		track->parser_priv->state = PST_STYLES;
+		track->track_type = TRACK_TYPE_ASS;
+	} else if (!strncmp(str, "[Events]", 8)) {
+		track->parser_priv->state = PST_EVENTS;
+	} else if (!strncmp(str, "[Fonts]", 7)) {
+		track->parser_priv->state = PST_FONTS;
+	} else {
+		switch (track->parser_priv->state) {
+		case PST_INFO:
+			process_info_line(track, str);
+			break;
+		case PST_STYLES:
+			process_styles_line(track, str);
+			break;
+		case PST_EVENTS:
+			process_events_line(track, str);
+			break;
+		case PST_FONTS:
+			process_fonts_line(track, str);
+			break;
+		default:
+			break;
 		}
-	} else if (strncmp(str,"Style:", 6)==0) {
-		char* p = str + 6;
-		skip_spaces(&p);
-		process_style(track, p);
-	} else if (strncmp(str,"WrapStyle:", 10)==0) {
-		track->WrapStyle = atoi(str + 10);
+	}
+
+	// there is no explicit end-of-font marker in ssa/ass
+	if ((track->parser_priv->state != PST_FONTS) && (track->parser_priv->fontname))
+		decode_font(track);
+
+	return 0;
+}
+
+static int process_text(ass_track_t* track, char* str)
+{ // Splits str in place at '\r' / '\n' and passes each non-empty line to process_line(); always returns 0
+	char* p = str;
+	while(1) {
+		char* q;
+		for (;((*p=='\r')||(*p=='\n'));++p) {} // skip the line terminators in front of the next line
+		for (q=p; ((*q!='\0')&&(*q!='\r')&&(*q!='\n')); ++q) {}; // q: first character after the line
+		if (q==p) // empty line here means the end of the text was reached
+			break;
+		if (*q != '\0')
+			*(q++) = '\0'; // cut the line off in place and step past the terminator
+		process_line(track, p);
+		if (*q == '\0') // that was the last line
+			break;
+		p = q;
+	}
 	return 0;
 }
@@ -428,31 +617,17 @@
  * \param track track
  * \param data string to parse
  * \param size length of data
- CodecPrivate section contains [Stream Info] and [V4+ Styles] sections
+ CodecPrivate section contains [Script Info] and [V4+ Styles] ([V4 Styles] for SSA) sections
 */ 
-void ass_process_chunk(ass_track_t* track, char *data, int size)
+void ass_process_codec_private(ass_track_t* track, char *data, int size)
 {
 	char* str = malloc(size + 1);
-	char* p;
 	int sid;
 
 	memcpy(str, data, size);
 	str[size] = '\0';
 
-	p = str;
-	while(1) {
-		char* q;
-		for (;((*p=='\r')||(*p=='\n'));++p) {}
-		for (q=p; ((*q!='\0')&&(*q!='\r')&&(*q!='\n')); ++q) {};
-		if (q==p)
-			break;
-		if (*q != '\0')
-			*(q++) = '\0';
-		process_header_line(track, p);
-		if (*q == '\0')
-			break;
-		p = q;
-	}
+	process_text(track, str);
 	free(str);
 
 	// add "Default" style to the end
@@ -464,6 +639,7 @@
 	if (!track->event_format) {
 		// probably an mkv produced by ancient mkvtoolnix
 		// such files don't have [Events] and Format: headers
+		track->parser_priv->state = PST_EVENTS;
 		if (track->track_type == TRACK_TYPE_SSA)
 			track->event_format = strdup("Format: Marked, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text");
 		else
@@ -488,7 +664,7 @@
  * \param timecode starting time of the event (milliseconds)
  * \param duration duration of the event (milliseconds)
 */ 
-void ass_process_line(ass_track_t* track, char *data, int size, long long timecode, long long duration)
+void ass_process_chunk(ass_track_t* track, char *data, int size, long long timecode, long long duration)
 {
 	char* str;
 	int eid;
@@ -535,29 +711,6 @@
 	free(str);
 }
 
-/**
- * \brief Process a line from external file.
- * \param track track
- * \param str string to parse
- * \param size length of data
-*/ 
-static void ass_process_external_line(ass_track_t* track, char *str, int size)
-{
-	int eid;
-	ass_event_t* event;
-
-	eid = ass_alloc_event(track);
-	event = track->events + eid;
-
-	if (strncmp("Dialogue:", str, 9) != 0)
-		return;
-
-	str += 9;
-	while (*str == ' ') {++str;}
-	
-	process_event_tail(track, event, str, 0);
-}
-
 #ifdef USE_ICONV
 /** \brief recode buffer to utf-8
  * constraint: sub_cp != 0
@@ -641,8 +794,6 @@
 	long sz;
 	long bytes_read;
 	char* buf;
-	char* p;
-	int events_reached;
 	ass_track_t* track;
 	
 	FILE* fp = fopen(fname, "rb");
@@ -698,49 +849,15 @@
 	track->name = strdup(fname);
 	
 	// process header
-	events_reached = 0;
-	p = buf;
-	while (p && (*p)) {
-		while (*p == '\n') {++p;}
-		if (strncmp(p, "[Events]", 8) == 0) {
-			events_reached = 1;
-		} else if ((strncmp(p, "Format:", 7) == 0) && (events_reached)) {
-			p = strchr(p, '\n');
-			if (p == 0) {
-				mp_msg(MSGT_GLOBAL, MSGL_WARN, "Incomplete subtitles\n");
-				free(buf);
-				return 0;
-			}
-			ass_process_chunk(track, buf, p - buf + 1);
-			++p;
-			break;
-		}
-		p = strchr(p, '\n');
-	}
-	// process events
-	while (p && (*p)) {
-		char* next;
-		int len;
-		while (*p == '\n') {++p;}
-		next = strchr(p, '\n');
-		len = 0;
-		if (next) {
-			len = next - p;
-			*next = 0;
-		} else {
-			len = strlen(p);
-		}
-		ass_process_external_line(track, p, len);
-		if (next) {
-			p = next + 1;
-			continue;
-		} else
-			break;
-	}
-	
+	process_text(track, buf);
+
+	// there is no explicit end-of-font marker in ssa/ass
+	if (track->parser_priv->fontname)
+		decode_font(track);
+
 	free(buf);
 
-	if (!events_reached) {
+	if (track->track_type == TRACK_TYPE_UNKNOWN) {
 		ass_free_track(track);
 		return 0;
 	}
@@ -853,6 +970,7 @@
 
 ass_track_t* ass_new_track(void) {
 	ass_track_t* track = calloc(1, sizeof(ass_track_t));
+	track->parser_priv = calloc(1, sizeof(parser_priv_t));
 	return track;
 }
 
--- a/libass/ass.h	Tue Aug 22 20:55:49 2006 +0000
+++ b/libass/ass.h	Tue Aug 22 22:11:01 2006 +0000
@@ -131,7 +131,7 @@
  * \param data string to parse
  * \param size length of data
  */
-void ass_process_chunk(ass_track_t* track, char *data, int size);
+void ass_process_codec_private(ass_track_t* track, char *data, int size);
 
 /**
  * \brief Process a chunk of subtitle stream data. In matroska, this containes exactly 1 event (or a commentary)
@@ -141,7 +141,7 @@
  * \param timecode starting time of the event (milliseconds)
  * \param duration duration of the event (milliseconds)
 */
-void ass_process_line(ass_track_t* track, char *data, int size, long long timecode, long long duration);
+void ass_process_chunk(ass_track_t* track, char *data, int size, long long timecode, long long duration);
 
 /**
  * \brief Read subtitles from file.
--- a/libass/ass_types.h	Tue Aug 22 20:55:49 2006 +0000
+++ b/libass/ass_types.h	Tue Aug 22 22:11:01 2006 +0000
@@ -53,6 +53,8 @@
 	char* Text;
 } ass_event_t;
 
+typedef struct parser_priv_s parser_priv_t;
+
 /// ass track represent either an external script or a matroska subtitle stream (no real difference between them)
 /// it can be used in rendering after the headers are parsed (i.e. events format line read)
 typedef struct ass_track_s {
@@ -66,7 +68,7 @@
 	char* style_format; // style format line (everything after "Format: ")
 	char* event_format; // event format line
 
-	enum {TRACK_TYPE_ASS, TRACK_TYPE_SSA} track_type;
+	enum {TRACK_TYPE_UNKNOWN = 0, TRACK_TYPE_ASS, TRACK_TYPE_SSA} track_type;
 	
 	// script header fields
 	int PlayResX;
@@ -77,6 +79,8 @@
 	
 	int default_style; // index of default style
 	char* name; // file name in case of external subs, 0 for streams
+
+	parser_priv_t* parser_priv;
 } ass_track_t;
 
 #endif
--- a/libmpdemux/demux_mkv.c	Tue Aug 22 20:55:49 2006 +0000
+++ b/libmpdemux/demux_mkv.c	Tue Aug 22 22:11:01 2006 +0000
@@ -2309,7 +2309,7 @@
         }
       track->sh_sub.type = 'a';
       track->sh_sub.ass_track = ass_new_track();
-      ass_process_chunk(track->sh_sub.ass_track, track->private_data, track->private_size);
+      ass_process_codec_private(track->sh_sub.ass_track, track->private_data, track->private_size);
     }
 }
 #endif
@@ -2800,7 +2800,7 @@
 
 #ifdef USE_ASS
   if (ass_enabled && track->subtitle_type == MATROSKA_SUBTYPE_SSA) {
-    ass_process_line(track->sh_sub.ass_track, block, size, (long long)timecode, (long long)block_duration);
+    ass_process_chunk(track->sh_sub.ass_track, block, size, (long long)timecode, (long long)block_duration);
     return;
   }
 #endif