changeset 30757:6a5957bf5b70

Extend stream_read_line to support reading lines from UTF-16 encoded files and use this to support reading UTF-16 encoded subtitle files in subreader.c
author reimar
date Sun, 28 Feb 2010 15:24:30 +0000
parents 69bbd8b9fd09
children 33765d7d6e0a
files Changelog stream/stream.c stream/stream.h subreader.c
diffstat 4 files changed, 153 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/Changelog	Sun Feb 28 14:27:44 2010 +0000
+++ b/Changelog	Sun Feb 28 15:24:30 2010 +0000
@@ -36,6 +36,7 @@
     * support for displaying subs in the terminal (FIXME)
     * support for subtitles with audio-only files
     * support for right-to-left languages with embedded subtitles
+    * support for UTF-16 encoded external subtitles
     * support for 8 channel audio
     * sync dvd:// and dvdnav:// features
     * support for MPEG-4 ASP in VDPAU video output (non B-frame only)
--- a/stream/stream.c	Sun Feb 28 14:27:44 2010 +0000
+++ b/stream/stream.c	Sun Feb 28 15:24:30 2010 +0000
@@ -41,6 +41,7 @@
 #include "network.h"
 #include "stream.h"
 #include "libmpdemux/demuxer.h"
+#include "libavutil/intreadwrite.h"
 
 #include "m_option.h"
 #include "m_struct.h"
@@ -488,9 +489,103 @@
     return stream_check_interrupt_cb(time);
 }
 
-unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max) {
+/**
+ * Read a 16-bit little-endian value at *buf and advance *buf by 2 bytes
+ */
+static uint16_t get_le16_inc(const uint8_t **buf)
+{
+  uint16_t v = AV_RL16(*buf);
+  *buf += 2;
+  return v;
+}
+
+/**
+ * Read a 16-bit big-endian value at *buf and advance *buf by 2 bytes
+ */
+static uint16_t get_be16_inc(const uint8_t **buf)
+{
+  uint16_t v = AV_RB16(*buf);
+  *buf += 2;
+  return v;
+}
+
+/**
+ * Find the first '\n' in buffer; returns pointer to its last byte or NULL
+ * \param buf buffer to search
+ * \param len number of bytes to search in buffer, must not overread
+ * \param utf16 choose between UTF-8/ASCII/other and LE and BE UTF-16
+ *              0 = UTF-8/ASCII/other, 1 = UTF-16-LE, 2 = UTF-16-BE
+ */
+static const uint8_t *find_newline(const uint8_t *buf, int len, int utf16)
+{
+  uint32_t c;
+  const uint8_t *end = buf + len;
+  switch (utf16) {
+  case 0:
+    return (uint8_t *)memchr(buf, '\n', len);
+  case 1:
+    while (buf < end - 1) {
+      GET_UTF16(c, buf < end - 1 ? get_le16_inc(&buf) : 0, return NULL;)
+      if (buf <= end && c == '\n')
+        return buf - 1;
+    }
+    break;
+  case 2:
+    while (buf < end - 1) {
+      GET_UTF16(c, buf < end - 1 ? get_be16_inc(&buf) : 0, return NULL;)
+      if (buf <= end && c == '\n')
+        return buf - 1;
+    }
+    break;
+  }
+  return NULL;
+}
+
+/**
+ * Copy bytes, converting to UTF-8 if input is UTF-16; returns bytes written
+ * \param dst buffer to copy to
+ * \param dstsize size of dst buffer in bytes
+ * \param src buffer to copy from
+ * \param len in: number of bytes to copy from src, out: bytes consumed
+ * \param utf16 choose between UTF-8/ASCII/other and LE and BE UTF-16
+ *              0 = UTF-8/ASCII/other, 1 = UTF-16-LE, 2 = UTF-16-BE
+ */
+static int copy_characters(uint8_t *dst, int dstsize,
+                           const uint8_t *src, int *len, int utf16)
+{
+  uint32_t c;
+  uint8_t *dst_end = dst + dstsize;
+  const uint8_t *end = src + *len;
+  switch (utf16) {
+  case 0:
+    if (*len > dstsize)
+      *len = dstsize;
+    memcpy(dst, src, *len);
+    return *len;
+  case 1:
+    while (src < end - 1 && dst_end - dst > 8) {
+      uint8_t tmp;
+      GET_UTF16(c, src < end - 1 ? get_le16_inc(&src) : 0, ;)
+      PUT_UTF8(c, tmp, *dst++ = tmp;)
+    }
+    *len -= end - src;
+    return dstsize - (dst_end - dst);
+  case 2:
+    while (src < end - 1 && dst_end - dst > 8) {
+      uint8_t tmp;
+      GET_UTF16(c, src < end - 1 ? get_be16_inc(&src) : 0, ;)
+      PUT_UTF8(c, tmp, *dst++ = tmp;)
+    }
+    *len -= end - src;
+    return dstsize - (dst_end - dst);
+  }
+  return 0;
+}
+
+unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max, int utf16) {
   int len;
-  unsigned char* end,*ptr = mem;
+  const unsigned char *end;
+  unsigned char *ptr = mem;
   if (max < 1) return NULL;
   max--; // reserve one for 0-termination
   do {
@@ -499,13 +594,14 @@
     if(len <= 0 &&
        (!cache_stream_fill_buffer(s) ||
         (len = s->buf_len-s->buf_pos) <= 0)) break;
-    end = (unsigned char*) memchr((void*)(s->buffer+s->buf_pos),'\n',len);
+    end = find_newline(s->buffer+s->buf_pos, len, utf16);
     if(end) len = end - (s->buffer+s->buf_pos) + 1;
     if(len > 0 && max > 0) {
-      int l = len > max ? max : len;
-      memcpy(ptr,s->buffer+s->buf_pos,l);
+      int l = copy_characters(ptr, max, s->buffer+s->buf_pos, &len, utf16);
       max -= l;
       ptr += l;
+      if (!len)
+        break;
     }
     s->buf_pos += len;
   } while(!end);
--- a/stream/stream.h	Sun Feb 28 14:27:44 2010 +0000
+++ b/stream/stream.h	Sun Feb 28 15:24:30 2010 +0000
@@ -265,7 +265,7 @@
   return total;
 }
 
-unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max);
+unsigned char* stream_read_line(stream_t *s,unsigned char* mem, int max, int utf16);
 
 inline static int stream_eof(stream_t *s){
   return s->eof;
--- a/subreader.c	Sun Feb 28 14:27:44 2010 +0000
+++ b/subreader.c	Sun Feb 28 15:24:30 2010 +0000
@@ -111,7 +111,7 @@
     return NULL;
 }
 
-static subtitle *sub_read_line_sami(stream_t* st, subtitle *current) {
+static subtitle *sub_read_line_sami(stream_t* st, subtitle *current, int utf16) {
     static char line[LINE_LEN+1];
     static char *s = NULL, *slacktime_s;
     char text[LINE_LEN+1], *p=NULL, *q;
@@ -123,7 +123,7 @@
 
     /* read the first line */
     if (!s)
-	    if (!(s = stream_read_line(st, line, LINE_LEN))) return 0;
+	    if (!(s = stream_read_line(st, line, LINE_LEN, utf16))) return 0;
 
     do {
 	switch (state) {
@@ -230,7 +230,7 @@
 	}
 
 	/* read next line */
-	if (state != 99 && !(s = stream_read_line (st, line, LINE_LEN))) {
+	if (state != 99 && !(s = stream_read_line (st, line, LINE_LEN, utf16))) {
 	    if (current->start > 0) {
 		break; // if it is the last subtitle
 	    } else {
@@ -274,14 +274,14 @@
     else return NULL;  // last text field
 }
 
-static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     char line2[LINE_LEN+1];
     char *p, *next;
     int i;
 
     do {
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
     } while ((sscanf (line,
 		      "{%ld}{}%[^\r\n]",
 		      &(current->start), line2) < 2) &&
@@ -302,14 +302,14 @@
     return current;
 }
 
-static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     char line2[LINE_LEN+1];
     char *p, *next;
     int i;
 
     do {
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
     } while ((sscanf (line,
 		      "[%ld][%ld]%[^\r\n]",
 		      &(current->start), &(current->end), line2) < 3));
@@ -328,19 +328,19 @@
     return current;
 }
 
-static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current) {
+static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     int a1,a2,a3,a4,b1,b2,b3,b4;
     char *p=NULL, *q=NULL;
     int len;
 
     while (1) {
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 	if (sscanf (line, "%d:%d:%d.%d,%d:%d:%d.%d",&a1,&a2,&a3,&a4,&b1,&b2,&b3,&b4) < 8) continue;
 	current->start = a1*360000+a2*6000+a3*100+a4;
 	current->end   = b1*360000+b2*6000+b3*100+b4;
 
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 
 	p=q=line;
 	for (current->lines=1; current->lines < SUB_MAX_TEXT; current->lines++) {
@@ -358,21 +358,21 @@
     return current;
 }
 
-static subtitle *sub_read_line_subviewer(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_subviewer(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     int a1,a2,a3,a4,b1,b2,b3,b4;
     char *p=NULL;
     int i,len;
 
     while (!current->text[0]) {
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 	if ((len=sscanf (line, "%d:%d:%d%[,.:]%d --> %d:%d:%d%[,.:]%d",&a1,&a2,&a3,(char *)&i,&a4,&b1,&b2,&b3,(char *)&i,&b4)) < 10)
 	    continue;
 	current->start = a1*360000+a2*6000+a3*100+a4/10;
 	current->end   = b1*360000+b2*6000+b3*100+b4/10;
 	for (i=0; i<SUB_MAX_TEXT;) {
 	    int blank = 1;
-	    if (!stream_read_line (st, line, LINE_LEN)) break;
+	    if (!stream_read_line (st, line, LINE_LEN, utf16)) break;
 	    len=0;
 	    for (p=line; *p!='\n' && *p!='\r' && *p; p++,len++)
 		if (*p != ' ' && *p != '\t')
@@ -410,21 +410,21 @@
     return current;
 }
 
-static subtitle *sub_read_line_subviewer2(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_subviewer2(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     int a1,a2,a3,a4;
     char *p=NULL;
     int i,len;
 
     while (!current->text[0]) {
-        if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+        if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 	if (line[0]!='{')
 	    continue;
         if ((len=sscanf (line, "{T %d:%d:%d:%d",&a1,&a2,&a3,&a4)) < 4)
             continue;
         current->start = a1*360000+a2*6000+a3*100+a4/10;
         for (i=0; i<SUB_MAX_TEXT;) {
-            if (!stream_read_line (st, line, LINE_LEN)) break;
+            if (!stream_read_line (st, line, LINE_LEN, utf16)) break;
             if (line[0]=='}') break;
             len=0;
             for (p=line; *p!='\n' && *p!='\r' && *p; ++p,++len);
@@ -443,14 +443,14 @@
 }
 
 
-static subtitle *sub_read_line_vplayer(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_vplayer(stream_t *st,subtitle *current, int utf16) {
 	char line[LINE_LEN+1];
 	int a1,a2,a3;
 	char *p=NULL, *next,separator;
 	int i,len,plen;
 
 	while (!current->text[0]) {
-		if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+		if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 		if ((len=sscanf (line, "%d:%d:%d%c%n",&a1,&a2,&a3,&separator,&plen)) < 4)
 			continue;
 
@@ -489,7 +489,7 @@
 	return current;
 }
 
-static subtitle *sub_read_line_rt(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_rt(stream_t *st,subtitle *current, int utf16) {
 	//TODO: This format uses quite rich (sub/super)set of xhtml
 	// I couldn't check it since DTD is not included.
 	// WARNING: full XML parses can be required for proper parsing
@@ -499,7 +499,7 @@
     int i,len,plen;
 
     while (!current->text[0]) {
-	if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+	if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 	//TODO: it seems that format of time is not easily determined, it may be 1:12, 1:12.0 or 0:1:12.0
 	//to describe the same moment in time. Maybe there are even more formats in use.
 	//if ((len=sscanf (line, "<Time Begin=\"%d:%d:%d.%d\" End=\"%d:%d:%d.%d\"",&a1,&a2,&a3,&a4,&b1,&b2,&b3,&b4)) < 8)
@@ -539,7 +539,7 @@
     return current;
 }
 
-static subtitle *sub_read_line_ssa(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_ssa(stream_t *st,subtitle *current, int utf16) {
 /*
  * Sub Station Alpha v4 (and v2?) scripts have 9 commas before subtitle
  * other Sub Station Alpha scripts have only 8 commas before subtitle
@@ -563,7 +563,7 @@
 	char *tmp;
 
 	do {
-		if (!stream_read_line (st, line, LINE_LEN)) return NULL;
+		if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL;
 	} while (sscanf (line, "Dialogue: Marked=%d,%d:%d:%d.%d,%d:%d:%d.%d,"
 			"%[^\n\r]", &nothing,
 			&hour1, &min1, &sec1, &hunsec1,
@@ -640,11 +640,11 @@
  *
  * by set, based on code by szabi (dunnowhat sub format ;-)
  */
-static subtitle *sub_read_line_pjs(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_pjs(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     char text[LINE_LEN+1], *s, *d;
 
-    if (!stream_read_line (st, line, LINE_LEN))
+    if (!stream_read_line (st, line, LINE_LEN, utf16))
 	return NULL;
     /* skip spaces */
     for (s=line; *s && isspace(*s); s++);
@@ -678,7 +678,7 @@
     return current;
 }
 
-static subtitle *sub_read_line_mpsub(stream_t *st, subtitle *current) {
+static subtitle *sub_read_line_mpsub(stream_t *st, subtitle *current, int utf16) {
 	char line[LINE_LEN+1];
 	float a,b;
 	int num=0;
@@ -686,7 +686,7 @@
 
 	do
 	{
-		if (!stream_read_line(st, line, LINE_LEN)) return NULL;
+		if (!stream_read_line(st, line, LINE_LEN, utf16)) return NULL;
 	} while (sscanf (line, "%f %f", &a, &b) !=2);
 
 	mpsub_position += a*mpsub_multiplier;
@@ -695,7 +695,7 @@
 	current->end=(int) mpsub_position;
 
 	while (num < SUB_MAX_TEXT) {
-		if (!stream_read_line (st, line, LINE_LEN)) {
+		if (!stream_read_line (st, line, LINE_LEN, utf16)) {
 			if (num == 0) return NULL;
 			else return current;
 		}
@@ -723,14 +723,14 @@
 subtitle *previous_aqt_sub = NULL;
 #endif
 
-static subtitle *sub_read_line_aqt(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_aqt(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     char *next;
     int i;
 
     while (1) {
     // try to locate next subtitle
-        if (!stream_read_line (st, line, LINE_LEN))
+        if (!stream_read_line (st, line, LINE_LEN, utf16))
 		return NULL;
         if (!(sscanf (line, "-->> %ld", &(current->start)) <1))
 		break;
@@ -745,14 +745,14 @@
     previous_aqt_sub = current;
 #endif
 
-    if (!stream_read_line (st, line, LINE_LEN))
+    if (!stream_read_line (st, line, LINE_LEN, utf16))
 	return NULL;
 
     sub_readtext((char *) &line,&current->text[0]);
     current->lines = 1;
     current->end = current->start; // will be corrected by next subtitle
 
-    if (!stream_read_line (st, line, LINE_LEN))
+    if (!stream_read_line (st, line, LINE_LEN, utf16))
 	return current;
 
     next = line,i=1;
@@ -780,7 +780,7 @@
 subtitle *previous_subrip09_sub = NULL;
 #endif
 
-static subtitle *sub_read_line_subrip09(stream_t *st,subtitle *current) {
+static subtitle *sub_read_line_subrip09(stream_t *st,subtitle *current, int utf16) {
     char line[LINE_LEN+1];
     int a1,a2,a3;
     char * next=NULL;
@@ -788,7 +788,7 @@
 
     while (1) {
     // try to locate next subtitle
-        if (!stream_read_line (st, line, LINE_LEN))
+        if (!stream_read_line (st, line, LINE_LEN, utf16))
 		return NULL;
         if (!((len=sscanf (line, "[%d:%d:%d]",&a1,&a2,&a3)) < 3))
 		break;
@@ -805,7 +805,7 @@
     previous_subrip09_sub = current;
 #endif
 
-    if (!stream_read_line (st, line, LINE_LEN))
+    if (!stream_read_line (st, line, LINE_LEN, utf16))
 	return NULL;
 
     next = line,i=0;
@@ -832,7 +832,7 @@
     return current;
 }
 
-static subtitle *sub_read_line_jacosub(stream_t* st, subtitle * current)
+static subtitle *sub_read_line_jacosub(stream_t* st, subtitle * current, int utf16)
 {
     char line1[LINE_LEN], line2[LINE_LEN], directive[LINE_LEN], *p, *q;
     unsigned a1, a2, a3, a4, b1, b2, b3, b4, comment = 0;
@@ -844,7 +844,7 @@
     memset(line2, 0, LINE_LEN);
     memset(directive, 0, LINE_LEN);
     while (!current->text[0]) {
-	if (!stream_read_line(st, line1, LINE_LEN)) {
+	if (!stream_read_line(st, line1, LINE_LEN, utf16)) {
 	    return NULL;
 	}
 	if (sscanf
@@ -1002,7 +1002,7 @@
 		    (*(p + 1) == '~') || (*(p + 1) == '{')) {
 		    ++p;
 		} else if (eol(*(p + 1))) {
-		    if (!stream_read_line(st, directive, LINE_LEN))
+		    if (!stream_read_line(st, directive, LINE_LEN, utf16))
 			return NULL;
 		    trail_space(directive);
 		    av_strlcat(line2, directive, LINE_LEN);
@@ -1022,14 +1022,14 @@
     return current;
 }
 
-static int sub_autodetect (stream_t* st, int *uses_time) {
+static int sub_autodetect (stream_t* st, int *uses_time, int utf16) {
     char line[LINE_LEN+1];
     int i,j=0;
     char p;
 
     while (j < 100) {
 	j++;
-	if (!stream_read_line (st, line, LINE_LEN))
+	if (!stream_read_line (st, line, LINE_LEN, utf16))
 	    return SUB_INVALID;
 
 	if (sscanf (line, "{%d}{%d}", &i, &i)==2)
@@ -1283,7 +1283,7 @@
 }
 
 struct subreader {
-    subtitle * (*read)(stream_t *st,subtitle *dest);
+    subtitle * (*read)(stream_t *st,subtitle *dest,int utf16);
     void       (*post)(subtitle *dest);
     const char *name;
 };
@@ -1350,6 +1350,7 @@
 #endif
 
 sub_data* sub_read_file (char *filename, float fps) {
+    int utf16;
     stream_t* fd;
     int n_max, n_first, i, j, sub_first, sub_orig;
     subtitle *first, *second, *sub, *return_sub, *alloced_sub = NULL;
@@ -1378,15 +1379,19 @@
     i = 0;
     fd=open_stream (filename, NULL, &i); if (!fd) return NULL;
 
-    sub_format=sub_autodetect (fd, &uses_time);
+    sub_format = SUB_INVALID;
+    for (utf16 = 0; sub_format == SUB_INVALID && utf16 < 3; utf16++) {
+        sub_format=sub_autodetect (fd, &uses_time, utf16);
+        stream_reset(fd);
+        stream_seek(fd,0);
+    }
+    utf16--;
+
     mpsub_multiplier = (uses_time ? 100.0 : 1.0);
     if (sub_format==SUB_INVALID) {mp_msg(MSGT_SUBREADER,MSGL_WARN,"SUB: Could not determine file format\n");return NULL;}
     srp=sr+sub_format;
     mp_msg(MSGT_SUBREADER, MSGL_V, "SUB: Detected subtitle file format: %s\n", srp->name);
 
-    stream_reset(fd);
-    stream_seek(fd,0);
-
 #ifdef CONFIG_ICONV
     sub_utf8_prev=sub_utf8;
     {
@@ -1430,7 +1435,7 @@
 	sub = &first[sub_num];
 #endif
 	memset(sub, '\0', sizeof(subtitle));
-        sub=srp->read(fd,sub);
+        sub=srp->read(fd,sub,utf16);
         if(!sub) break;   // EOF
 #ifdef CONFIG_ICONV
 	if ((sub!=ERR) && sub_utf8 == 2) sub=subcp_recode(sub);