# HG changeset patch
# User rik
# Date 1159025481 0
# Node ID e99cd69dd08e69e4f2c34259d42cea4f924828f6
# Parent  bf6bdb785567968f0cd5ad36d48cc37620d09c9e
Patch by Karolina Lindqvist <karolina.lindqvist@kramnet.se>
"This patch is the MMX optimizations for the zrmjpeg filter, which is used by
the zr2 video output driver."

With some small changes by me:
- column width=80
- kept jpeg_enc_* functions static because they confuse the current vo_zr.c
- did not include jpeg_enc.h because jpeg_enc functions are still static

diff -r bf6bdb785567 -r e99cd69dd08e libmpcodecs/vf_zrmjpeg.c
--- a/libmpcodecs/vf_zrmjpeg.c	Sat Sep 23 15:24:24 2006 +0000
+++ b/libmpcodecs/vf_zrmjpeg.c	Sat Sep 23 15:31:21 2006 +0000
@@ -31,11 +31,13 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 #include "libavcodec/mpegvideo.h"
+//#include "jpeg_enc.h" /* this file is not present yet */
 
 #undef malloc
 #undef free
 #undef realloc
 
+extern int avcodec_inited;
 
 /* some convenient #define's, is this portable enough? */
 #define VERBOSE(...) mp_msg(MSGT_DECVIDEO, MSGL_V, "vf_zrmjpeg: " __VA_ARGS__)
@@ -60,6 +62,9 @@
 	uint16_t huff_code_ac_chrominance[256];
 } MJpegContext;
 
+// The get_pixels routine to use. The real routine comes from dsputil
+static void (*get_pixels)(DCTELEM *restrict block, const uint8_t *pixels, int line_size);
+
 /* Begin excessive code duplication ************************************/
 /* Code coming from mpegvideo.c and mjpeg.c in ../libavcodec ***********/
 
@@ -75,6 +80,10 @@
 	4520,   6270,  5906,  5315,  4520,  3552,  2446,  1247
 };
 
+/*
+ * This routine is like the routine with the same name in mjpeg.c,
+ * except for some coefficient changes.
+ */
 static void convert_matrix(MpegEncContext *s, int (*qmat)[64], 
 		uint16_t (*qmat16)[2][64], const uint16_t *quant_matrix,
 		int bias, int qmin, int qmax) {
@@ -130,6 +139,9 @@
     	}
 }
 
+/*
+ * This routine is a clone of mjpeg_encode_dc
+ */
 static inline void encode_dc(MpegEncContext *s, int val, 
 		uint8_t *huff_size, uint16_t *huff_code) {
 	int mant, nbits;
@@ -142,19 +154,15 @@
 			val = -val;
 			mant--;
 		}
-        
-		/* compute the log (XXX: optimize) */
-		nbits = 0;
-		while (val != 0) {
-			val = val >> 1;
-			nbits++;
-		}
-            
+		nbits= av_log2_16bit(val) + 1;
 		put_bits(&s->pb, huff_size[nbits], huff_code[nbits]);
 		put_bits(&s->pb, nbits, mant & ((1 << nbits) - 1));
 	}
 }
 
+/*
+ * This routine is a duplicate of encode_block in mjpeg.c
+ */
 static void encode_block(MpegEncContext *s, DCTELEM *block, int n) {
 	int mant, nbits, code, i, j;
 	int component, dc, run, last_index, val;
@@ -199,12 +207,7 @@
                 		mant--;
             		}
             
-            		/* compute the log (XXX: optimize) */
-            		nbits = 0;
-			while (val != 0) {
-				val = val >> 1;
-				nbits++; 
-			}
+			nbits= av_log2_16bit(val) + 1;
 			code = (run << 4) | nbits;
 
 			put_bits(&s->pb, huff_size_ac[code], 
@@ -241,9 +244,6 @@
 	struct MpegEncContext *s;
 	int cheap_upsample;
 	int bw;
-	int y_ps;
-	int u_ps;
-	int v_ps;
 	int y_rs;
 	int u_rs;
 	int v_rs;
@@ -253,7 +253,7 @@
  * changes, it allows for black&white encoding (it skips the U and V
  * macroblocks and it outputs the huffman code for 'no change' (dc) and
  * 'all zero' (ac)) and it takes 4 macroblocks (422) instead of 6 (420) */
-static void zr_mjpeg_encode_mb(jpeg_enc_t *j) {
+static always_inline void zr_mjpeg_encode_mb(jpeg_enc_t *j) {
 
 	MJpegContext *m = j->s->mjpeg_ctx;
 
@@ -279,11 +279,58 @@
     	}
 }
 
+/*
+ * Fill the four 16-bit DCT blocks of MCU (x, y) from the 8-bit planar
+ * Y, U and V planes: blocks 0/1 = Y0/Y1, 2 = U, 3 = V (skipped if j->bw).
+ */
+static always_inline void fill_block(jpeg_enc_t *j, int x, int y,
+		unsigned char *y_data, unsigned char *u_data,
+		unsigned char *v_data)
+{
+	int i, k;
+	short int *dest;
+	unsigned char *source;
+
+	// The first Y, Y0
+	get_pixels(j->s->block[0], y*8*j->y_rs + 16*x + y_data, j->y_rs);
+	// The second Y, Y1
+	get_pixels(j->s->block[1], y*8*j->y_rs + 16*x + 8 + y_data, j->y_rs);
+
+	if (!j->bw && j->cheap_upsample) {
+		source = y * 4 * j->u_rs + 8*x + u_data;
+		dest = j->s->block[2];
+		for (i = 0; i < 4; i++) {
+			for (k = 0; k < 8; k++) {
+				dest[k] = source[k];   // First row
+				dest[k+8] = source[k]; // Duplicate to next row
+			}
+			dest += 16;
+			source += j->u_rs;
+		}
+		source = y * 4 * j->v_rs + 8*x + v_data;
+		dest = j->s->block[3];
+		for (i = 0; i < 4; i++) {
+			for (k = 0; k < 8; k++) {
+				dest[k] = source[k];
+				dest[k+8] = source[k];
+			}
+			dest += 16;
+			// V plane: step by the V row size, not the U one
+			source += j->v_rs;
+		}
+	} else if (!j->bw && !j->cheap_upsample) {
+		// U
+		get_pixels(j->s->block[2], y*8*j->u_rs + 8*x + u_data, j->u_rs);
+		// V
+		get_pixels(j->s->block[3], y*8*j->v_rs + 8*x + v_data, j->v_rs);
+	}
+}
+
 /* this function can take all kinds of YUV colorspaces
  * YV12, YVYU, UYVY. The necesary parameters must be set up by the caller
- * y_ps means "y pixel size", y_rs means "y row size".
+ * y_rs means "y row size".
  * For YUYV, for example, is u_buf = y_buf + 1, v_buf = y_buf + 3, 
- * y_ps = 2, u_ps = 4, v_ps = 4, y_rs = u_rs = v_rs.
+ * y_rs = u_rs = v_rs.
  *
  *  The actual buffers must be passed with mjpeg_encode_frame, this is
  *  to make it possible to call encode on the buffer provided by the
@@ -301,46 +348,41 @@
 /* The encoder doesn't know anything about interlacing, the halve height
  * needs to be passed and the double rowstride. Which field gets encoded
  * is decided by what buffers are passed to mjpeg_encode_frame */
-static jpeg_enc_t *jpeg_enc_init(int w, int h, int y_psize, int y_rsize, 
-		int u_psize, int u_rsize, int v_psize, int v_rsize,
+static jpeg_enc_t *jpeg_enc_init(int w, int h, int y_rsize, 
+			  int u_rsize, int v_rsize,
 		int cu, int q, int b) {
 	jpeg_enc_t *j;
 	int i = 0;
-	VERBOSE("JPEG encoder init: %dx%d %d %d %d %d %d %d\n",
-			w, h, y_psize, y_rsize, u_psize, 
-			u_rsize, v_psize, v_rsize);
+	VERBOSE("JPEG encoder init: %dx%d %d %d %d cu=%d q=%d bw=%d\n",
+			w, h, y_rsize, u_rsize, v_rsize, cu, q, b);
 
-	j = malloc(sizeof(jpeg_enc_t));
+	j = av_mallocz(sizeof(jpeg_enc_t));
 	if (j == NULL) return NULL;
 
-	j->s = malloc(sizeof(MpegEncContext));
-	memset(j->s,0x00,sizeof(MpegEncContext));
+	j->s = av_mallocz(sizeof(MpegEncContext));
 	if (j->s == NULL) {
-		free(j);
+		av_free(j);
 		return NULL;
 	}
 
 	/* info on how to access the pixels */
-	j->y_ps = y_psize; 
-	j->u_ps = u_psize; 
-	j->v_ps = v_psize;
 	j->y_rs = y_rsize; 
 	j->u_rs = u_rsize; 
 	j->v_rs = v_rsize;
 
-	j->s->width = w;
+	j->s->width = w;		// image width and height
 	j->s->height = h;
-	j->s->qscale = q;
+	j->s->qscale = q;		// Encoding quality
 
 	j->s->mjpeg_data_only_frames = 0;
 	j->s->out_format = FMT_MJPEG;
-	j->s->intra_only = 1;
-	j->s->encoding = 1;
+	j->s->intra_only = 1;		// Generate only intra pictures for jpeg
+	j->s->encoding = 1;		// Set mode to encode
 	j->s->pict_type = I_TYPE;
 	j->s->y_dc_scale = 8;
 	j->s->c_dc_scale = 8;
 
-	j->s->mjpeg_write_tables = 1;
+	j->s->mjpeg_write_tables = 1;	// setup to write tables
 	j->s->mjpeg_vsample[0] = 1;
 	j->s->mjpeg_vsample[1] = 1;
 	j->s->mjpeg_vsample[2] = 1;
@@ -351,23 +393,40 @@
 	j->cheap_upsample = cu;
 	j->bw = b;
 
+	// Is this needed?
+	/* if libavcodec is used by the decoder then we must not
+	 * initialize again, but if it is not initialized then we must
+	 * initialize it here. */
+	if (!avcodec_inited) {
+		avcodec_init();
+		avcodec_register_all();
+		avcodec_inited=1;
+	}
+
 	if (mjpeg_init(j->s) < 0) {
-		free(j->s);
-		free(j);
+		av_free(j->s);
+		av_free(j);
 		return NULL;
 	}
 
 	/* alloc bogus avctx to keep MPV_common_init from segfaulting */
-	j->s->avctx = calloc(sizeof(*j->s->avctx), 1);
-	/* Set up to encode mjpeg */
+	j->s->avctx = avcodec_alloc_context();
+	if (j->s->avctx == NULL) {
+		av_free(j->s);
+		av_free(j);
+		return NULL;
+	}
+
+	// Set the minimum set of default values that are needed
 	j->s->avctx->codec_id = CODEC_ID_MJPEG;
+	j->s->avctx->dct_algo = FF_DCT_AUTO;
+	j->s->intra_quant_bias= 1<<(QUANT_BIAS_SHIFT-1); //(a + x/2)/x
+	j->s->avctx->thread_count = 1;
 
 	/* make MPV_common_init allocate important buffers, like s->block */
-	j->s->avctx->thread_count = 1;
-
 	if (MPV_common_init(j->s) < 0) {
-		free(j->s);
-		free(j);
+		av_free(j->s);
+		av_free(j);
 		return NULL;
 	}
 
@@ -375,24 +434,28 @@
 	j->s->mb_height = j->s->height/8;
 	j->s->mb_intra = 1;
 
+	// Init q matrix
 	j->s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0];
 	for (i = 1; i < 64; i++) 
 		j->s->intra_matrix[i] = clip_uint8(
 			(ff_mpeg1_default_intra_matrix[i]*j->s->qscale) >> 3);
+	// precompute matrix
 	convert_matrix(j->s, j->s->q_intra_matrix, j->s->q_intra_matrix16, 
 			j->s->intra_matrix, j->s->intra_quant_bias, 8, 8);
+
+	get_pixels = j->s->dsp.get_pixels;
+
 	return j;
 }	
 
-static int jpeg_enc_frame(jpeg_enc_t *j, unsigned char *y_data, 
-		unsigned char *u_data, unsigned char *v_data, char *bufr) {
-	int i, k, mb_x, mb_y, overflow;
-	short int *dest;
-	unsigned char *source;
+static int jpeg_enc_frame(jpeg_enc_t *j, uint8_t *y_data,
+		   uint8_t *u_data, uint8_t *v_data, uint8_t *bufr) {
+	int mb_x, mb_y, overflow;
 	/* initialize the buffer */
 
 	init_put_bits(&j->s->pb, bufr, 1024*256);
 
+	// Emit the mjpeg header blocks
 	mjpeg_picture_header(j->s);
 
 	j->s->header_bits = put_bits_count(&j->s->pb);
@@ -403,72 +466,11 @@
 
 	for (mb_y = 0; mb_y < j->s->mb_height; mb_y++) {
 		for (mb_x = 0; mb_x < j->s->mb_width; mb_x++) {
-			/* conversion 8 to 16 bit and filling of blocks
-			 * must be mmx optimized */
-			/* fill 2 Y macroblocks and one U and one V */
-			source = mb_y * 8 * j->y_rs + 
-				16 * j->y_ps * mb_x + y_data;
-			dest = j->s->block[0];
-			for (i = 0; i < 8; i++) {
-				for (k = 0; k < 8; k++) {
-					dest[k] = source[k*j->y_ps];
-				}
-				dest += 8;
-				source += j->y_rs;
-			}
-			source = mb_y * 8 * j->y_rs + 
-				(16*mb_x + 8)*j->y_ps + y_data;
-			dest = j->s->block[1];
-			for (i = 0; i < 8; i++) {
-				for (k = 0; k < 8; k++) {
-					dest[k] = source[k*j->y_ps];
-				}
-				dest += 8;
-				source += j->y_rs;
-			}
-			if (!j->bw && j->cheap_upsample) {
-				source = mb_y*4*j->u_rs + 
-					8*mb_x*j->u_ps + u_data;
-				dest = j->s->block[2];
-				for (i = 0; i < 4; i++) {
-					for (k = 0; k < 8; k++) {
-						dest[k] = source[k*j->u_ps];
-						dest[k+8] = source[k*j->u_ps];
-					}
-					dest += 16;
-					source += j->u_rs;
-				}
-				source = mb_y*4*j->v_rs + 
-					8*mb_x*j->v_ps + v_data;
-				dest = j->s->block[3];
-				for (i = 0; i < 4; i++) {
-					for (k = 0; k < 8; k++) {
-						dest[k] = source[k*j->v_ps];
-						dest[k+8] = source[k*j->v_ps];
-					}
-					dest += 16;
-					source += j->u_rs;
-				}
-			} else if (!j->bw && !j->cheap_upsample) {
-				source = mb_y*8*j->u_rs + 
-					8*mb_x*j->u_ps + u_data;
-				dest = j->s->block[2];
-				for (i = 0; i < 8; i++) {
-					for (k = 0; k < 8; k++) 
-						dest[k] = source[k*j->u_ps];
-					dest += 8;
-					source += j->u_rs;
-				}
-				source = mb_y*8*j->v_rs + 
-					8*mb_x*j->v_ps + v_data;
-				dest = j->s->block[3];
-				for (i = 0; i < 8; i++) {
-					for (k = 0; k < 8; k++) 
-						dest[k] = source[k*j->v_ps];
-					dest += 8;
-					source += j->u_rs;
-				}
-			}
+			/*
+			 * Fill the four DCT blocks (8x8 pixels each) of
+			 * one MCU: two Y blocks, one U and one V
+			 */
+			fill_block(j, mb_x, mb_y, y_data, u_data, v_data);
 			emms_c(); /* is this really needed? */
 
 			j->s->block_last_index[0] = 
@@ -509,8 +511,8 @@
 
 static void jpeg_enc_uninit(jpeg_enc_t *j) {
 	mjpeg_close(j->s);
-	free(j->s);
-	free(j);
+	av_free(j->s);
+	av_free(j);
 }
 
 struct vf_priv_s {
@@ -654,11 +656,11 @@
 
 	priv->y_stride = width;
 	priv->c_stride = width/2;
-	priv->j = jpeg_enc_init(width, height/priv->fields, 1, 
-			priv->fields*priv->y_stride, 1, 
-			priv->fields*priv->c_stride, 1, 
-			priv->fields*priv->c_stride, 1, 
-			priv->quality, priv->bw);
+	priv->j = jpeg_enc_init(width, height/priv->fields,
+				priv->fields*priv->y_stride,
+				priv->fields*priv->c_stride,
+				priv->fields*priv->c_stride,
+				1, priv->quality, priv->bw);
 
 	if (!priv->j) return 0;
 	return vf_next_config(vf, width, height, d_width, d_height, flags,