# HG changeset patch # User rik # Date 1159025481 0 # Node ID e99cd69dd08e69e4f2c34259d42cea4f924828f6 # Parent bf6bdb785567968f0cd5ad36d48cc37620d09c9e Patch by Karolina Lindqvist "This patch is the MMX optimizations for the zrmjpeg filter, which is used by the zr2 video output driver." With some small changes by me: - column width=80 - kept jpeg_enc_* functions static because they confuse the current vo_zr.c - did not include jpeg_enc.h because jpeg_enc functions are still static diff -r bf6bdb785567 -r e99cd69dd08e libmpcodecs/vf_zrmjpeg.c --- a/libmpcodecs/vf_zrmjpeg.c Sat Sep 23 15:24:24 2006 +0000 +++ b/libmpcodecs/vf_zrmjpeg.c Sat Sep 23 15:31:21 2006 +0000 @@ -31,11 +31,13 @@ #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" #include "libavcodec/mpegvideo.h" +//#include "jpeg_enc.h" /* this file is not present yet */ #undef malloc #undef free #undef realloc +extern int avcodec_inited; /* some convenient #define's, is this portable enough? */ #define VERBOSE(...) mp_msg(MSGT_DECVIDEO, MSGL_V, "vf_zrmjpeg: " __VA_ARGS__) @@ -60,6 +62,9 @@ uint16_t huff_code_ac_chrominance[256]; } MJpegContext; +// The get_pixels routine to use. The real routine comes from dsputil +static void (*get_pixels)(DCTELEM *restrict block, const uint8_t *pixels, int line_size); + /* Begin excessive code duplication ************************************/ /* Code coming from mpegvideo.c and mjpeg.c in ../libavcodec ***********/ @@ -75,6 +80,10 @@ 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 }; +/* + * This routine is like the routine with the same name in mjpeg.c, + * except for some coefficient changes. + */ static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64], const uint16_t *quant_matrix, int bias, int qmin, int qmax) { @@ -130,6 +139,9 @@ } } +/* + * This routine is a clone of mjpeg_encode_dc + */ static inline void encode_dc(MpegEncContext *s, int val, uint8_t *huff_size, uint16_t *huff_code) { int mant, nbits; @@ -142,19 +154,15 @@ val = -val; mant--; } - - /* compute the log (XXX: optimize) */ - nbits = 0; - while (val != 0) { - val = val >> 1; - nbits++; - } - + nbits= av_log2_16bit(val) + 1; put_bits(&s->pb, huff_size[nbits], huff_code[nbits]); put_bits(&s->pb, nbits, mant & ((1 << nbits) - 1)); } } +/* + * This routine is a duplicate of encode_block in mjpeg.c + */ static void encode_block(MpegEncContext *s, DCTELEM *block, int n) { int mant, nbits, code, i, j; int component, dc, run, last_index, val; @@ -199,12 +207,7 @@ mant--; } - /* compute the log (XXX: optimize) */ - nbits = 0; - while (val != 0) { - val = val >> 1; - nbits++; - } + nbits= av_log2_16bit(val) + 1; code = (run << 4) | nbits; put_bits(&s->pb, huff_size_ac[code], @@ -241,9 +244,6 @@ struct MpegEncContext *s; int cheap_upsample; int bw; - int y_ps; - int u_ps; - int v_ps; int y_rs; int u_rs; int v_rs; @@ -253,7 +253,7 @@ * changes, it allows for black&white encoding (it skips the U and V * macroblocks and it outputs the huffman code for 'no change' (dc) and * 'all zero' (ac)) and it takes 4 macroblocks (422) instead of 6 (420) */ -static void zr_mjpeg_encode_mb(jpeg_enc_t *j) { +static always_inline void zr_mjpeg_encode_mb(jpeg_enc_t *j) { MJpegContext *m = j->s->mjpeg_ctx; @@ -279,11 +279,58 @@ } } +/* + * Taking one MCU (YUYV) from 8-bit pixel planar storage and + * filling it into four 16-bit pixel DCT macroblocks. + */ +static always_inline void fill_block(jpeg_enc_t *j, int x, int y, + unsigned char *y_data, unsigned char *u_data, + unsigned char *v_data) +{ + int i, k; + short int *dest; + unsigned char *source; + + // The first Y, Y0 + get_pixels(j->s->block[0], y*8*j->y_rs + 16*x + y_data, j->y_rs); + // The second Y, Y1 + get_pixels(j->s->block[1], y*8*j->y_rs + 16*x + 8 + y_data, j->y_rs); + + if (!j->bw && j->cheap_upsample) { + source = y * 4 * j->u_rs + 8*x + u_data; + dest = j->s->block[2]; + for (i = 0; i < 4; i++) { + for (k = 0; k < 8; k++) { + dest[k] = source[k]; // First row + dest[k+8] = source[k]; // Duplicate to next row + + } + dest += 16; + source += j->u_rs; + } + source = y * 4 * j->v_rs + 8*x + v_data; + dest = j->s->block[3]; + for (i = 0; i < 4; i++) { + for (k = 0; k < 8; k++) { + dest[k] = source[k]; + dest[k+8] = source[k]; + } + dest += 16; + source += j->u_rs; + } + } else if (!j->bw && !j->cheap_upsample) { + // U + get_pixels(j->s->block[2], y*8*j->u_rs + 8*x + u_data, j->u_rs); + // V + get_pixels(j->s->block[3], y*8*j->v_rs + 8*x + v_data, j->v_rs); + } +} + /* this function can take all kinds of YUV colorspaces * YV12, YVYU, UYVY. The necesary parameters must be set up by the caller - * y_ps means "y pixel size", y_rs means "y row size". + * y_rs means "y row size". * For YUYV, for example, is u_buf = y_buf + 1, v_buf = y_buf + 3, - * y_ps = 2, u_ps = 4, v_ps = 4, y_rs = u_rs = v_rs. + * y_rs = u_rs = v_rs. * * The actual buffers must be passed with mjpeg_encode_frame, this is * to make it possible to call encode on the buffer provided by the @@ -301,46 +348,41 @@ /* The encoder doesn't know anything about interlacing, the halve height * needs to be passed and the double rowstride. Which field gets encoded * is decided by what buffers are passed to mjpeg_encode_frame */ -static jpeg_enc_t *jpeg_enc_init(int w, int h, int y_psize, int y_rsize, - int u_psize, int u_rsize, int v_psize, int v_rsize, +static jpeg_enc_t *jpeg_enc_init(int w, int h, int y_rsize, + int u_rsize, int v_rsize, int cu, int q, int b) { jpeg_enc_t *j; int i = 0; - VERBOSE("JPEG encoder init: %dx%d %d %d %d %d %d %d\n", - w, h, y_psize, y_rsize, u_psize, - u_rsize, v_psize, v_rsize); + VERBOSE("JPEG encoder init: %dx%d %d %d %d cu=%d q=%d bw=%d\n", + w, h, y_rsize, u_rsize, v_rsize, cu, q, b); - j = malloc(sizeof(jpeg_enc_t)); + j = av_mallocz(sizeof(jpeg_enc_t)); if (j == NULL) return NULL; - j->s = malloc(sizeof(MpegEncContext)); - memset(j->s,0x00,sizeof(MpegEncContext)); + j->s = av_mallocz(sizeof(MpegEncContext)); if (j->s == NULL) { - free(j); + av_free(j); return NULL; } /* info on how to access the pixels */ - j->y_ps = y_psize; - j->u_ps = u_psize; - j->v_ps = v_psize; j->y_rs = y_rsize; j->u_rs = u_rsize; j->v_rs = v_rsize; - j->s->width = w; + j->s->width = w; // image width and height j->s->height = h; - j->s->qscale = q; + j->s->qscale = q; // Encoding quality j->s->mjpeg_data_only_frames = 0; j->s->out_format = FMT_MJPEG; - j->s->intra_only = 1; - j->s->encoding = 1; + j->s->intra_only = 1; // Generate only intra pictures for jpeg + j->s->encoding = 1; // Set mode to encode j->s->pict_type = I_TYPE; j->s->y_dc_scale = 8; j->s->c_dc_scale = 8; - j->s->mjpeg_write_tables = 1; + j->s->mjpeg_write_tables = 1; // setup to write tables j->s->mjpeg_vsample[0] = 1; j->s->mjpeg_vsample[1] = 1; j->s->mjpeg_vsample[2] = 1; @@ -351,23 +393,40 @@ j->cheap_upsample = cu; j->bw = b; + // Is this needed? + /* if libavcodec is used by the decoder then we must not + * initialize again, but if it is not initialized then we must + * initialize it here. */ + if (!avcodec_inited) { + avcodec_init(); + avcodec_register_all(); + avcodec_inited=1; + } + if (mjpeg_init(j->s) < 0) { - free(j->s); - free(j); + av_free(j->s); + av_free(j); return NULL; } /* alloc bogus avctx to keep MPV_common_init from segfaulting */ - j->s->avctx = calloc(sizeof(*j->s->avctx), 1); - /* Set up to encode mjpeg */ + j->s->avctx = avcodec_alloc_context(); + if (j->s->avctx == NULL) { + av_free(j->s); + av_free(j); + return NULL; + } + + // Set some a minimum amount of default values that are needed j->s->avctx->codec_id = CODEC_ID_MJPEG; + j->s->avctx->dct_algo = FF_DCT_AUTO; + j->s->intra_quant_bias= 1<<(QUANT_BIAS_SHIFT-1); //(a + x/2)/x + j->s->avctx->thread_count = 1; /* make MPV_common_init allocate important buffers, like s->block */ - j->s->avctx->thread_count = 1; - if (MPV_common_init(j->s) < 0) { - free(j->s); - free(j); + av_free(j->s); + av_free(j); return NULL; } @@ -375,24 +434,28 @@ j->s->mb_height = j->s->height/8; j->s->mb_intra = 1; + // Init q matrix j->s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0]; for (i = 1; i < 64; i++) j->s->intra_matrix[i] = clip_uint8( (ff_mpeg1_default_intra_matrix[i]*j->s->qscale) >> 3); + // precompute matrix convert_matrix(j->s, j->s->q_intra_matrix, j->s->q_intra_matrix16, j->s->intra_matrix, j->s->intra_quant_bias, 8, 8); + + get_pixels = j->s->dsp.get_pixels; + return j; } -static int jpeg_enc_frame(jpeg_enc_t *j, unsigned char *y_data, - unsigned char *u_data, unsigned char *v_data, char *bufr) { - int i, k, mb_x, mb_y, overflow; - short int *dest; - unsigned char *source; +static int jpeg_enc_frame(jpeg_enc_t *j, uint8_t *y_data, + uint8_t *u_data, uint8_t *v_data, uint8_t *bufr) { + int mb_x, mb_y, overflow; /* initialize the buffer */ init_put_bits(&j->s->pb, bufr, 1024*256); + // Emit the mjpeg header blocks mjpeg_picture_header(j->s); j->s->header_bits = put_bits_count(&j->s->pb); @@ -403,72 +466,11 @@ for (mb_y = 0; mb_y < j->s->mb_height; mb_y++) { for (mb_x = 0; mb_x < j->s->mb_width; mb_x++) { - /* conversion 8 to 16 bit and filling of blocks - * must be mmx optimized */ - /* fill 2 Y macroblocks and one U and one V */ - source = mb_y * 8 * j->y_rs + - 16 * j->y_ps * mb_x + y_data; - dest = j->s->block[0]; - for (i = 0; i < 8; i++) { - for (k = 0; k < 8; k++) { - dest[k] = source[k*j->y_ps]; - } - dest += 8; - source += j->y_rs; - } - source = mb_y * 8 * j->y_rs + - (16*mb_x + 8)*j->y_ps + y_data; - dest = j->s->block[1]; - for (i = 0; i < 8; i++) { - for (k = 0; k < 8; k++) { - dest[k] = source[k*j->y_ps]; - } - dest += 8; - source += j->y_rs; - } - if (!j->bw && j->cheap_upsample) { - source = mb_y*4*j->u_rs + - 8*mb_x*j->u_ps + u_data; - dest = j->s->block[2]; - for (i = 0; i < 4; i++) { - for (k = 0; k < 8; k++) { - dest[k] = source[k*j->u_ps]; - dest[k+8] = source[k*j->u_ps]; - } - dest += 16; - source += j->u_rs; - } - source = mb_y*4*j->v_rs + - 8*mb_x*j->v_ps + v_data; - dest = j->s->block[3]; - for (i = 0; i < 4; i++) { - for (k = 0; k < 8; k++) { - dest[k] = source[k*j->v_ps]; - dest[k+8] = source[k*j->v_ps]; - } - dest += 16; - source += j->u_rs; - } - } else if (!j->bw && !j->cheap_upsample) { - source = mb_y*8*j->u_rs + - 8*mb_x*j->u_ps + u_data; - dest = j->s->block[2]; - for (i = 0; i < 8; i++) { - for (k = 0; k < 8; k++) - dest[k] = source[k*j->u_ps]; - dest += 8; - source += j->u_rs; - } - source = mb_y*8*j->v_rs + - 8*mb_x*j->v_ps + v_data; - dest = j->s->block[3]; - for (i = 0; i < 8; i++) { - for (k = 0; k < 8; k++) - dest[k] = source[k*j->v_ps]; - dest += 8; - source += j->u_rs; - } - } + /* + * Fill one DCT block (8x8 pixels) from + * 2 Y macroblocks and one U and one V + */ + fill_block(j, mb_x, mb_y, y_data, u_data, v_data); emms_c(); /* is this really needed? */ j->s->block_last_index[0] = @@ -509,8 +511,8 @@ static void jpeg_enc_uninit(jpeg_enc_t *j) { mjpeg_close(j->s); - free(j->s); - free(j); + av_free(j->s); + av_free(j); } struct vf_priv_s { @@ -654,11 +656,11 @@ priv->y_stride = width; priv->c_stride = width/2; - priv->j = jpeg_enc_init(width, height/priv->fields, 1, - priv->fields*priv->y_stride, 1, - priv->fields*priv->c_stride, 1, - priv->fields*priv->c_stride, 1, - priv->quality, priv->bw); + priv->j = jpeg_enc_init(width, height/priv->fields, + priv->fields*priv->y_stride, + priv->fields*priv->c_stride, + priv->fields*priv->c_stride, + 1, priv->quality, priv->bw); if (!priv->j) return 0; return vf_next_config(vf, width, height, d_width, d_height, flags,