# HG changeset patch # User melanson # Date 1082938829 0 # Node ID 89422281f6f6fc9086782053d716c4518adfc3bc # Parent 5dfde318d44af76c0fa165b6e1696c40d3f45aca reorganize and simplify the VP3 IDCT stuff diff -r 5dfde318d44a -r 89422281f6f6 dsputil.c --- a/dsputil.c Sun Apr 25 19:03:35 2004 +0000 +++ b/dsputil.c Mon Apr 26 00:20:29 2004 +0000 @@ -3126,8 +3126,7 @@ /* VP3 DSP support */ c->vp3_dsp_init = vp3_dsp_init_c; - c->vp3_idct_put = vp3_idct_put_c; - c->vp3_idct_add = vp3_idct_add_c; + c->vp3_idct = vp3_idct_c; c->get_pixels = get_pixels_c; c->diff_pixels = diff_pixels_c; diff -r 5dfde318d44a -r 89422281f6f6 dsputil.h --- a/dsputil.h Sun Apr 25 19:03:35 2004 +0000 +++ b/dsputil.h Mon Apr 26 00:20:29 2004 +0000 @@ -62,23 +62,16 @@ /* VP3 DSP functions */ void vp3_dsp_init_c(void); -void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); +void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); void vp3_dsp_init_mmx(void); -void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); +void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); void vp3_dsp_init_sse2(void); -void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); -void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); - +void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_data); /* minimum alignment rules ;) if u notice errors in the align stuff, need more alignment for some asm code for some cpu @@ -318,32 +311,16 @@ /** * This function is responsible for taking a block of zigzag'd, - * quantized DCT coefficients, reconstructing the original block of - * samples, and placing it into the output. + * quantized DCT coefficients and reconstructing the original block of + * samples. * @param input_data 64 zigzag'd, quantized DCT coefficients * @param dequant_matrix 64 zigzag'd quantizer coefficients * @param coeff_count index of the last coefficient - * @param dest the final output location where the transformed samples - * are to be placed - * @param stride the width in 8-bit samples of a line on this plane + * @param output_samples space for 64 DCTELEMs where the transformed + * samples will be stored */ - void (*vp3_idct_put)(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); - - /** - * This function is responsible for taking a block of zigzag'd, - * quantized DCT coefficients, reconstructing the original block of - * samples, and adding the transformed samples to an existing block of - * samples in the output. - * @param input_data 64 zigzag'd, quantized DCT coefficients - * @param dequant_matrix 64 zigzag'd quantizer coefficients - * @param coeff_count index of the last coefficient - * @param dest the final output location where the transformed samples - * are to be placed - * @param stride the width in 8-bit samples of a line on this plane - */ - void (*vp3_idct_add)(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride); + void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, DCTELEM *output_samples); } DSPContext; diff -r 5dfde318d44a -r 89422281f6f6 i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Sun Apr 25 19:03:35 2004 +0000 +++ b/i386/dsputil_mmx.c Mon Apr 26 00:20:29 2004 +0000 @@ -2149,14 +2149,12 @@ /* VP3 optimized DSP functions */ if (mm_flags & MM_SSE2) { c->vp3_dsp_init = vp3_dsp_init_sse2; - c->vp3_idct_put = vp3_idct_put_sse2; - c->vp3_idct_add = vp3_idct_add_sse2; + c->vp3_idct = vp3_idct_sse2; } else { c->vp3_dsp_init = vp3_dsp_init_mmx; - c->vp3_idct_put = vp3_idct_put_mmx; - c->vp3_idct_add = vp3_idct_add_mmx; + c->vp3_idct = vp3_idct_mmx; } - + #ifdef CONFIG_ENCODERS c->get_pixels = get_pixels_mmx; c->diff_pixels = diff_pixels_mmx; diff -r 5dfde318d44a -r 89422281f6f6 i386/vp3dsp_mmx.c --- a/i386/vp3dsp_mmx.c Sun Apr 25 19:03:35 2004 +0000 +++ b/i386/vp3dsp_mmx.c Mon Apr 26 00:20:29 2004 +0000 @@ -279,8 +279,8 @@ idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift; } -static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, - int16_t *output_data) +void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { /* eax = quantized input * ebx = dequantizer matrix @@ -563,79 +563,3 @@ #undef J } - -void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - - vp3_idct_mmx(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - movq_m2r(*vector128, mm0); - for (i = 0; i < 8; i++) { -#if 1 - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); -#else -/* prototype optimization */ - pxor_r2r(mm1, mm1); - packsswb_m2r(*(op + 4), mm1); - movq_r2r(mm1, mm2); - psrlq_i2r(32, mm2); - packsswb_m2r(*(op + 0), mm1); - op += 8; - por_r2r(mm2, mm1); - paddb_r2r(mm0, mm1); - movq_r2m(mm1, *dest); - dest += stride; -#endif - } - - /* be a good MMX citizen */ - emms(); -} - -void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - vp3_idct_mmx(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } - - /* be a good MMX citizen */ - emms(); -} diff -r 5dfde318d44a -r 89422281f6f6 i386/vp3dsp_sse2.c --- a/i386/vp3dsp_sse2.c Sun Apr 25 19:03:35 2004 +0000 +++ b/i386/vp3dsp_sse2.c Mon Apr 26 00:20:29 2004 +0000 @@ -799,11 +799,12 @@ void vp3_dsp_init_sse2(void) { /* nop */ +av_log(NULL, AV_LOG_INFO, "Hey! SSE2!\n"); } -static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, - int16_t *output_data) +void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { unsigned char *input_bytes = (unsigned char *)input_data; unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix; @@ -832,59 +833,3 @@ SSE2_Column_IDCT(); } - - -void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - - vp3_idct_sse2(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); - } -} - - -void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - vp3_idct_sse2(input_data, dequant_matrix, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } -} diff -r 5dfde318d44a -r 89422281f6f6 vp3.c --- a/vp3.c Sun Apr 25 19:03:35 2004 +0000 +++ b/vp3.c Mon Apr 26 00:20:29 2004 +0000 @@ -2051,6 +2051,7 @@ int m, n; int i = first_fragment; int16_t *dequantizer; + DCTELEM __align16 output_samples[64]; unsigned char *output_plane; unsigned char *last_plane; unsigned char *golden_plane; @@ -2060,6 +2061,10 @@ int motion_halfpel_index; uint8_t *motion_source; + int16_t *op; + uint8_t *dest; + int j, k; + debug_vp3(" vp3: rendering final fragments for %s\n", (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane"); @@ -2176,16 +2181,29 @@ s->all_fragments[i].coeffs[0], dequantizer[0]); /* invert DCT and place (or add) in final output */ + s->dsp.vp3_idct(s->all_fragments[i].coeffs, + dequantizer, + s->all_fragments[i].coeff_count, + output_samples); if (s->all_fragments[i].coding_method == MODE_INTRA) { - s->dsp.vp3_idct_put(s->all_fragments[i].coeffs, - dequantizer, - s->all_fragments[i].coeff_count, - output_plane + s->all_fragments[i].first_pixel, - stride); + /* this really needs to be optimized sooner or later */ + op = output_samples; + dest = output_plane + s->all_fragments[i].first_pixel; + for (j = 0; j < 8; j++) { + for (k = 0; k < 8; k++) { + if (*op < -128) + *dest = 0; + else if (*op > 127) + *dest = 255; + else + *dest = (uint8_t)(*op + 128); + op++; + dest++; + } + dest += (stride - 8); + } } else { - s->dsp.vp3_idct_add(s->all_fragments[i].coeffs, - dequantizer, - s->all_fragments[i].coeff_count, + s->dsp.add_pixels_clamped(output_samples, output_plane + s->all_fragments[i].first_pixel, stride); } diff -r 5dfde318d44a -r 89422281f6f6 vp3dsp.c --- a/vp3dsp.c Sun Apr 25 19:03:35 2004 +0000 +++ b/vp3dsp.c Mon Apr 26 00:20:29 2004 +0000 @@ -40,8 +40,10 @@ /* nop */ } -static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data) +void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix, + int coeff_count, int16_t *output_data) { + int32_t dequantized_data[64]; int32_t *ip = dequantized_data; int16_t *op = output_data; @@ -49,7 +51,13 @@ int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd; int32_t t1, t2; - int i; + int i, j; + + /* de-zigzag and dequantize */ + for (i = 0; i < coeff_count; i++) { + j = dezigzag_index[i]; + dequantized_data[j] = dequant_matrix[i] * input_data[i]; + } /* Inverse DCT on the rows now */ for (i = 0; i < 8; i++) { @@ -248,71 +256,3 @@ op++; } } - -void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int32_t dequantized_data[64]; - int16_t transformed_data[64]; - int16_t *op; - int i, j; - - /* de-zigzag and dequantize */ - for (i = 0; i < coeff_count; i++) { - j = dezigzag_index[i]; - dequantized_data[j] = dequant_matrix[i] * input_data[i]; - } - - vp3_idct_c(dequantized_data, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - if (*op < -128) - *dest = 0; - else if (*op > 127) - *dest = 255; - else - *dest = (uint8_t)(*op + 128); - op++; - dest++; - } - dest += (stride - 8); - } -} - -void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix, - int coeff_count, uint8_t *dest, int stride) -{ - int32_t dequantized_data[64]; - int16_t transformed_data[64]; - int16_t *op; - int i, j; - int16_t sample; - - /* de-zigzag and dequantize */ - for (i = 0; i < coeff_count; i++) { - j = dezigzag_index[i]; - dequantized_data[j] = dequant_matrix[i] * input_data[i]; - } - - vp3_idct_c(dequantized_data, transformed_data); - - /* place in final output */ - op = transformed_data; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - sample = *dest + *op; - if (sample < 0) - *dest = 0; - else if (sample > 255) - *dest = 255; - else - *dest = (uint8_t)(sample & 0xFF); - op++; - dest++; - } - dest += (stride - 8); - } -}