comparison vp8dsp.c @ 12241:c7f6ddcc5c01 libavcodec

VP8: optimize the DC-only chroma case in the same way as luma. Add an MMX idct_dc_add4uv function for this case, making the chroma IDCT ~40% faster.
author darkshikari
date Fri, 23 Jul 2010 06:02:52 +0000
parents 1a7903913e9b
children 2d15f62f4f8a
comparison
equal deleted inserted replaced
12240:e6ade5e849c9 12241:c7f6ddcc5c01
107 dst[3] = cm[dst[3]]; 107 dst[3] = cm[dst[3]];
108 dst += stride; 108 dst += stride;
109 } 109 }
110 } 110 }
111 111
112 static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride) 112 static void vp8_idct_dc_add4uv_c(uint8_t *dst, DCTELEM block[4][16], int stride)
113 { 113 {
114 int i, j; 114 vp8_idct_dc_add_c(dst+stride*0+0, block[0], stride);
115 for (j = 0; j < 4; j++) { 115 vp8_idct_dc_add_c(dst+stride*0+4, block[1], stride);
116 uint8_t *pix = dst+j*4; 116 vp8_idct_dc_add_c(dst+stride*4+0, block[2], stride);
117 int dc = (block[j][0] + 4) >> 3; 117 vp8_idct_dc_add_c(dst+stride*4+4, block[3], stride);
118 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; 118 }
119 block[j][0] = 0; 119
120 if (!dc) 120 static void vp8_idct_dc_add4y_c(uint8_t *dst, DCTELEM block[4][16], int stride)
121 continue; 121 {
122 for (i = 0; i < 4; i++) { 122 vp8_idct_dc_add_c(dst+ 0, block[0], stride);
123 pix[0] = cm[pix[0]]; 123 vp8_idct_dc_add_c(dst+ 4, block[1], stride);
124 pix[1] = cm[pix[1]]; 124 vp8_idct_dc_add_c(dst+ 8, block[2], stride);
125 pix[2] = cm[pix[2]]; 125 vp8_idct_dc_add_c(dst+12, block[3], stride);
126 pix[3] = cm[pix[3]];
127 pix += stride;
128 }
129 }
130 } 126 }
131 127
132 // because I like only having two parameters to pass functions... 128 // because I like only having two parameters to pass functions...
133 #define LOAD_PIXELS\ 129 #define LOAD_PIXELS\
134 int av_unused p3 = p[-4*stride];\ 130 int av_unused p3 = p[-4*stride];\
477 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ 473 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \
478 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c 474 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c
479 475
480 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) 476 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
481 { 477 {
482 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; 478 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
483 dsp->vp8_idct_add = vp8_idct_add_c; 479 dsp->vp8_idct_add = vp8_idct_add_c;
484 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; 480 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
485 dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c; 481 dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c;
482 dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_c;
486 483
487 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; 484 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
488 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; 485 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
489 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; 486 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c;
490 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; 487 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c;