comparison vp8dsp.c @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb. Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents e08d65897115
children c7f6ddcc5c01
comparison
equal deleted inserted replaced
12237:f0c4dc49c8f1 12238:1a7903913e9b
107 dst[3] = cm[dst[3]]; 107 dst[3] = cm[dst[3]];
108 dst += stride; 108 dst += stride;
109 } 109 }
110 } 110 }
111 111
112 static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride) // C version of the 4-at-a-time DC-only add: handles block[0..3], each applied to a 4x4 pixel area of dst
113 {
114 int i, j;
115 for (j = 0; j < 4; j++) { // one iteration per coefficient block
116 uint8_t *pix = dst+j*4; // block j covers the 4 pixel columns starting at dst + 4*j
117 int dc = (block[j][0] + 4) >> 3; // add 4 then shift right by 3: coefficient 0 divided by 8 with rounding
118 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; // table pointer pre-offset by dc, so cm[p] reads ff_cropTbl[MAX_NEG_CROP + dc + p]; presumably p + dc clamped to the uint8_t range (table is defined outside this view -- confirm)
119 block[j][0] = 0; // coefficient 0 is cleared unconditionally, even when dc rounds to zero
120 if (!dc) // a zero dc would leave every pixel unchanged, so the pixel loop is skipped
121 continue;
122 for (i = 0; i < 4; i++) { // 4 rows of the 4x4 area
123 pix[0] = cm[pix[0]];
124 pix[1] = cm[pix[1]];
125 pix[2] = cm[pix[2]];
126 pix[3] = cm[pix[3]];
127 pix += stride; // step to the same columns in the next row
128 }
129 }
130 }
112 131
113 // because I like only having two parameters to pass functions... 132 // because I like only having two parameters to pass functions...
114 #define LOAD_PIXELS\ 133 #define LOAD_PIXELS\
115 int av_unused p3 = p[-4*stride];\ 134 int av_unused p3 = p[-4*stride];\
116 int av_unused p2 = p[-3*stride];\ 135 int av_unused p2 = p[-3*stride];\
458 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ 477 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \
459 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c 478 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c
460 479
461 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) 480 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
462 { 481 {
463 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; 482 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
464 dsp->vp8_idct_add = vp8_idct_add_c; 483 dsp->vp8_idct_add = vp8_idct_add_c;
465 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; 484 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
485 dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
466 486
467 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; 487 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
468 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; 488 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
469 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; 489 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c;
470 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; 490 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c;