Mercurial > libavcodec.hg
comparison vp8dsp.c @ 12241:c7f6ddcc5c01 libavcodec
VP8: optimize DC-only chroma case in the same way as luma.
Add MMX idct_dc_add4uv function for this case.
~40% faster chroma idct.
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 06:02:52 +0000 |
parents | 1a7903913e9b |
children | 2d15f62f4f8a |
comparison
equal
deleted
inserted
replaced
12240:e6ade5e849c9 | 12241:c7f6ddcc5c01 |
---|---|
107 dst[3] = cm[dst[3]]; | 107 dst[3] = cm[dst[3]]; |
108 dst += stride; | 108 dst += stride; |
109 } | 109 } |
110 } | 110 } |
111 | 111 |
112 static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride) | 112 static void vp8_idct_dc_add4uv_c(uint8_t *dst, DCTELEM block[4][16], int stride) |
113 { | 113 { |
114 int i, j; | 114 vp8_idct_dc_add_c(dst+stride*0+0, block[0], stride); |
115 for (j = 0; j < 4; j++) { | 115 vp8_idct_dc_add_c(dst+stride*0+4, block[1], stride); |
116 uint8_t *pix = dst+j*4; | 116 vp8_idct_dc_add_c(dst+stride*4+0, block[2], stride); |
117 int dc = (block[j][0] + 4) >> 3; | 117 vp8_idct_dc_add_c(dst+stride*4+4, block[3], stride); |
118 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; | 118 } |
119 block[j][0] = 0; | 119 |
120 if (!dc) | 120 static void vp8_idct_dc_add4y_c(uint8_t *dst, DCTELEM block[4][16], int stride) |
121 continue; | 121 { |
122 for (i = 0; i < 4; i++) { | 122 vp8_idct_dc_add_c(dst+ 0, block[0], stride); |
123 pix[0] = cm[pix[0]]; | 123 vp8_idct_dc_add_c(dst+ 4, block[1], stride); |
124 pix[1] = cm[pix[1]]; | 124 vp8_idct_dc_add_c(dst+ 8, block[2], stride); |
125 pix[2] = cm[pix[2]]; | 125 vp8_idct_dc_add_c(dst+12, block[3], stride); |
126 pix[3] = cm[pix[3]]; | |
127 pix += stride; | |
128 } | |
129 } | |
130 } | 126 } |
131 | 127 |
132 // because I like only having two parameters to pass functions... | 128 // because I like only having two parameters to pass functions... |
133 #define LOAD_PIXELS\ | 129 #define LOAD_PIXELS\ |
134 int av_unused p3 = p[-4*stride];\ | 130 int av_unused p3 = p[-4*stride];\ |
477 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ | 473 dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ |
478 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c | 474 dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c |
479 | 475 |
480 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) | 476 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) |
481 { | 477 { |
482 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; | 478 dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; |
483 dsp->vp8_idct_add = vp8_idct_add_c; | 479 dsp->vp8_idct_add = vp8_idct_add_c; |
484 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; | 480 dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; |
485 dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c; | 481 dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c; |
| | 482 dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_c; |
486 | 483 |
487 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; | 484 dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; |
488 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; | 485 dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; |
489 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; | 486 dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; |
490 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; | 487 dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; |