Mercurial > libavcodec.hg
comparison x86/vp8dsp-init.c @ 12241:c7f6ddcc5c01 libavcodec
VP8: optimize DC-only chroma case in the same way as luma.
Add MMX idct_dc_add4uv function for this case.
~40% faster chroma idct.
| author | darkshikari |
|---|---|
| date | Fri, 23 Jul 2010 06:02:52 +0000 |
| parents | 1a7903913e9b |
| children | 435319d67bd8 |
comparison
equal
deleted
inserted
replaced
| 12240:e6ade5e849c9 | 12241:c7f6ddcc5c01 |
|---|---|
218 HVBILIN(ssse3, 8, 8, 16) | 218 HVBILIN(ssse3, 8, 8, 16) |
219 HVBILIN(ssse3, 8, 16, 16) | 219 HVBILIN(ssse3, 8, 16, 16) |
220 | 220 |
221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | 221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); |
222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | 222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); |
223 extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride); | 223 extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride); |
224 extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); | 224 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); |
225 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); | |
225 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | 226 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); |
226 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | 227 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); |
227 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); | 228 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); |
228 | 229 |
229 #define DECLARE_LOOP_FILTER(NAME)\ | 230 #define DECLARE_LOOP_FILTER(NAME)\ |
282 { | 283 { |
283 mm_flags = mm_support(); | 284 mm_flags = mm_support(); |
284 | 285 |
285 #if HAVE_YASM | 286 #if HAVE_YASM |
286 if (mm_flags & FF_MM_MMX) { | 287 if (mm_flags & FF_MM_MMX) { |
287 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; | 288 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; |
288 c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx; | 289 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx; |
289 c->vp8_idct_add = ff_vp8_idct_add_mmx; | 290 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; |
290 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; | 291 c->vp8_idct_add = ff_vp8_idct_add_mmx; |
292 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; | |
291 c->put_vp8_epel_pixels_tab[0][0][0] = | 293 c->put_vp8_epel_pixels_tab[0][0][0] = |
292 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx; | 294 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx; |
293 c->put_vp8_epel_pixels_tab[1][0][0] = | 295 c->put_vp8_epel_pixels_tab[1][0][0] = |
294 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; | 296 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; |
295 | 297 |
352 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; | 354 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; |
353 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; | 355 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; |
354 } | 356 } |
355 | 357 |
356 if (mm_flags & FF_MM_SSE2) { | 358 if (mm_flags & FF_MM_SSE2) { |
357 c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2; | 359 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; |
358 | 360 |
359 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | 361 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; |
360 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | 362 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; |
361 | 363 |
362 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; | 364 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; |