comparison x86/vp8dsp-init.c @ 12241:c7f6ddcc5c01 libavcodec

VP8: optimize DC-only chroma case in the same way as luma. Add MMX idct_dc_add4uv function for this case. ~40% faster chroma idct.
author darkshikari
date Fri, 23 Jul 2010 06:02:52 +0000
parents 1a7903913e9b
children 435319d67bd8
comparison
equal deleted inserted replaced
12240:e6ade5e849c9 12241:c7f6ddcc5c01
218 HVBILIN(ssse3, 8, 8, 16) 218 HVBILIN(ssse3, 8, 8, 16)
219 HVBILIN(ssse3, 8, 16, 16) 219 HVBILIN(ssse3, 8, 16, 16)
220 220
221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); 221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); 222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
223 extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride); 223 extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
224 extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); 224 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
225 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
225 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); 226 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
226 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); 227 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
227 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); 228 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
228 229
229 #define DECLARE_LOOP_FILTER(NAME)\ 230 #define DECLARE_LOOP_FILTER(NAME)\
282 { 283 {
283 mm_flags = mm_support(); 284 mm_flags = mm_support();
284 285
285 #if HAVE_YASM 286 #if HAVE_YASM
286 if (mm_flags & FF_MM_MMX) { 287 if (mm_flags & FF_MM_MMX) {
287 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; 288 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
288 c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx; 289 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
289 c->vp8_idct_add = ff_vp8_idct_add_mmx; 290 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
290 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; 291 c->vp8_idct_add = ff_vp8_idct_add_mmx;
292 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
291 c->put_vp8_epel_pixels_tab[0][0][0] = 293 c->put_vp8_epel_pixels_tab[0][0][0] =
292 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx; 294 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
293 c->put_vp8_epel_pixels_tab[1][0][0] = 295 c->put_vp8_epel_pixels_tab[1][0][0] =
294 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; 296 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
295 297
352 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; 354 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
353 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; 355 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
354 } 356 }
355 357
356 if (mm_flags & FF_MM_SSE2) { 358 if (mm_flags & FF_MM_SSE2) {
357 c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2; 359 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
358 360
359 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; 361 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
360 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; 362 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
361 363
362 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; 364 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;