comparison x86/vp8dsp-init.c @ 12334:435319d67bd8 libavcodec

Use word-writing instead of dword-writing (with two cached but otherwise unchanged bytes) in the horizontal simple loopfilter. This makes the filter itself quite a bit faster (~30 cycles fewer on Core1), probably mostly because we no longer need a complex 4x4 transpose, only a simple byte interleave. It also allows using pextrw on SSE4, which speeds the filter up even more (e.g. 25% faster on Core i7).
author rbultje
date Sat, 31 Jul 2010 23:13:15 +0000
parents c7f6ddcc5c01
children 2d15f62f4f8a
comparison
equal deleted inserted replaced
12333:97219f0fa018 12334:435319d67bd8
344 VP8_MC_FUNC(1, 8, sse2); 344 VP8_MC_FUNC(1, 8, sse2);
345 VP8_BILINEAR_MC_FUNC(0, 16, sse2); 345 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
346 VP8_BILINEAR_MC_FUNC(1, 8, sse2); 346 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
347 347
348 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; 348 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
349 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
350 349
351 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; 350 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
352 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; 351 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
353 352
354 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; 353 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
355 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; 354 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
356 } 355 }
357 356
358 if (mm_flags & FF_MM_SSE2) { 357 if (mm_flags & FF_MM_SSE2) {
359 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; 358 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
359
360 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
360 361
361 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; 362 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
362 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; 363 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
363 364
364 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; 365 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
388 } 389 }
389 390
390 if (mm_flags & FF_MM_SSE4) { 391 if (mm_flags & FF_MM_SSE4) {
391 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; 392 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
392 393
394 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
393 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; 395 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
394 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; 396 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
395 } 397 }
396 #endif 398 #endif
397 } 399 }