Mercurial > libavcodec.hg
comparison x86/vp8dsp-init.c @ 12334:435319d67bd8 libavcodec
Use word-writing instead of dword-writing (with two cached but otherwise
unchanged bytes) in the horizontal simple loopfilter. This makes the filter
quite a bit faster in itself (~30 cycles less on Core1), probably mostly
because we don't need a complex 4x4 transpose, but only a simple byte
interleave. Also allows using pextrw on SSE4, which speeds up even more
(e.g. 25% faster on Core i7).
author | rbultje |
---|---|
date | Sat, 31 Jul 2010 23:13:15 +0000 |
parents | c7f6ddcc5c01 |
children | 2d15f62f4f8a |
comparison
equal
deleted
inserted
replaced
12333:97219f0fa018 | 12334:435319d67bd8 |
---|---|
344 VP8_MC_FUNC(1, 8, sse2); | 344 VP8_MC_FUNC(1, 8, sse2); |
345 VP8_BILINEAR_MC_FUNC(0, 16, sse2); | 345 VP8_BILINEAR_MC_FUNC(0, 16, sse2); |
346 VP8_BILINEAR_MC_FUNC(1, 8, sse2); | 346 VP8_BILINEAR_MC_FUNC(1, 8, sse2); |
347 | 347 |
348 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; | 348 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; |
349 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; | |
350 | 349 |
351 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; | 350 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; |
352 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; | 351 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; |
353 | 352 |
354 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; | 353 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; |
355 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; | 354 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; |
356 } | 355 } |
357 | 356 |
358 if (mm_flags & FF_MM_SSE2) { | 357 if (mm_flags & FF_MM_SSE2) { |
359 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; | 358 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; |
| | 359 |
| | 360 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; |
360 | 361 |
361 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | 362 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; |
362 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | 363 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; |
363 | 364 |
364 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; | 365 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; |
388 } | 389 } |
389 | 390 |
390 if (mm_flags & FF_MM_SSE4) { | 391 if (mm_flags & FF_MM_SSE4) { |
391 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; | 392 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; |
392 | 393 |
| | 394 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; |
393 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; | 395 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; |
394 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; | 396 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; |
395 } | 397 } |
396 #endif | 398 #endif |
397 } | 399 } |