diff x86/vp8dsp-init.c @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb. Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (mmx and sse2), since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents e08d65897115
children c7f6ddcc5c01
line wrap: on
line diff
--- a/x86/vp8dsp-init.c	Fri Jul 23 01:59:56 2010 +0000
+++ b/x86/vp8dsp-init.c	Fri Jul 23 02:58:27 2010 +0000
@@ -220,6 +220,8 @@
 
 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
@@ -283,6 +285,7 @@
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX) {
         c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_mmx;
+        c->vp8_idct_dc_add4                 = ff_vp8_idct_dc_add4_mmx;
         c->vp8_idct_add                     = ff_vp8_idct_add_mmx;
         c->vp8_luma_dc_wht                  = ff_vp8_luma_dc_wht_mmx;
         c->put_vp8_epel_pixels_tab[0][0][0]     =
@@ -351,6 +354,8 @@
     }
 
     if (mm_flags & FF_MM_SSE2) {
+        c->vp8_idct_dc_add4           = ff_vp8_idct_dc_add4_sse2;
+
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;