diff x86/vp8dsp.asm @ 12235:e08d65897115 libavcodec

VP8: clear DCT blocks in iDCT instead of using clear_blocks. ~0.3% faster overall.
author darkshikari
date Fri, 23 Jul 2010 00:07:16 +0000
parents d07e6037846d
children 1a7903913e9b
line wrap: on
line diff
--- a/x86/vp8dsp.asm	Fri Jul 23 00:05:44 2010 +0000
+++ b/x86/vp8dsp.asm	Fri Jul 23 00:07:16 2010 +0000
@@ -913,6 +913,7 @@
     paddw      mm0, [pw_4]
     pxor       mm1, mm1
     psraw      mm0, 3
+    movd      [r1], mm1
     psubw      mm1, mm0
     packuswb   mm0, mm0
     packuswb   mm1, mm1
@@ -944,11 +945,12 @@
 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
     ; load data
     movd       xmm0, [r1]
-    lea          r1, [r0+r2*2]
     pxor       xmm1, xmm1
 
     ; calculate DC
     paddw      xmm0, [pw_4]
+    movd       [r1], xmm1
+    lea          r1, [r0+r2*2]
     movd       xmm2, [r0]
     movd       xmm3, [r0+r2]
     movd       xmm4, [r1]
@@ -1005,14 +1007,26 @@
 %endmacro
 
 INIT_MMX
-cglobal vp8_idct_add_mmx, 3, 3
+%macro VP8_IDCT_ADD 1
+cglobal vp8_idct_add_%1, 3, 3
     ; load block data
-    movq         m0, [r1]
-    movq         m1, [r1+8]
+    movq         m0, [r1+ 0] ; m0..m3 receive the 32 bytes at [r1]; all four loads precede the clearing stores below
+    movq         m1, [r1+ 8]
     movq         m2, [r1+16]
     movq         m3, [r1+24]
     movq         m6, [pw_20091]
     movq         m7, [pw_17734]
+%ifidn %1, sse
+    xorps      xmm0, xmm0 ; zero source for the stores; xmm0 is named explicitly, not through an mN alias
+    movaps  [r1+ 0], xmm0 ; clear bytes 0-15 of the block that was just loaded
+    movaps  [r1+16], xmm0 ; clear bytes 16-31; movaps is an aligned store - assumes r1 is 16-byte aligned, TODO confirm against callers
+%else
+    pxor         m4, m4 ; m4 = 0 (m0-m3, m6 and m7 are already occupied by the loads above)
+    movq    [r1+ 0], m4 ; four 8-byte stores clear the same 32 bytes as the sse path
+    movq    [r1+ 8], m4
+    movq    [r1+16], m4
+    movq    [r1+24], m4 ; block at [r1] is now all zero; the transform below works on the register copies
+%endif
 
     ; actual IDCT
     VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
@@ -1028,6 +1042,10 @@
     STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
 
     RET
+%endmacro
+
+VP8_IDCT_ADD mmx
+VP8_IDCT_ADD sse
 
 ;-----------------------------------------------------------------------------
 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])