# HG changeset patch
# User darkshikari
# Date 1279750263 0
# Node ID baf13deed97e25766a2cfc8b90c9d71dffb55138
# Parent  9eef00a4328022ab400cb96ff4d931611c8c7411
Various VP8 x86 deblocking speedups
Add SSSE3 versions and improve the SSE2 versions a bit.
SSE2/SSSE3 mbedge h functions are currently broken, so explicitly disable them.

diff -r 9eef00a43280 -r baf13deed97e x86/vp8dsp-init.c
--- a/x86/vp8dsp-init.c	Wed Jul 21 20:51:01 2010 +0000
+++ b/x86/vp8dsp-init.c	Wed Jul 21 22:11:03 2010 +0000
@@ -223,64 +223,31 @@
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 
-extern void ff_vp8_v_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim);
-
-extern void ff_vp8_v_loop_filter16y_inner_mmx   (uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_sse2  (uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmx   (uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_sse2  (uint8_t *dst, int stride,
-                                                 int e, int i, int hvt);
+#define DECLARE_LOOP_FILTER(NAME)\
+extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+                                                    int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+                                                    int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+                                                    int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+                                                    int s, int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+                                                    int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+                                                    int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+                                                    int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+                                                    int s, int e, int i, int hvt);
 
-extern void ff_vp8_v_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV,
-                                                 int s, int e, int i, int hvt);
+DECLARE_LOOP_FILTER(mmx)
+DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(sse2)
+DECLARE_LOOP_FILTER(ssse3)
 
-extern void ff_vp8_v_loop_filter16y_mbedge_mmx   (uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_sse2  (uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmx   (uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_sse2  (uint8_t *dst, int stride,
-                                                  int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmx   (uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_sse2  (uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmx   (uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_sse2  (uint8_t *dstU, uint8_t *dstV,
-                                                  int s, int e, int i, int hvt);
 #endif
 
 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -384,8 +351,8 @@
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
 
-        c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext;
-        c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+        //c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
+        //c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
     }
 
     if (mm_flags & FF_MM_SSSE3) {
@@ -395,6 +362,19 @@
         VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
         VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
         VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
+
+        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
+        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
+
+        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
+        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
+        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
+        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
+
+        c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
+        //c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
+        c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
+        //c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
     }
 
     if (mm_flags & FF_MM_SSE4) {
diff -r 9eef00a43280 -r baf13deed97e x86/vp8dsp.asm
--- a/x86/vp8dsp.asm	Wed Jul 21 20:51:01 2010 +0000
+++ b/x86/vp8dsp.asm	Wed Jul 21 22:11:03 2010 +0000
@@ -1229,18 +1229,22 @@
     movd    [%7+%9*2], m%4
 %endmacro
 
-%macro SPLATB_REG 3
+%macro SPLATB_REG 3-4
     movd           %1, %2
+%ifidn %3, ssse3
+    pshufb         %1, %4          ; %4 must be all zeros: every mask byte selects byte 0 of %1
+%else
     punpcklbw      %1, %1
 %if mmsize == 16 ; sse2
-    punpcklwd      %1, %1
-    pshufd         %1, %1, 0x0
+    pshuflw        %1, %1, 0x0     ; replicate the low word into the four low words
+    punpcklqdq     %1, %1          ; copy the low qword into the high qword
 %elifidn %3, mmx
     punpcklwd      %1, %1
     punpckldq      %1, %1
 %else ; mmxext
     pshufw         %1, %1, 0x0
 %endif
+%endif
 %endmacro
 
 %macro SIMPLE_LOOPFILTER 3
@@ -1252,7 +1256,10 @@
 %if mmsize == 8 ; mmx/mmxext
     mov            r3, 2
 %endif
-    SPLATB_REG     m7, r2, %1       ; splat "flim" into register
+%ifidn %1, ssse3
+    pxor           m0, m0
+%endif
+    SPLATB_REG     m7, r2, %1, m0   ; splat "flim" into register
 
     ; set up indexes to address 4 rows
     mov            r2, r1
@@ -1398,6 +1405,8 @@
 INIT_XMM
 SIMPLE_LOOPFILTER sse2,   v, 3
 SIMPLE_LOOPFILTER sse2,   h, 6
+SIMPLE_LOOPFILTER ssse3,  v, 3
+SIMPLE_LOOPFILTER ssse3,  h, 6
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
@@ -1433,11 +1442,15 @@
 %define stack_reg   hev_thr_reg
 %endif
 
+%ifidn %1, ssse3
+    pxor             m7, m7
+%endif
+
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
-    SPLATB_REG       m0, E_reg, %1   ; E
-    SPLATB_REG       m1, I_reg, %1   ; I
-    SPLATB_REG       m2, hev_thr_reg, %1 ; hev_thresh
+    SPLATB_REG       m0, E_reg, %1, m7 ; E
+    SPLATB_REG       m1, I_reg, %1, m7 ; I
+    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh
 
     ; align stack
     mov       stack_reg, rsp         ; backup stack pointer
@@ -1470,9 +1483,9 @@
 %define q0backup m8
 
     ; splat function arguments
-    SPLATB_REG   flim_E, E_reg, %1   ; E
-    SPLATB_REG   flim_I, I_reg, %1   ; I
-    SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh
+    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
+    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
+    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
 %endif
 
 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -1884,15 +1897,15 @@
 %endmacro
 
 INIT_MMX
-INNER_LOOPFILTER mmx,    v, 6, 16, 8
-INNER_LOOPFILTER mmx,    h, 6, 16, 8
-INNER_LOOPFILTER mmxext, v, 6, 16, 8
-INNER_LOOPFILTER mmxext, h, 6, 16, 8
+INNER_LOOPFILTER mmx,    v, 6, 16, 0
+INNER_LOOPFILTER mmx,    h, 6, 16, 0
+INNER_LOOPFILTER mmxext, v, 6, 16, 0
+INNER_LOOPFILTER mmxext, h, 6, 16, 0
 
-INNER_LOOPFILTER mmx,    v, 6,  8, 8
-INNER_LOOPFILTER mmx,    h, 6,  8, 8
-INNER_LOOPFILTER mmxext, v, 6,  8, 8
-INNER_LOOPFILTER mmxext, h, 6,  8, 8
+INNER_LOOPFILTER mmx,    v, 6,  8, 0
+INNER_LOOPFILTER mmx,    h, 6,  8, 0
+INNER_LOOPFILTER mmxext, v, 6,  8, 0
+INNER_LOOPFILTER mmxext, h, 6,  8, 0
 
 INIT_XMM
 INNER_LOOPFILTER sse2,   v, 5, 16, 13
@@ -1904,6 +1917,15 @@
 INNER_LOOPFILTER sse2,   v, 6,  8, 13
 INNER_LOOPFILTER sse2,   h, 6,  8, 13
 
+INNER_LOOPFILTER ssse3,  v, 5, 16, 13
+%ifdef m8
+INNER_LOOPFILTER ssse3,  h, 5, 16, 13
+%else
+INNER_LOOPFILTER ssse3,  h, 6, 16, 13
+%endif
+INNER_LOOPFILTER ssse3,  v, 6,  8, 13
+INNER_LOOPFILTER ssse3,  h, 6,  8, 13
+
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
@@ -1984,11 +2006,15 @@
 %define stack_reg   hev_thr_reg
 %endif
 
+%ifidn %1, ssse3
+    pxor             m7, m7
+%endif
+
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
-    SPLATB_REG       m0, E_reg, %1   ; E
-    SPLATB_REG       m1, I_reg, %1   ; I
-    SPLATB_REG       m2, hev_thr_reg, %1 ; hev_thresh
+    SPLATB_REG       m0, E_reg, %1, m7 ; E
+    SPLATB_REG       m1, I_reg, %1, m7 ; I
+    SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh
 
     ; align stack
     mov       stack_reg, rsp         ; backup stack pointer
@@ -2028,9 +2054,9 @@
 %define lim_sign m15
 
     ; splat function arguments
-    SPLATB_REG   flim_E, E_reg, %1   ; E
-    SPLATB_REG   flim_I, I_reg, %1   ; I
-    SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh
+    SPLATB_REG   flim_E, E_reg, %1, m7 ; E
+    SPLATB_REG   flim_I, I_reg, %1, m7 ; I
+    SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
 %endif
 
 %if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -2521,15 +2547,15 @@
 %endmacro
 
 INIT_MMX
-MBEDGE_LOOPFILTER mmx,    v, 6, 16, 8
-MBEDGE_LOOPFILTER mmx,    h, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8
+MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
+MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
 
-MBEDGE_LOOPFILTER mmx,    v, 6,  8, 8
-MBEDGE_LOOPFILTER mmx,    h, 6,  8, 8
-MBEDGE_LOOPFILTER mmxext, v, 6,  8, 8
-MBEDGE_LOOPFILTER mmxext, h, 6,  8, 8
+MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
+MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
+MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
+MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
 
 INIT_XMM
 MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16
@@ -2540,3 +2566,12 @@
 %endif
 MBEDGE_LOOPFILTER sse2,   v, 6,  8, 16
 MBEDGE_LOOPFILTER sse2,   h, 6,  8, 16
+
+MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16
+%ifdef m8
+MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16
+MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16