changeset 12279:7fb91885433c libavcodec

Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
author rbultje
date Mon, 26 Jul 2010 21:18:19 +0000
parents da5b503f050d
children fbc6fc80e6c6
files x86/vp8dsp.asm
diffstat 1 files changed, 78 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/x86/vp8dsp.asm	Mon Jul 26 19:34:00 2010 +0000
+++ b/x86/vp8dsp.asm	Mon Jul 26 21:18:19 2010 +0000
@@ -145,6 +145,10 @@
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
+pb_27_63: times 8 db 27, 63 ; (27, 63) byte pairs: pmaddubsw against (w, 1) pairs gives 27*w + 63 per word
+pb_18_63: times 8 db 18, 63 ; (18, 63) byte pairs: pmaddubsw against (w, 1) pairs gives 18*w + 63 per word
+pb_9_63:  times 8 db  9, 63 ; ( 9, 63) byte pairs: pmaddubsw against (w, 1) pairs gives  9*w + 63 per word
+
 cextern pb_1
 cextern pw_3
 cextern pb_3
@@ -2175,11 +2179,16 @@
 %define stack_reg   hev_thr_reg
 %endif
 
+%define ssse3_or_higher 0
 %ifnidn %1, sse2
 %if mmsize == 16
+%define ssse3_or_higher 1
+%endif
+%endif
+
+%if ssse3_or_higher
     pxor             m7, m7
 %endif
-%endif
 
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
@@ -2576,7 +2585,11 @@
     paddusb          m4, m1          ; q0-f1
 
     ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
+%if ssse3_or_higher
+    mova             m7, [pb_1]
+%else
     mova             m7, [pw_63]
+%endif
 %ifdef m8
     SWAP              1, 8
 %else
@@ -2585,15 +2598,40 @@
     pxor             m0, m0
     mova             m6, m1
     pcmpgtb          m0, m1         ; which are negative
+%if ssse3_or_higher
+    punpcklbw        m6, m7         ; interleave with "1" for rounding
+    punpckhbw        m1, m7
+%else
     punpcklbw        m6, m0         ; signed byte->word
     punpckhbw        m1, m0
+%endif
     mova       lim_sign, m0
+%if ssse3_or_higher
+    mova             m7, [pb_27_63]
+%ifndef m8
+    mova        lim_res, m1
+%endif
+%ifdef m10
+    SWAP              0, 10         ; don't lose lim_sign copy
+%endif
+    mova             m0, m7
+    pmaddubsw        m7, m6
+    SWAP              6, 7
+    pmaddubsw        m0, m1
+    SWAP              1, 0
+%ifdef m10
+    SWAP              0, 10
+%else
+    mova             m0, lim_sign
+%endif
+%else
     mova       mask_res, m6         ; backup for later in filter
     mova        lim_res, m1
     pmullw          m6, [pw_27]
     pmullw          m1, [pw_27]
     paddw           m6, m7
     paddw           m1, m7
+%endif
     psraw           m6, 7
     psraw           m1, 7
     packsswb        m6, m1          ; a0
@@ -2601,18 +2639,39 @@
     psubb           m1, m6
     pand            m1, m0          ; -a0
     pandn           m0, m6          ; +a0
+%if ssse3_or_higher
+    mova            m6, [pb_18_63]  ; pipelining
+%endif
     psubusb         m3, m1
     paddusb         m4, m1
     paddusb         m3, m0          ; p0+a0
     psubusb         m4, m0          ; q0-a0
 
+%if ssse3_or_higher
+    SWAP             6, 7
+%ifdef m10
+    SWAP             1, 10
+%else
+    mova            m1, lim_res
+%endif
+    mova            m0, m7
+    pmaddubsw       m7, m6
+    SWAP             6, 7
+    pmaddubsw       m0, m1
+    SWAP             1, 0
+%ifdef m10
+    SWAP             0, 10
+%endif
+    mova            m0, lim_sign
+%else
     mova            m6, mask_res
     mova            m1, lim_res
-    mova            m0, lim_sign
     pmullw          m6, [pw_18]
     pmullw          m1, [pw_18]
     paddw           m6, m7
     paddw           m1, m7
+%endif
+    mova            m0, lim_sign
     psraw           m6, 7
     psraw           m1, 7
     packsswb        m6, m1          ; a1
@@ -2620,11 +2679,27 @@
     psubb           m1, m6
     pand            m1, m0          ; -a1
     pandn           m0, m6          ; +a1
+%if ssse3_or_higher
+    mova            m6, [pb_9_63]
+%endif
     psubusb         m2, m1
     paddusb         m5, m1
     paddusb         m2, m0          ; p1+a1
     psubusb         m5, m0          ; q1-a1
 
+%if ssse3_or_higher
+    SWAP             6, 7
+%ifdef m10
+    SWAP             1, 10
+%else
+    mova            m1, lim_res
+%endif
+    mova            m0, m7
+    pmaddubsw       m7, m6
+    SWAP             6, 7
+    pmaddubsw       m0, m1
+    SWAP             1, 0
+%else
 %ifdef m8
     SWAP             6, 12
     SWAP             1, 8
@@ -2636,6 +2711,7 @@
     pmullw          m1, [pw_9]
     paddw           m6, m7
     paddw           m1, m7
+%endif
 %ifdef m9
     SWAP             7, 9
 %else