changeset 10430:12c8175d6db5 libavcodec

SIMD add_hfyu_left_prediction: 2.2x faster than C on Conroe, 3.6x on Penryn; 4-6% faster Huffyuv decoding when using left or plane mode with YUV input.
author lorenm
date Sun, 18 Oct 2009 20:10:10 +0000
parents 289dd8daf4ee
children 546b7ebeaf07
files dsputil.h x86/dsputil_mmx.c x86/dsputil_yasm.asm x86/x86inc.asm
diffstat 4 files changed, 84 insertions(+), 1 deletions(-)
--- a/dsputil.h	Sun Oct 18 19:51:18 2009 +0000
+++ b/dsputil.h	Sun Oct 18 20:10:10 2009 +0000
@@ -349,7 +349,7 @@
      */
     void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
     void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
-    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int acc);
+    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
     void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue);
     /* this might write to dst[w] */
     void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
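
For context, the function being accelerated is the byte-wise left (previous-pixel) prediction used by Huffyuv's left and plane modes: each output byte is the running sum of the decoded residuals, and the final accumulator is returned so it can be carried into the next call. A minimal scalar sketch of that contract, not the exact C implementation in dsputil.c:

    #include <stdint.h>

    /* Scalar model of add_hfyu_left_prediction():
     * dst[i] = (left += src[i]) & 0xff, returning the updated accumulator. */
    static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
                                            int w, int left)
    {
        int i;
        for (i = 0; i < w; i++) {
            left = (left + src[i]) & 0xff;
            dst[i] = left;
        }
        return left;
    }
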
--- a/x86/dsputil_mmx.c	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/dsputil_mmx.c	Sun Oct 18 20:10:10 2009 +0000
@@ -2385,6 +2385,8 @@
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top);
+int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, uint8_t *src, int w, int left);
+int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, uint8_t *src, int w, int left);
 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
@@ -2951,6 +2953,11 @@
             c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
             c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+#if HAVE_YASM
+            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
+            if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
+                c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
+#endif
         }
 #endif
 
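
The SSSE3 entry is installed whenever SSSE3 is present; the SSE4-flagged entry then replaces it on CPUs that also report SSE4, which as the comment notes is really a Conroe-versus-Penryn pshufb-speed distinction rather than a genuine SSE4 instruction requirement. A hypothetical caller, sketching how a Huffyuv-style left-mode plane decode might use the dispatched pointer (names other than add_hfyu_left_prediction are illustrative, not the actual huffyuv.c code):

    /* Hypothetical sketch: residuals for one plane have already been
     * Huffman-decoded into "residual"; "left" is carried across rows and
     * updated by the dispatched prediction function. */
    static void decode_plane_left(DSPContext *dsp, uint8_t *dst, int linesize,
                                  const uint8_t *residual, int width,
                                  int height, int left)
    {
        int y;
        for (y = 0; y < height; y++) {
            left = dsp->add_hfyu_left_prediction(dst + y * linesize,
                                                 residual + y * width,
                                                 width, left);
        }
    }
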
--- a/x86/dsputil_yasm.asm	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/dsputil_yasm.asm	Sun Oct 18 20:10:10 2009 +0000
@@ -21,6 +21,13 @@
 
 %include "x86inc.asm"
 
+SECTION_RODATA
+pb_f: times 16 db 15
+pb_zzzzzzzz77777777: times 8 db -1
+pb_7: times 8 db 7
+pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
+pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+
 section .text align=16
 
 %macro PSWAPD_SSE 2
@@ -150,3 +157,70 @@
     movzx   r2d, byte [topq-1]
     mov [left_topq], r2d
     RET
+
+
+%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    mova    m1, [srcq+wq]
+    mova    m2, m1
+    psllw   m1, 8
+    paddb   m1, m2
+    mova    m2, m1
+    pshufb  m1, m3
+    paddb   m1, m2
+    pshufb  m0, m5
+    mova    m2, m1
+    pshufb  m1, m4
+    paddb   m1, m2
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m6
+    paddb   m1, m2
+%endif
+    paddb   m0, m1
+%if %1
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1
+    sub     eax, wd
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+; int ff_add_hfyu_left_prediction(uint8_t *dst, uint8_t *src, int w, int left)
+INIT_MMX
+cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
+.skip_prologue:
+    mova    m5, [pb_7 GLOBAL]
+    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
+    movd    m0, leftm
+    psllq   m0, 56
+    ADD_HFYU_LEFT_LOOP 1
+
+INIT_XMM
+cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
+    mova    m5, [pb_f GLOBAL]
+    mova    m6, [pb_zzzzzzzz77777777 GLOBAL]
+    mova    m4, [pb_zzzz3333zzzzbbbb GLOBAL]
+    mova    m3, [pb_zz11zz55zz99zzdd GLOBAL]
+    movd    m0, leftm
+    pslldq  m0, 15
+    test    srcq, 15
+    jnz ff_add_hfyu_left_prediction_ssse3 %+ .skip_prologue
+    test    dstq, 15
+    jnz .unaligned
+    ADD_HFYU_LEFT_LOOP 1
+.unaligned:
+    ADD_HFYU_LEFT_LOOP 0
+
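
For readers decoding the shuffle constants: each loop iteration loads 8 (MMX) or 16 (SSE) residual bytes and turns them into running sums in log2(vector width) steps. psllw+paddb handles distance 1 within byte pairs, the pshufb masks then propagate the partial totals across 4-, 8- and, for XMM, 16-byte groups, and the previous vector's last output byte, kept broadcast in m0 via pb_7/pb_f, is added to every lane; the epilogue extracts the last valid output byte as the return value. A scalar model of that log-step prefix sum over one vector, illustrative only, with all arithmetic mod 256 as with paddb:

    #include <stdint.h>

    /* In-place log-step (Hillis-Steele style) prefix sum over one vector of
     * n bytes (n = 8 for MMX, 16 for SSE): after all passes, v[i] holds the
     * sum of v[0..i] modulo 256. */
    static void prefix_sum_bytes(uint8_t *v, int n)
    {
        int step, i;
        for (step = 1; step < n; step <<= 1)
            for (i = n - 1; i >= step; i--)
                v[i] = (uint8_t)(v[i] + v[i - step]);
    }
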
--- a/x86/x86inc.asm	Sun Oct 18 19:51:18 2009 +0000
+++ b/x86/x86inc.asm	Sun Oct 18 20:10:10 2009 +0000
@@ -221,6 +221,7 @@
             CAT_UNDEF arg_name %+ %%i, d
             CAT_UNDEF arg_name %+ %%i, w
             CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
             CAT_UNDEF arg_name, %%i
             %assign %%i %%i+1
         %endrep
@@ -232,6 +233,7 @@
         %xdefine %1d r %+ %%i %+ d
         %xdefine %1w r %+ %%i %+ w
         %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
         CAT_XDEFINE arg_name, %%i, %1
         %assign %%i %%i+1
         %rotate 1
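
The x86inc.asm hunk adds an "m" form to the named-argument defines, so leftm above expands to r3m: the argument's original location, which x86inc resolves to a stack slot on x86_32 and, for the first few arguments, to the corresponding argument register on x86_64. That is what lets the new functions be declared with only three loaded GPR arguments (3,3,7) yet still read the fourth argument directly with movd, without spending a general-purpose register on it.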