changeset 4127:d011a097bb85 libavcodec

Optimize H264_DEBLOCK_P0_Q0: filter_mb_fast() is 2.5% faster on P3.
author michael
date Fri, 03 Nov 2006 02:03:56 +0000
parents 8a67fda4f0b5
children 2e93c877d264
files i386/dsputil_mmx.c i386/h264dsp_mmx.c
diffstat 2 files changed, 23 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Thu Nov 02 23:36:39 2006 +0000
+++ b/i386/dsputil_mmx.c	Fri Nov 03 02:03:56 2006 +0000
@@ -58,6 +58,8 @@
 static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
 static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
+static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
+static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
 
 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
--- a/i386/h264dsp_mmx.c	Thu Nov 02 23:36:39 2006 +0000
+++ b/i386/h264dsp_mmx.c	Fri Nov 03 02:03:56 2006 +0000
@@ -337,46 +337,27 @@
 // out: mm1=p0' mm2=q0'
 // clobbers: mm0,3-6
 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
-        /* a = q0^p0^((p1-q1)>>2) */\
-        "movq    %%mm0, %%mm4  \n\t"\
-        "psubb   %%mm3, %%mm4  \n\t"\
-        "psrlw   $2,    %%mm4  \n\t"\
-        "pxor    %%mm1, %%mm4  \n\t"\
-        "pxor    %%mm2, %%mm4  \n\t"\
-        /* b = p0^(q1>>2) */\
-        "psrlw   $2,    %%mm3  \n\t"\
-        "pand "#pb_3f", %%mm3  \n\t"\
-        "movq    %%mm1, %%mm5  \n\t"\
-        "pxor    %%mm3, %%mm5  \n\t"\
-        /* c = q0^(p1>>2) */\
-        "psrlw   $2,    %%mm0  \n\t"\
-        "pand "#pb_3f", %%mm0  \n\t"\
-        "movq    %%mm2, %%mm6  \n\t"\
-        "pxor    %%mm0, %%mm6  \n\t"\
-        /* d = (c^b) & ~(b^a) & 1 */\
-        "pxor    %%mm5, %%mm6  \n\t"\
-        "pxor    %%mm4, %%mm5  \n\t"\
-        "pandn   %%mm6, %%mm5  \n\t"\
-        "pand "#pb_01", %%mm5  \n\t"\
-        /* delta = (avg(q0, p1>>2) + (d&a))
-         *       - (avg(p0, q1>>2) + (d&~a)) */\
-        "pavgb   %%mm2, %%mm0  \n\t"\
-        "pand    %%mm5, %%mm4  \n\t"\
-        "paddusb %%mm4, %%mm0  \n\t"\
-        "pavgb   %%mm1, %%mm3  \n\t"\
-        "pxor    %%mm5, %%mm4  \n\t"\
-        "paddusb %%mm4, %%mm3  \n\t"\
-        /* p0 += clip(delta, -tc0, tc0)
-         * q0 -= clip(delta, -tc0, tc0) */\
-        "movq    %%mm0, %%mm4  \n\t"\
-        "psubusb %%mm3, %%mm0  \n\t"\
-        "psubusb %%mm4, %%mm3  \n\t"\
-        "pminub  %%mm7, %%mm0  \n\t"\
-        "pminub  %%mm7, %%mm3  \n\t"\
-        "paddusb %%mm0, %%mm1  \n\t"\
-        "paddusb %%mm3, %%mm2  \n\t"\
-        "psubusb %%mm3, %%mm1  \n\t"\
-        "psubusb %%mm0, %%mm2  \n\t"
+        "movq    %%mm1              , %%mm5 \n\t"\
+        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
+        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
+        "pcmpeqb %%mm4              , %%mm4 \n\t"\
+        "pxor    %%mm4              , %%mm3 \n\t"\
+        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
+        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2)*/\
+        "pxor    %%mm1              , %%mm4 \n\t"\
+        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
+        "pavgb   %%mm5              , %%mm3 \n\t"\
+        "paddb   %%mm4              , %%mm3 \n\t" /* d+128+33*/\
+        "pxor    %%mm6              , %%mm6 \n\t" /* 0*/\
+        "psubb   %%mm3              , %%mm6 \n\t" /* 128-33-d*/\
+        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
+        "psubusb "MANGLE(ff_pb_5F)" , %%mm6 \n\t"\
+        "pminub  %%mm7              , %%mm3 \n\t"\
+        "pminub  %%mm7              , %%mm6 \n\t"\
+        "paddusb %%mm3              , %%mm1 \n\t"\
+        "paddusb %%mm6              , %%mm2 \n\t"\
+        "psubusb %%mm6              , %%mm1 \n\t"\
+        "psubusb %%mm3              , %%mm2 \n\t"
 
 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone
 // out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )