changeset 4133:560ea2d5524e libavcodec

move luma tc0 related init into asm; 5% faster filter_mb_fast() on P3
author michael
date Fri, 03 Nov 2006 14:28:30 +0000
parents c9e0315f9954
children 3d2887b288f4
files i386/h264dsp_mmx.c
diffstat 1 files changed, 17 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/i386/h264dsp_mmx.c	Fri Nov 03 13:13:08 2006 +0000
+++ b/i386/h264dsp_mmx.c	Fri Nov 03 14:28:30 2006 +0000
@@ -377,10 +377,7 @@
 
 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    uint64_t tmp0;
-    uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
-    // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
-    uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
+    uint64_t tmp0[2];
 
     asm volatile(
         "movq    (%1,%3), %%mm0    \n\t" //p1
@@ -388,8 +385,16 @@
         "movq    (%2),    %%mm2    \n\t" //q0
         "movq    (%2,%3), %%mm3    \n\t" //q1
         H264_DEBLOCK_MASK(%6, %7)
-        "pand     %5,     %%mm7    \n\t"
-        "movq     %%mm7,  %0       \n\t"
+
+        "movd      %5,    %%mm4    \n\t"
+        "punpcklbw %%mm4, %%mm4    \n\t"
+        "punpcklwd %%mm4, %%mm4    \n\t"
+        "pcmpeqb   %%mm3, %%mm3    \n\t"
+        "movq      %%mm4, %%mm6    \n\t"
+        "pcmpgtb   %%mm3, %%mm4    \n\t"
+        "movq      %%mm6, 8+%0     \n\t"
+        "pand      %%mm4, %%mm7    \n\t"
+        "movq      %%mm7, %0       \n\t"
 
         /* filter p1 */
         "movq     (%1),   %%mm3    \n\t" //p2
@@ -397,7 +402,7 @@
         "pandn    %%mm7,  %%mm6    \n\t"
         "pcmpeqb  %%mm7,  %%mm6    \n\t"
         "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
-        "pshufw  $80, %4, %%mm4    \n\t"
+        "movq     8+%0,   %%mm4    \n\t"
         "pand     %%mm7,  %%mm4    \n\t" // mask & tc0
         "movq     %%mm4,  %%mm7    \n\t"
         "psubb    %%mm6,  %%mm7    \n\t"
@@ -410,21 +415,21 @@
         "pandn    %0,     %%mm6    \n\t"
         "pcmpeqb  %0,     %%mm6    \n\t"
         "pand     %0,     %%mm6    \n\t"
-        "pshufw  $80, %4, %%mm5    \n\t"
+        "movq     8+%0,   %%mm5    \n\t"
         "pand     %%mm6,  %%mm5    \n\t"
         "psubb    %%mm6,  %%mm7    \n\t"
         "movq    (%2,%3), %%mm3    \n\t"
         H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
 
         /* filter p0, q0 */
-        H264_DEBLOCK_P0_Q0(%8, %9)
+        H264_DEBLOCK_P0_Q0(%8, unused)
         "movq      %%mm1, (%1,%3,2) \n\t"
         "movq      %%mm2, (%2)      \n\t"
 
-        : "=m"(tmp0)
+        : "=m"(*tmp0)
         : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
-          "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
-          "m"(mm_bone), "m"(ff_pb_3F)
+          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
+          "m"(mm_bone)
     );
 }