changeset 1647:c943c1d2d099 libavcodec

h263_v_loop_filter_mmx
author michael
date Tue, 02 Dec 2003 20:28:10 +0000
parents c3c166ead03a
children de28264c3dc3
files i386/dsputil_mmx.c
diffstat 1 files changed, 92 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/i386/dsputil_mmx.c	Mon Dec 01 20:04:14 2003 +0000
+++ b/i386/dsputil_mmx.c	Tue Dec 02 20:28:10 2003 +0000
@@ -22,6 +22,8 @@
 #include "../dsputil.h"
 #include "../simple_idct.h"
 
+extern const uint8_t ff_h263_loop_filter_strength[32]; // definition is not in this file; indexed by qscale in h263_v_loop_filter_mmx
+
 int mm_flags; /* multimedia extension flags */
 
 /* pixel operations */
@@ -34,6 +36,8 @@
 static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
 static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
 
+static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; // 0xFC in every byte: clears the low 2 bits of each byte so a following "psrlw $2" acts as a per-byte >>2
+
 #define JUMPALIGN() __asm __volatile (".balign 8"::)
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
 
@@ -465,6 +469,92 @@
         dst[i+0] += src[i+0];
 }
 
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ // MMX filter: rewrites 8 bytes in each of the 4 rows src-2*stride .. src+1*stride
+    const int strength= ff_h263_loop_filter_strength[qscale]; // table lookup; qscale is not range-checked here (table has 32 entries)
+    /* Notation below: r0..r3 = the 8-byte rows bound to %0..%3; d = (r0 - r3 + 4*(r2 - r1)) / 8, the division being applied to the magnitude. */
+    asm volatile(
+        "pxor %%mm7, %%mm7		\n\t" // mm7 = 0, used as the zero source for byte->word unpacking
+        "movq  %0, %%mm0		\n\t" // mm0 = r0
+        "movq  %0, %%mm1		\n\t" // mm1 = r0
+        "movq  %3, %%mm2		\n\t" // mm2 = r3
+        "movq  %3, %%mm3		\n\t" // mm3 = r3
+        "punpcklbw %%mm7, %%mm0		\n\t" // mm0 = low 4 bytes of r0 widened to words
+        "punpckhbw %%mm7, %%mm1		\n\t" // mm1 = high 4 bytes of r0 widened to words
+        "punpcklbw %%mm7, %%mm2		\n\t" // mm2 = low 4 bytes of r3 widened to words
+        "punpckhbw %%mm7, %%mm3		\n\t" // mm3 = high 4 bytes of r3 widened to words
+        "psubw %%mm2, %%mm0		\n\t" // mm0 = r0 - r3 (low half, words)
+        "psubw %%mm3, %%mm1		\n\t" // mm1 = r0 - r3 (high half, words)
+        "movq  %1, %%mm2		\n\t" // mm2 = r1
+        "movq  %1, %%mm3		\n\t" // mm3 = r1
+        "movq  %2, %%mm4		\n\t" // mm4 = r2
+        "movq  %2, %%mm5		\n\t" // mm5 = r2
+        "punpcklbw %%mm7, %%mm2		\n\t" // widen r1/r2 to words the same way as r0/r3
+        "punpckhbw %%mm7, %%mm3		\n\t"
+        "punpcklbw %%mm7, %%mm4		\n\t"
+        "punpckhbw %%mm7, %%mm5		\n\t"
+        "psubw %%mm2, %%mm4		\n\t" // mm4 = r2 - r1 (low half)
+        "psubw %%mm3, %%mm5		\n\t" // mm5 = r2 - r1 (high half)
+        "psllw $2, %%mm4		\n\t" // mm4 = 4*(r2 - r1)
+        "psllw $2, %%mm5		\n\t" // mm5 = 4*(r2 - r1)
+        "paddw %%mm0, %%mm4		\n\t" // mm4 = (r0 - r3) + 4*(r2 - r1), low half
+        "paddw %%mm1, %%mm5		\n\t" // mm5 = same sum, high half
+        "pxor %%mm6, %%mm6		\n\t" // mm6 = 0
+        "pcmpgtw %%mm4, %%mm6		\n\t" // mm6 = 0xFFFF in each word where the low-half sum is negative
+        "pcmpgtw %%mm5, %%mm7		\n\t" // mm7 (still 0) = same sign mask for the high half
+        "pxor %%mm6, %%mm4		\n\t" // xor with mask, then subtract mask: negates the negative words,
+        "pxor %%mm7, %%mm5		\n\t" // i.e. takes the absolute value of the sum
+        "psubw %%mm6, %%mm4		\n\t" // mm4 = abs(sum), low half
+        "psubw %%mm7, %%mm5		\n\t" // mm5 = abs(sum), high half
+        "psrlw $3, %%mm4		\n\t" // divide the magnitude by 8
+        "psrlw $3, %%mm5		\n\t"
+        "packuswb %%mm5, %%mm4		\n\t" //abs(d): both halves packed back to 8 unsigned bytes in mm4
+        "packsswb %%mm7, %%mm6		\n\t" //sign(d): word masks packed to a per-byte 0xFF/0x00 mask in mm6
+        "pxor %%mm7, %%mm7		\n\t" // mm7 = 0 again
+        "movd %4, %%mm2			\n\t" // low dword of mm2 = 2*strength
+        "punpcklbw %%mm2, %%mm2		\n\t" // three self-interleaves replicate the lowest byte
+        "punpcklbw %%mm2, %%mm2		\n\t" // into all 8 bytes of mm2
+        "punpcklbw %%mm2, %%mm2		\n\t" //2*strength
+        "psubusb %%mm4, %%mm2		\n\t" // S(2*strength - abs(d))   (S = unsigned saturation at 0)
+        "movq %%mm2, %%mm3		\n\t" // S(2*strength - abs(d))
+        "psubusb %%mm4, %%mm3		\n\t" // S(S(2*strength - abs(d)) - abs(d))
+        "psubb %%mm3, %%mm2		\n\t" // MIN(abs(d), S(2*strength - abs(d))) -> mm2, called d1 below
+        "movq %1, %%mm3			\n\t" // mm3 = r1
+        "movq %2, %%mm4			\n\t" // mm4 = r2
+        "pxor %%mm6, %%mm3		\n\t" // complement the bytes where d is negative so that the
+        "pxor %%mm6, %%mm4		\n\t" // saturating add/sub below act in the opposite direction there
+        "paddusb %%mm2, %%mm3		\n\t" // r1 side: saturating + d1
+        "psubusb %%mm2, %%mm4		\n\t" // r2 side: saturating - d1
+        "pxor %%mm6, %%mm3		\n\t" // undo the complement: mm3 = r1 + sign(d)*d1, clipped to 0..255
+        "pxor %%mm6, %%mm4		\n\t" // undo the complement: mm4 = r2 - sign(d)*d1, clipped to 0..255
+        "movq %%mm3, %1			\n\t" // store new r1
+        "movq %%mm4, %2			\n\t" // store new r2
+        "paddusb %%mm2, %%mm2		\n\t" // mm2 = 2*d1 (saturating)
+        "packsswb %%mm1, %%mm0		\n\t" // mm0 = (r0 - r3) packed to signed bytes with saturation
+        "pcmpgtb %%mm0, %%mm7		\n\t" // mm7 = per-byte sign mask of (r0 - r3)
+        "pxor %%mm7, %%mm0		\n\t" // xor with mask, then subtract mask:
+        "psubb %%mm7, %%mm0		\n\t" // mm0 = abs(r0 - r3) per byte
+        "movq %%mm0, %%mm1		\n\t" // mm1 = abs(r0 - r3)
+        "psubusb %%mm2, %%mm0		\n\t" // mm0 = S(abs(r0 - r3) - 2*d1)
+        "psubb %%mm0, %%mm1		\n\t" // mm1 = MIN(abs(r0 - r3), 2*d1)
+        "pand %5, %%mm1			\n\t" // clear the low 2 bits of every byte (ff_pb_FC) so the word shift
+        "psrlw $2, %%mm1		\n\t" // below cannot leak bits between bytes: mm1 = per-byte value >> 2
+        "pxor %%mm7, %%mm1		\n\t" // re-apply the sign of (r0 - r3):
+        "psubb %%mm7, %%mm1		\n\t" // mm1 = signed correction, called d2 below
+        "movq %0, %%mm3			\n\t" // mm3 = r0
+        "movq %3, %%mm4			\n\t" // mm4 = r3
+        "psubb %%mm1, %%mm3		\n\t" // mm3 = r0 - d2 (wrapping byte arithmetic, no saturation)
+        "paddb %%mm1, %%mm4		\n\t" // mm4 = r3 + d2 (wrapping byte arithmetic, no saturation)
+        "movq %%mm3, %0			\n\t" // store new r0
+        "movq %%mm4, %3			\n\t" // store new r3
+        
+        : "+m" (*(uint64_t*)(src - 2*stride)), // %0: r0, read and written
+          "+m" (*(uint64_t*)(src - 1*stride)), // %1: r1, read and written
+          "+m" (*(uint64_t*)(src + 0*stride)), // %2: r2, read and written
+          "+m" (*(uint64_t*)(src + 1*stride)) // %3: r3, read and written
+        : "g" (2*strength), "m"(ff_pb_FC) // %4, %5. NOTE(review): no MMX clobbers are declared and no emms is issued here - presumably handled by callers; confirm
+    );
+}
+
 #ifdef CONFIG_ENCODERS
 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
     int tmp;
@@ -1691,6 +1781,8 @@
 	c->pix_norm1 = pix_norm1_mmx;
 	c->sse[0] = sse16_mmx;
 #endif //CONFIG_ENCODERS
+
+        c->h263_v_loop_filter= h263_v_loop_filter_mmx;
         
         if (mm_flags & MM_MMXEXT) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;