diff i386/h264dsp_mmx.c @ 3105:2d35fb3cb940 libavcodec

h264: special case dc-only idct. ~1% faster overall
author lorenm
date Fri, 10 Feb 2006 06:55:25 +0000
parents fcc2892eeab3
children 5b6d0dd37ca7
line wrap: on
line diff
--- a/i386/h264dsp_mmx.c	Fri Feb 10 01:19:54 2006 +0000
+++ b/i386/h264dsp_mmx.c	Fri Feb 10 06:55:25 2006 +0000
@@ -104,6 +104,87 @@
     );
 }
 
+void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    asm volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pxor       %%mm7, %%mm7 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t"
+        "pmaxsw     %%mm7, %%mm0 \n\t"
+        "pmaxsw     %%mm7, %%mm1 \n\t"
+        "packuswb   %%mm0, %%mm0 \n\t"
+        "packuswb   %%mm1, %%mm1 \n\t"
+        ::"r"(dc)
+    );
+    asm volatile(
+        "movd          %0, %%mm2 \n\t"
+        "movd          %1, %%mm3 \n\t"
+        "movd          %2, %%mm4 \n\t"
+        "movd          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t"
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t"
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movd       %%mm2, %0    \n\t"
+        "movd       %%mm3, %1    \n\t"
+        "movd       %%mm4, %2    \n\t"
+        "movd       %%mm5, %3    \n\t"
+        :"+m"(*(uint32_t*)(dst+0*stride)),
+         "+m"(*(uint32_t*)(dst+1*stride)),
+         "+m"(*(uint32_t*)(dst+2*stride)),
+         "+m"(*(uint32_t*)(dst+3*stride))
+    );
+}
+
+void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+    int dc = (block[0] + 32) >> 6;
+    int y;
+    asm volatile(
+        "movd          %0, %%mm0 \n\t"
+        "pxor       %%mm7, %%mm7 \n\t"
+        "pshufw $0, %%mm0, %%mm0 \n\t"
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t"
+        "pmaxsw     %%mm7, %%mm0 \n\t"
+        "pmaxsw     %%mm7, %%mm1 \n\t"
+        "packuswb   %%mm0, %%mm0 \n\t"
+        "packuswb   %%mm1, %%mm1 \n\t"
+        ::"r"(dc)
+    );
+    for(y=2; y--; dst += 4*stride){
+    asm volatile(
+        "movq          %0, %%mm2 \n\t"
+        "movq          %1, %%mm3 \n\t"
+        "movq          %2, %%mm4 \n\t"
+        "movq          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t"
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t"
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movq       %%mm2, %0    \n\t"
+        "movq       %%mm3, %1    \n\t"
+        "movq       %%mm4, %2    \n\t"
+        "movq       %%mm5, %3    \n\t"
+        :"+m"(*(uint64_t*)(dst+0*stride)),
+         "+m"(*(uint64_t*)(dst+1*stride)),
+         "+m"(*(uint64_t*)(dst+2*stride)),
+         "+m"(*(uint64_t*)(dst+3*stride))
+    );
+    }
+}
+
 
 /***********************************/
 /* deblocking */