# HG changeset patch # User michael # Date 1094521725 0 # Node ID c4a476971abc01e4a4460906cb073929cdc9fe05 # Parent 9ca8a88a8a7091e9d9ec53c840bff0a175eed6da h264 luma motion compensation in mmx2/3dnow diff -r 9ca8a88a8a70 -r c4a476971abc i386/dsputil_mmx.c --- a/i386/dsputil_mmx.c Mon Sep 06 10:32:47 2004 +0000 +++ b/i386/dsputil_mmx.c Tue Sep 07 01:48:45 2004 +0000 @@ -39,7 +39,9 @@ static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; +static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; +static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; @@ -354,6 +356,32 @@ } while (--i); } +static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"(line_size) + : "%eax", "memory" + ); +} + static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( @@ -1979,7 +2007,7 @@ }\ \ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - uint64_t temp[9*4];\ + uint64_t temp[9*2];\ uint64_t *temp_ptr= temp;\ int count= 9;\ \ @@ -2261,6 +2289,430 @@ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ } +#define QPEL_H264(OPNAME, OP, MMX)\ +static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=4;\ +\ + asm volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm4 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "1: \n\t"\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "addl %3, %0 \n\t"\ + "addl %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+m"(h)\ + : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + uint64_t temp[4+5];\ + uint64_t *temp_ptr= temp;\ + int h= 3;\ + src -= 2*srcStride;\ + /*FIXME unroll */\ + asm volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "movq %%mm0, (%1) \n\t"\ + "addl %3, %0 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "movq %%mm0, 8(%1) \n\t"\ + "addl %3, %0 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "movq %%mm0, 16(%1) \n\t"\ + "addl %3, %0 \n\t"\ + "addl $24, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a" (src), "+c" (temp_ptr), "+d"(h)\ + : "S" (srcStride)\ + : "memory"\ + );\ + \ + temp_ptr= temp;\ + h= 4;\ + \ + asm volatile(\ + "movq %4, %%mm6 \n\t"\ + "movq %5, %%mm7 \n\t"\ + "1: \n\t"\ + "movq 2*8(%0), %%mm0 \n\t"\ + "movq 3*8(%0), %%mm1 \n\t"\ + "paddw %%mm1, %%mm0 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "movq 1*8(%0), %%mm2 \n\t"\ + "movq 4*8(%0), %%mm3 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "movq 0*8(%0), %%mm4 \n\t"\ + "movq 5*8(%0), %%mm5 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm7, %%mm4 \n\t"\ + "paddw %%mm4, %%mm0 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, d)\ + "addl %3, %1 \n\t"\ + "addl $8, %0 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + \ + : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ + : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + const int h=4;\ + const int w=4;\ + uint8_t *cm = cropTbl + MAX_NEG_CROP;\ + int i;\ + src -= 2*srcStride;\ + for(i=0; i>10];\ + dst[1*dstStride]= cm[( (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4) + 512)>>10];\ + dst[2*dstStride]= cm[( (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5) + 512)>>10];\ + dst[3*dstStride]= cm[( (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6) + 512)>>10];\ + dst++;\ + tmp++;\ + }\ +}\ +\ +static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + asm volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm6 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "addl %3, %0 \n\t"\ + "addl %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+m"(h)\ + : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + uint64_t temp[(8+5)*2];\ + uint64_t *temp_ptr= temp;\ + int h= 8+5;\ +\ + src -= 2*srcStride;\ + /*FIXME unroll */\ + asm volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq (%0), %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "movq %%mm0, (%1) \n\t"\ + "movq %%mm1, 8(%1) \n\t"\ + "addl $16, %1 \n\t"\ + "addl %3, %0 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a" (src), "+c" (temp_ptr), "+d"(h)\ + : "S" (srcStride)\ + : "memory"\ + );\ + \ + temp_ptr= temp;\ + h= 8;\ + \ + asm volatile(\ + "movq %4, %%mm6 \n\t"\ + "movq %5, %%mm7 \n\t"\ + "1: \n\t"\ + "movq 2*16+0(%0), %%mm0 \n\t"\ + "movq 2*16+8(%0), %%mm1 \n\t"\ + "movq 3*16+0(%0), %%mm2 \n\t"\ + "movq 3*16+8(%0), %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq 1*16+0(%0), %%mm2 \n\t"\ + "movq 1*16+8(%0), %%mm3 \n\t"\ + "movq 4*16+0(%0), %%mm4 \n\t"\ + "movq 4*16+8(%0), %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm3, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movq 0*16+0(%0), %%mm2 \n\t"\ + "movq 0*16+8(%0), %%mm3 \n\t"\ + "movq 5*16+0(%0), %%mm4 \n\t"\ + "movq 5*16+8(%0), %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw %%mm7, %%mm0 \n\t"\ + "paddw %%mm7, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "addl %3, %1 \n\t"\ + "addl $16, %0 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + \ + : "+a"(temp_ptr), "+c"(dst), "+d"(h)\ + : "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ + OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ + src += 4*srcStride;\ + dst += 4*dstStride;\ + OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ + OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(dst+4, tmp+4, src+4, dstStride, tmpStride, srcStride);\ +}\ +static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ +\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ +\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ + OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ + OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ +}\ + +#define H264_MC(OPNAME, SIZE, MMX) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/8];\ + uint8_t * const half= (uint8_t*)temp;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/8];\ + uint8_t * const half= (uint8_t*)temp;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/8];\ + uint8_t * const half= (uint8_t*)temp;\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/8];\ + uint8_t * const half= (uint8_t*)temp;\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/4];\ + uint8_t * const halfH= (uint8_t*)temp;\ + uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/4];\ + uint8_t * const halfH= (uint8_t*)temp;\ + uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/4];\ + uint8_t * const halfH= (uint8_t*)temp;\ + uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[SIZE*SIZE/4];\ + uint8_t * const halfH= (uint8_t*)temp;\ + uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + int16_t tmp[SIZE*(SIZE+5)];\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + int16_t tmp[SIZE*(SIZE+5)];\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + int16_t tmp[SIZE*(SIZE+5)];\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + int16_t tmp[SIZE*(SIZE+5)];\ + uint8_t halfV[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + int16_t tmp[SIZE*(SIZE+5)];\ + uint8_t halfV[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\ +}\ + #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" #define AVG_3DNOW_OP(a,b,temp, size) \ @@ -2282,6 +2734,24 @@ QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) +QPEL_H264(put_ , PUT_OP, 3dnow) +QPEL_H264(avg_ , AVG_3DNOW_OP, 3dnow) +QPEL_H264(put_ , PUT_OP, mmx2) +QPEL_H264(avg_ , AVG_MMX2_OP, mmx2) + +H264_MC(put_, 4, 3dnow) +H264_MC(put_, 8, 3dnow) +H264_MC(put_, 16,3dnow) +H264_MC(avg_, 4, 3dnow) +H264_MC(avg_, 8, 3dnow) +H264_MC(avg_, 16,3dnow) +H264_MC(put_, 4, mmx2) +H264_MC(put_, 8, mmx2) +H264_MC(put_, 16,mmx2) +H264_MC(avg_, 4, mmx2) +H264_MC(avg_, 8, mmx2) +H264_MC(avg_, 16,mmx2) + #if 0 static void just_return() { return; } #endif @@ -2621,6 +3091,33 @@ SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) #endif +//FIXME 3dnow too +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2 + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(put_h264_qpel, 1, 8); + dspfunc(put_h264_qpel, 2, 4); + dspfunc(avg_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 1, 8); + dspfunc(avg_h264_qpel, 2, 4); +#undef dspfunc + #ifdef CONFIG_ENCODERS c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; #endif //CONFIG_ENCODERS @@ -2680,6 +3177,31 @@ SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(put_h264_qpel, 1, 8); + dspfunc(put_h264_qpel, 2, 4); + dspfunc(avg_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 1, 8); + dspfunc(avg_h264_qpel, 2, 4); } } diff -r 9ca8a88a8a70 -r c4a476971abc i386/dsputil_mmx_avg.h --- a/i386/dsputil_mmx_avg.h Mon Sep 06 10:32:47 2004 +0000 +++ b/i386/dsputil_mmx_avg.h Tue Sep 07 01:48:45 2004 +0000 @@ -53,6 +53,53 @@ :"%eax", "memory"); } +static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm __volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "addl %4, %1 \n\t" + "addl $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 4(%2), %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" 8(%2), %%mm0 \n\t" + PAVGB" 12(%2), %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "addl $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"(src1Stride), "D"(dstStride) + :"memory"); +} + + static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm __volatile( @@ -173,6 +220,58 @@ :"memory");*/ } +static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm __volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "addl %4, %1 \n\t" + "addl $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 4(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" 8(%2), %%mm0 \n\t" + PAVGB" 12(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "addl $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"(src1Stride), "D"(dstStride) + :"memory"); +} + + static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) { __asm __volatile( diff -r 9ca8a88a8a70 -r c4a476971abc i386/dsputil_mmx_rnd.h --- a/i386/dsputil_mmx_rnd.h Mon Sep 06 10:32:47 2004 +0000 +++ b/i386/dsputil_mmx_rnd.h Tue Sep 07 01:48:45 2004 +0000 @@ -296,6 +296,25 @@ } // avg_pixels +static void DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movd %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + // in case more speed is needed - unroling would certainly help static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) {