# HG changeset patch
# User rbultje
# Date 1284688566 0
# Node ID c997f09d1e10bd0c910d2fb6ccb8fe0eac5f8d8f
# Parent  c5ffa8b81f9cdcd80569bcbf5fe63a0f673325e4
Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm, which
will hopefully solve the Win64/FATE failures caused by these functions.

diff -r c5ffa8b81f9c -r c997f09d1e10 x86/dsputilenc_mmx.c
--- a/x86/dsputilenc_mmx.c	Fri Sep 17 01:44:17 2010 +0000
+++ b/x86/dsputilenc_mmx.c	Fri Sep 17 01:56:06 2010 +0000
@@ -879,55 +879,6 @@
     *left = src2[w-1];
 }
 
-#define DIFF_PIXELS_1(m,a,t,p1,p2)\
-    "mov"#m" "#p1", "#a"            \n\t"\
-    "mov"#m" "#p2", "#t"            \n\t"\
-    "punpcklbw "#a", "#t"           \n\t"\
-    "punpcklbw "#a", "#a"           \n\t"\
-    "psubw     "#t", "#a"           \n\t"\
-
-#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
-    uint8_t *p1b=p1, *p2b=p2;\
-    __asm__ volatile(\
-        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
-        "add %4, %1                     \n\t"\
-        "add %4, %2                     \n\t"\
-        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
-        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
-        "mov"#m1" "#mm"0, %0        \n\t"\
-        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
-        "mov"#m1" %0, "#mm"0        \n\t"\
-        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
-        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
-    );\
-}
-    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
-
-#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
-#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
-
-#define LBUTTERFLY2(a1,b1,a2,b2)\
-    "paddw " #b1 ", " #a1 "           \n\t"\
-    "paddw " #b2 ", " #a2 "           \n\t"\
-    "paddw " #b1 ", " #b1 "           \n\t"\
-    "paddw " #b2 ", " #b2 "           \n\t"\
-    "psubw " #a1 ", " #b1 "           \n\t"\
-    "psubw " #a2 ", " #b2 "           \n\t"
-
-#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m1, m2, m3)\
-        LBUTTERFLY2(m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m2, m1, m3)\
-        LBUTTERFLY2(m4, m6, m5, m7)\
-        LBUTTERFLY2(m0, m4, m1, m5)\
-        LBUTTERFLY2(m2, m6, m3, m7)\
-
-#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
-
 #define MMABS_MMX(a,z)\
     "pxor " #z ", " #z "              \n\t"\
     "pcmpgtw " #a ", " #z "           \n\t"\
@@ -946,34 +897,6 @@
     MMABS(a,z)\
     "paddusw " #a ", " #sum "         \n\t"
 
-#define MMABS_SUM_8x8_NOSPILL\
-    MMABS(%%xmm0, %%xmm8)\
-    MMABS(%%xmm1, %%xmm9)\
-    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
-    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
-    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
-    "paddusw %%xmm1, %%xmm0           \n\t"
-
-#if ARCH_X86_64
-#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
-#else
-#define MMABS_SUM_8x8_SSE2\
-    "movdqa %%xmm7, (%1)              \n\t"\
-    MMABS(%%xmm0, %%xmm7)\
-    MMABS(%%xmm1, %%xmm7)\
-    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
-    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
-    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
-    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
-    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
-    "movdqa (%1), %%xmm2              \n\t"\
-    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
-    "paddusw %%xmm1, %%xmm0           \n\t"
-#endif
-
 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
  * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
  * and it's even more unlikely to not have any alternative mvs/modes with lower cost.
  */
@@ -1002,133 +925,16 @@
     "paddusw "#t", "#a"               \n\t"\
     "movd "#a", "#dst"                \n\t"\
 
-#define HADAMARD8_DIFF_MMX(cpu) \
-static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
-    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
-    int sum;\
-\
-    assert(h==8);\
-\
-    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
-\
-    __asm__ volatile(\
-        HADAMARD48\
-\
-        "movq %%mm7, 96(%1)             \n\t"\
-\
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
-\
-        "movq 96(%1), %%mm7             \n\t"\
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
-        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
-\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-\
-    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
-\
-    __asm__ volatile(\
-        HADAMARD48\
-\
-        "movq %%mm7, 96(%1)             \n\t"\
-\
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
-\
-        "movq 96(%1), %%mm7             \n\t"\
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
-        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
-        "movq %%mm6, %%mm7              \n\t"\
-        "movq %%mm0, %%mm6              \n\t"\
-\
-        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
-\
-        HADAMARD48\
-        "movq %%mm7, 64(%1)             \n\t"\
-        MMABS(%%mm0, %%mm7)\
-        MMABS(%%mm1, %%mm7)\
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
-        "movq 64(%1), %%mm2             \n\t"\
-        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
-        "paddusw %%mm1, %%mm0           \n\t"\
-        "movq %%mm0, 64(%1)             \n\t"\
-\
-        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
-        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
-\
-        HADAMARD48\
-        "movq %%mm7, (%1)               \n\t"\
-        MMABS(%%mm0, %%mm7)\
-        MMABS(%%mm1, %%mm7)\
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
-        "movq (%1), %%mm2               \n\t"\
-        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
-        "paddusw 64(%1), %%mm0          \n\t"\
-        "paddusw %%mm1, %%mm0           \n\t"\
-\
-        HSUM(%%mm0, %%mm1, %0)\
-\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-    return sum&0xFFFF;\
-}\
-WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
+#define hadamard_func(cpu) \
+int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
+                              int stride, int h); \
+int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
+                              int stride, int h);
 
-#define HADAMARD8_DIFF_SSE2(cpu) \
-static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
-    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
-    int sum;\
-\
-    assert(h==8);\
-\
-    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
-\
-    __asm__ volatile(\
-        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
-        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
-        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
-        MMABS_SUM_8x8\
-        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-    return sum&0xFFFF;\
-}\
-WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
-
-#define MMABS(a,z)         MMABS_MMX(a,z)
-#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
-HADAMARD8_DIFF_MMX(mmx)
-#undef MMABS
-#undef HSUM
-
-#define MMABS(a,z)         MMABS_MMX2(a,z)
-#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
-#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
-HADAMARD8_DIFF_MMX(mmx2)
-HADAMARD8_DIFF_SSE2(sse2)
-#undef MMABS
-#undef MMABS_SUM_8x8
-#undef HSUM
-
-#if HAVE_SSSE3
-#define MMABS(a,z)         MMABS_SSSE3(a,z)
-#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
-HADAMARD8_DIFF_SSE2(ssse3)
-#undef MMABS
-#undef MMABS_SUM_8x8
-#endif
+hadamard_func(mmx)
+hadamard_func(mmx2)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
 #define DCT_SAD4(m,mm,o)\
     "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
@@ -1312,8 +1118,8 @@
         c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
-        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
-        c->hadamard8_diff[1]= hadamard8_diff_mmx;
+        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
+        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
 
         c->pix_norm1 = pix_norm1_mmx;
         c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
@@ -1336,8 +1142,8 @@
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
-            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
-            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
             c->vsad[4]= vsad_intra16_mmx2;
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1350,8 +1156,8 @@
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->get_pixels = get_pixels_sse2;
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
-            c->hadamard8_diff[1]= hadamard8_diff_sse2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
         }
 
         if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
@@ -1365,8 +1171,8 @@
             }
             c->add_8x8basis= add_8x8basis_ssse3;
             c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
-            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
-            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
         }
 #endif
 
diff -r c5ffa8b81f9c -r c997f09d1e10 x86/dsputilenc_yasm.asm
--- a/x86/dsputilenc_yasm.asm	Fri Sep 17 01:44:17 2010 +0000
+++ b/x86/dsputilenc_yasm.asm	Fri Sep 17 01:56:06 2010 +0000
@@ -26,6 +26,261 @@
 
 SECTION .text
 
+%macro DIFF_PIXELS_1 4
+    movh          %1, %3
+    movh          %2, %4
+    punpcklbw     %2, %1
+    punpcklbw     %1, %1
+    psubw         %1, %2
+%endmacro
+
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
+; %6=temporary storage location
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
+%macro DIFF_PIXELS_8 6
+    DIFF_PIXELS_1 m0, m7, [%1     +%3], [%2     +%3]
+    DIFF_PIXELS_1 m1, m7, [%1+%4  +%3], [%2+%4  +%3]
+    DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+    add           %1, %5
+    add           %2, %5
+    DIFF_PIXELS_1 m3, m7, [%1     +%3], [%2     +%3]
+    DIFF_PIXELS_1 m4, m7, [%1+%4  +%3], [%2+%4  +%3]
+    DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+    DIFF_PIXELS_1 m6, m7, [%1+%5  +%3], [%2+%5  +%3]
+%ifdef m8
+    DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
+%else
+    mova        [%6], m0
+    DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
+    mova          m0, [%6]
+%endif
+    sub           %1, %5
+    sub           %2, %5
+%endmacro
+
+%macro HADAMARD8 0
+    SUMSUB_BADC m0, m1, m2, m3
+    SUMSUB_BADC m4, m5, m6, m7
+    SUMSUB_BADC m0, m2, m1, m3
+    SUMSUB_BADC m4, m6, m5, m7
+    SUMSUB_BADC m0, m4, m1, m5
+    SUMSUB_BADC m2, m6, m3, m7
+%endmacro
+
+%macro ABS1_SUM 3
+    ABS1            %1, %2
+    paddusw         %3, %1
+%endmacro
+
+%macro ABS2_SUM 6
+    ABS2            %1, %2, %3, %4
+    paddusw         %5, %1
+    paddusw         %6, %2
+%endmacro
+
+%macro ABS_SUM_8x8_64 1
+    ABS2            m0, m1, m8, m9
+    ABS2_SUM        m2, m3, m8, m9, m0, m1
+    ABS2_SUM        m4, m5, m8, m9, m0, m1
+    ABS2_SUM        m6, m7, m8, m9, m0, m1
+    paddusw         m0, m1
+%endmacro
+
+%macro ABS_SUM_8x8_32 1
+    mova          [%1], m7
+    ABS1            m0, m7
+    ABS1            m1, m7
+    ABS1_SUM        m2, m7, m0
+    ABS1_SUM        m3, m7, m1
+    ABS1_SUM        m4, m7, m0
+    ABS1_SUM        m5, m7, m1
+    ABS1_SUM        m6, m7, m0
+    mova            m2, [%1]
+    ABS1_SUM        m2, m7, m1
+    paddusw         m0, m1
+%endmacro
+
+; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
+%macro HSUM_MMX 3
+    mova            %2, %1
+    psrlq           %1, 32
+    paddusw         %1, %2
+    mova            %2, %1
+    psrlq           %1, 16
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro HSUM_MMX2 3
+    pshufw          %2, %1, 0xE
+    paddusw         %1, %2
+    pshufw          %2, %1, 0x1
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro HSUM_SSE2 3
+    movhlps         %2, %1
+    paddusw         %1, %2
+    pshuflw         %2, %1, 0xE
+    paddusw         %1, %2
+    pshuflw         %2, %1, 0x1
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro STORE4 5
+    mova [%1+mmsize*0], %2
+    mova [%1+mmsize*1], %3
+    mova [%1+mmsize*2], %4
+    mova [%1+mmsize*3], %5
+%endmacro
+
+%macro LOAD4 5
+    mova %2, [%1+mmsize*0]
+    mova %3, [%1+mmsize*1]
+    mova %4, [%1+mmsize*2]
+    mova %5, [%1+mmsize*3]
+%endmacro
+
+%macro hadamard8_16_wrapper 3
+cglobal hadamard8_diff_%1, 4, 4, %2
+%ifndef m8
+    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
+    SUB         rsp, pad
+%endif
+    call hadamard8x8_diff_%1
+%ifndef m8
+    ADD         rsp, pad
+%endif
+    RET
+
+cglobal hadamard8_diff16_%1, 5, 6, %2
+%ifndef m8
+    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
+    SUB         rsp, pad
+%endif
+
+    call hadamard8x8_diff_%1
+    mov         r5d, eax
+
+    add         r1, 8
+    add         r2, 8
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+    cmp         r4d, 16
+    jne .done
+
+    lea         r1, [r1+r3*8-8]
+    lea         r2, [r2+r3*8-8]
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+    add         r1, 8
+    add         r2, 8
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+.done
+    mov         eax, r5d
+%ifndef m8
+    ADD         rsp, pad
+%endif
+    RET
+%endmacro
+
+%macro HADAMARD8_DIFF_MMX 1
+ALIGN 16
+; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
+;                          int stride, int h)
+; r0 = void *s = unused, int h = unused (always 8)
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16
+; can simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling func
+hadamard8x8_diff_%1:
+    lea                          r0, [r3*3]
+
+    ; first 4x8 pixels
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova         [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
+    STORE4              rsp+gprsize, m0, m1, m2, m3
+    mova                         m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
+    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
+
+    ; second 4x8 pixels
+    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova         [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
+    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
+    mova                         m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
+
+    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize+0x60
+    mova         [rsp+gprsize+0x60], m0
+
+    LOAD4          rsp+gprsize     , m0, m1, m2, m3
+    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize
+    paddusw                      m0, [rsp+gprsize+0x60]
+
+    HSUM                         m0, m1, eax
+    and                         rax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper %1, 0, 14
+%endmacro
+
+%macro HADAMARD8_DIFF_SSE2 2
+hadamard8x8_diff_%1:
+    lea                          r0, [r3*3]
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
+    HADAMARD8
+%ifdef ARCH_X86_64
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
+%else
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
+%endif
+    HADAMARD8
+    ABS_SUM_8x8         rsp+gprsize
+    HSUM_SSE2                    m0, m1, eax
+    and                         eax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper %1, %2, 3
+%endmacro
+
+INIT_MMX
+%define ABS1 ABS1_MMX
+%define HSUM HSUM_MMX
+HADAMARD8_DIFF_MMX mmx
+
+%define ABS1 ABS1_MMX2
+%define HSUM HSUM_MMX2
+HADAMARD8_DIFF_MMX mmx2
+
+INIT_XMM
+%define ABS2 ABS2_MMX2
+%ifdef ARCH_X86_64
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+%else
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
+%endif
+HADAMARD8_DIFF_SSE2 sse2, 10
+
+%define ABS2        ABS2_SSSE3
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+HADAMARD8_DIFF_SSE2 ssse3, 9
+
 INIT_XMM
 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 cglobal sse16_sse2, 5, 5, 8
diff -r c5ffa8b81f9c -r c997f09d1e10 x86/x86util.asm
--- a/x86/x86util.asm	Fri Sep 17 01:44:17 2010 +0000
+++ b/x86/x86util.asm	Fri Sep 17 01:56:06 2010 +0000
@@ -148,12 +148,30 @@
 %endmacro
 
 %macro ABS1_MMX 2    ; a, tmp
+    pxor       %2, %2
+    pcmpgtw    %2, %1
+    pxor       %1, %2
+    psubw      %1, %2
+%endmacro
+
+%macro ABS2_MMX 4    ; a, b, tmp0, tmp1
+    pxor       %3, %3
+    pxor       %4, %4
+    pcmpgtw    %3, %1
+    pcmpgtw    %4, %2
+    pxor       %1, %3
+    pxor       %2, %4
+    psubw      %1, %3
+    psubw      %2, %4
+%endmacro
+
+%macro ABS1_MMX2 2   ; a, tmp
     pxor    %2, %2
     psubw   %2, %1
     pmaxsw  %1, %2
 %endmacro
 
-%macro ABS2_MMX 4    ; a, b, tmp0, tmp1
+%macro ABS2_MMX2 4   ; a, b, tmp0, tmp1
     pxor    %3, %3
    pxor    %4, %4
     psubw   %3, %1
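
Editor's note, not part of the patch above: as a reading aid, here is a minimal plain-C sketch of what the ff_hadamard8_diff_* functions compute, namely the sum of absolute values of the 2-D Hadamard transform of the 8x8 difference block (SATD). The names hadamard8_diff_ref and hadamard8_1d are hypothetical and exist only for this illustration; unlike the SIMD code, which accumulates with paddusw, this sketch uses plain int arithmetic and therefore does not reproduce the 16-bit saturation mentioned in the FIXME comment.

#include <stdint.h>
#include <stdlib.h>

/* Unnormalized 8-point Hadamard butterfly over one row or column. */
static void hadamard8_1d(int v[8])
{
    for (int step = 1; step < 8; step <<= 1) {
        for (int i = 0; i < 8; i += step << 1) {
            for (int j = i; j < i + step; j++) {
                int a = v[j], b = v[j + step];
                v[j]        = a + b;
                v[j + step] = a - b;
            }
        }
    }
}

/* Hypothetical reference: SATD of the 8x8 difference between src1 and src2. */
static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
                              int stride)
{
    int d[8][8], col[8], sum = 0;

    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            d[y][x] = src1[y * stride + x] - src2[y * stride + x];

    for (int y = 0; y < 8; y++)      /* horizontal transform */
        hadamard8_1d(d[y]);

    for (int x = 0; x < 8; x++) {    /* vertical transform, then |.| sum */
        for (int y = 0; y < 8; y++)
            col[y] = d[y][x];
        hadamard8_1d(col);
        for (int y = 0; y < 8; y++)
            sum += abs(col[y]);
    }
    return sum;
}

The 16x16 variants simply accumulate this cost over the four 8x8 quadrants of the block, which is what the hadamard8_16_wrapper macro in the patch does by calling hadamard8x8_diff_* up to four times at the appropriate offsets.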