# HG changeset patch # User lorenm # Date 1264526268 0 # Node ID aa10bb3c244c24880fd2df1066b6ef02b1e0da42 # Parent 4aee091df9347d697d62a7eab15a1c1c2ef9be4b optimize h264_loop_filter_strength_mmx2 244->160 cycles on core2 diff -r 4aee091df934 -r aa10bb3c244c x86/h264dsp_mmx.c --- a/x86/h264dsp_mmx.c Tue Jan 26 15:34:21 2010 +0000 +++ b/x86/h264dsp_mmx.c Tue Jan 26 17:17:48 2010 +0000 @@ -796,18 +796,19 @@ int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { int dir; __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movq %0, %%mm6 \n\t" - "movq %1, %%mm5 \n\t" - "movq %2, %%mm4 \n\t" - ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) + "movq %0, %%mm7 \n" + "movq %1, %%mm6 \n" + ::"m"(ff_pb_1), "m"(ff_pb_3) ); if(field) __asm__ volatile( - "movq %0, %%mm5 \n\t" - "movq %1, %%mm4 \n\t" - ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) + "movq %0, %%mm6 \n" + ::"m"(ff_pb_3_1) ); + __asm__ volatile( + "movq %%mm6, %%mm5 \n" + "paddb %%mm5, %%mm5 \n" + :); // could do a special case for dir==0 && edges==1, but it only reduces the // average filter time by 1.2% @@ -815,89 +816,84 @@ const x86_reg d_idx = dir ? -8 : -1; const int mask_mv = dir ? mask_mv1 : mask_mv0; DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; - int b_idx, edge, l; + int b_idx, edge; for( b_idx=12, edge=0; edge= 0; l-- ) { + if(bidir) { __asm__ volatile( - "movd (%0), %%mm1 \n\t" - "punpckldq (%0,%1), %%mm1 \n\t" - "punpckldq %%mm1, %%mm2 \n\t" - "pcmpeqb %%mm2, %%mm1 \n\t" - "paddb %%mm6, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] - "por %%mm1, %%mm0 \n\t" - - "movq (%2), %%mm1 \n\t" - "movq 8(%2), %%mm2 \n\t" - "psubw (%2,%1,4), %%mm1 \n\t" - "psubw 8(%2,%1,4), %%mm2 \n\t" - "packsswb %%mm2, %%mm1 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "pminub %%mm4, %%mm1 \n\t" - "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit - "por %%mm1, %%mm0 \n\t" - ::"r"(ref[l]+b_idx), - "r"(d_idx), - "r"(mv[l]+b_idx) + "movd (%1,%0), %%mm2 \n" + "punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } + "pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } + "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } + "pshufw $0x4E, %%mm2, %%mm3 \n" + "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } + "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } + "1: \n" + "por %%mm1, %%mm0 \n" + "movq (%2,%0,4), %%mm1 \n" + "movq 8(%2,%0,4), %%mm2 \n" + "movq %%mm1, %%mm3 \n" + "movq %%mm2, %%mm4 \n" + "psubw (%2), %%mm1 \n" + "psubw 8(%2), %%mm2 \n" + "psubw 160(%2), %%mm3 \n" + "psubw 168(%2), %%mm4 \n" + "packsswb %%mm2, %%mm1 \n" + "packsswb %%mm4, %%mm3 \n" + "paddb %%mm6, %%mm1 \n" + "paddb %%mm6, %%mm3 \n" + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit + "psubusb %%mm5, %%mm3 \n" + "packsswb %%mm3, %%mm1 \n" + "add $40, %0 \n" + "cmp $40, %0 \n" + "jl 1b \n" + "sub $80, %0 \n" + "pshufw $0x4E, %%mm1, %%mm1 \n" + "por %%mm1, %%mm0 \n" + "pshufw $0x4E, %%mm0, %%mm1 \n" + "pminub %%mm1, %%mm0 \n" + ::"r"(d_idx), + "r"(ref[0]+b_idx), + "r"(mv[0]+b_idx) ); - } - if(bidir==1){ - __asm__ volatile("pxor %%mm3, %%mm3 \n\t":); - for( l = bidir; l >= 0; l-- ) { + } else { __asm__ volatile( - "movd (%0), %%mm1 \n\t" - "punpckldq (%1), %%mm1 \n\t" - "punpckldq %%mm1, %%mm2 \n\t" - "pcmpeqb %%mm2, %%mm1 \n\t" - "paddb %%mm6, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] - "por %%mm1, %%mm3 \n\t" - - "movq (%2), %%mm1 \n\t" - "movq 8(%2), %%mm2 \n\t" - "psubw (%3), %%mm1 \n\t" - "psubw 8(%3), %%mm2 \n\t" - "packsswb %%mm2, %%mm1 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "pminub %%mm4, %%mm1 \n\t" - "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit - "por %%mm1, %%mm3 \n\t" - ::"r"(ref[l]+b_idx), - "r"(ref[1-l]+b_idx+d_idx), - "r"(mv[l][b_idx]), - "r"(mv[1-l][b_idx+d_idx]) + "movd (%1), %%mm0 \n" + "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] + "movq (%2), %%mm1 \n" + "movq 8(%2), %%mm2 \n" + "psubw (%2,%0,4), %%mm1 \n" + "psubw 8(%2,%0,4), %%mm2 \n" + "packsswb %%mm2, %%mm1 \n" + "paddb %%mm6, %%mm1 \n" + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit + "packsswb %%mm1, %%mm1 \n" + "por %%mm1, %%mm0 \n" + ::"r"(d_idx), + "r"(ref[0]+b_idx), + "r"(mv[0]+b_idx) ); - } - __asm__ volatile( - "pcmpeqw %%mm7, %%mm3 \n\t" - "psubusw %%mm3, %%mm0 \n\t" - :); } } __asm__ volatile( - "movd %0, %%mm1 \n\t" - "por %1, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] + "movd %0, %%mm1 \n" + "por %1, %%mm1 \n" // nnz[b] || nnz[bn] ::"m"(nnz[b_idx]), "m"(nnz[b_idx+d_idx]) ); __asm__ volatile( - "pcmpeqw %%mm7, %%mm0 \n\t" - "pcmpeqw %%mm7, %%mm0 \n\t" - "psrlw $15, %%mm0 \n\t" // nonzero -> 1 - "psrlw $14, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "por %%mm1, %%mm2 \n\t" - "psrlw $1, %%mm1 \n\t" - "pandn %%mm2, %%mm1 \n\t" - "movq %%mm1, %0 \n\t" + "pminub %%mm7, %%mm1 \n" + "pminub %%mm7, %%mm0 \n" + "psllw $1, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pmaxub %%mm0, %%mm1 \n" + "punpcklbw %%mm2, %%mm1 \n" + "movq %%mm1, %0 \n" :"=m"(*bS[dir][edge]) ::"memory" );