comparison i386/h264dsp_mmx.c @ 4135:bbf0caa655f0 libavcodec

2 instructions less (same speed)
author michael
date Fri, 03 Nov 2006 15:40:57 +0000
parents 3d2887b288f4
children 05ffda007f80
comparison
equal deleted inserted replaced
4134:3d2887b288f4 4135:bbf0caa655f0
315 "psubusb "#x", "#t" \n\t"\ 315 "psubusb "#x", "#t" \n\t"\
316 "psubusb "#y", "#o" \n\t"\ 316 "psubusb "#y", "#o" \n\t"\
317 "por "#t", "#o" \n\t"\ 317 "por "#t", "#o" \n\t"\
318 "psubusb "#a", "#o" \n\t" 318 "psubusb "#a", "#o" \n\t"
319 319
320 // out: o = |x-y|<=a (each byte is 0xFF where the difference does not exceed a, 0x00 otherwise)
321 // clobbers: t
322 #define DIFF_GT2_MMX(x,y,a,o,t)\
323 "movq "#y", "#t" \n\t"\
324 "movq "#x", "#o" \n\t"\
325 "psubusb "#x", "#t" \n\t"\
326 "psubusb "#y", "#o" \n\t"\
327 "psubusb "#a", "#t" \n\t"\
328 "psubusb "#a", "#o" \n\t"\
329 "pcmpeqb "#t", "#o" \n\t"\
330
320 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 331 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
321 // out: mm5=beta-1, mm7=mask 332 // out: mm5=beta-1, mm7=mask
322 // clobbers: mm4,mm6 333 // clobbers: mm4,mm6
323 #define H264_DEBLOCK_MASK(alpha1, beta1) \ 334 #define H264_DEBLOCK_MASK(alpha1, beta1) \
324 "pshufw $0, "#alpha1", %%mm4 \n\t"\ 335 "pshufw $0, "#alpha1", %%mm4 \n\t"\
396 "pand %%mm4, %%mm7 \n\t" 407 "pand %%mm4, %%mm7 \n\t"
397 "movq %%mm7, %0 \n\t" 408 "movq %%mm7, %0 \n\t"
398 409
399 /* filter p1 */ 410 /* filter p1 */
400 "movq (%1), %%mm3 \n\t" //p2 411 "movq (%1), %%mm3 \n\t" //p2
401 DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 412 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|<=beta-1
402 "pandn %%mm7, %%mm6 \n\t"
403 "pcmpeqb %%mm7, %%mm6 \n\t"
404 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta 413 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
405 "movq 8+%0, %%mm4 \n\t" // can be merged with the and below but is slower then 414 "movq 8+%0, %%mm4 \n\t" // can be merged with the and below but is slower then
406 "pand %%mm7, %%mm4 \n\t" // mask & tc0 415 "pand %%mm7, %%mm4 \n\t" // mask & tc0
407 "movq %%mm4, %%mm7 \n\t" 416 "movq %%mm4, %%mm7 \n\t"
408 "psubb %%mm6, %%mm7 \n\t" 417 "psubb %%mm6, %%mm7 \n\t"
409 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 418 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
410 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) 419 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
411 420
412 /* filter q1 */ 421 /* filter q1 */
413 "movq (%2,%3,2), %%mm4 \n\t" //q2 422 "movq (%2,%3,2), %%mm4 \n\t" //q2
414 DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 423 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|<=beta-1
415 "pandn %0, %%mm6 \n\t"
416 "pcmpeqb %0, %%mm6 \n\t"
417 "pand %0, %%mm6 \n\t" 424 "pand %0, %%mm6 \n\t"
418 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then 425 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then
419 "pand %%mm6, %%mm5 \n\t" 426 "pand %%mm6, %%mm5 \n\t"
420 "psubb %%mm6, %%mm7 \n\t" 427 "psubb %%mm6, %%mm7 \n\t"
421 "movq (%2,%3), %%mm3 \n\t" 428 "movq (%2,%3), %%mm3 \n\t"