Mercurial > libavcodec.hg
comparison i386/h264dsp_mmx.c @ 4135:bbf0caa655f0 libavcodec
2 instructions less (same speed)
author | michael |
---|---|
date | Fri, 03 Nov 2006 15:40:57 +0000 |
parents | 3d2887b288f4 |
children | 05ffda007f80 |
comparison
equal
deleted
inserted
replaced
4134:3d2887b288f4 | 4135:bbf0caa655f0 |
---|---|
315 "psubusb "#x", "#t" \n\t"\ | 315 "psubusb "#x", "#t" \n\t"\ |
316 "psubusb "#y", "#o" \n\t"\ | 316 "psubusb "#y", "#o" \n\t"\ |
317 "por "#t", "#o" \n\t"\ | 317 "por "#t", "#o" \n\t"\ |
318 "psubusb "#a", "#o" \n\t" | 318 "psubusb "#a", "#o" \n\t" |
319 | 319 |
320 // out: o = |x-y|<=a (each byte 0xFF where true, 0x00 otherwise) | |
321 // clobbers: t | |
322 #define DIFF_GT2_MMX(x,y,a,o,t)\ | |
323 "movq "#y", "#t" \n\t"\ | |
324 "movq "#x", "#o" \n\t"\ | |
325 "psubusb "#x", "#t" \n\t"\ | |
326 "psubusb "#y", "#o" \n\t"\ | |
327 "psubusb "#a", "#t" \n\t"\ | |
328 "psubusb "#a", "#o" \n\t"\ | |
329 "pcmpeqb "#t", "#o" \n\t"\ | |
330 | |
320 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 | 331 // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 |
321 // out: mm5=beta-1, mm7=mask | 332 // out: mm5=beta-1, mm7=mask |
322 // clobbers: mm4,mm6 | 333 // clobbers: mm4,mm6 |
323 #define H264_DEBLOCK_MASK(alpha1, beta1) \ | 334 #define H264_DEBLOCK_MASK(alpha1, beta1) \ |
324 "pshufw $0, "#alpha1", %%mm4 \n\t"\ | 335 "pshufw $0, "#alpha1", %%mm4 \n\t"\ |
396 "pand %%mm4, %%mm7 \n\t" | 407 "pand %%mm4, %%mm7 \n\t" |
397 "movq %%mm7, %0 \n\t" | 408 "movq %%mm7, %0 \n\t" |
398 | 409 |
399 /* filter p1 */ | 410 /* filter p1 */ |
400 "movq (%1), %%mm3 \n\t" //p2 | 411 "movq (%1), %%mm3 \n\t" //p2 |
401 DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 | 412 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 |
402 "pandn %%mm7, %%mm6 \n\t" | |
403 "pcmpeqb %%mm7, %%mm6 \n\t" | |
404 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta | 413 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta |
405 "movq 8+%0, %%mm4 \n\t" // can be merged with the and below but is slower then | 414 "movq 8+%0, %%mm4 \n\t" // can be merged with the and below but is slower then |
406 "pand %%mm7, %%mm4 \n\t" // mask & tc0 | 415 "pand %%mm7, %%mm4 \n\t" // mask & tc0 |
407 "movq %%mm4, %%mm7 \n\t" | 416 "movq %%mm4, %%mm7 \n\t" |
408 "psubb %%mm6, %%mm7 \n\t" | 417 "psubb %%mm6, %%mm7 \n\t" |
409 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 | 418 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 |
410 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) | 419 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) |
411 | 420 |
412 /* filter q1 */ | 421 /* filter q1 */ |
413 "movq (%2,%3,2), %%mm4 \n\t" //q2 | 422 "movq (%2,%3,2), %%mm4 \n\t" //q2 |
414 DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 | 423 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 |
415 "pandn %0, %%mm6 \n\t" | |
416 "pcmpeqb %0, %%mm6 \n\t" | |
417 "pand %0, %%mm6 \n\t" | 424 "pand %0, %%mm6 \n\t" |
418 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then | 425 "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then |
419 "pand %%mm6, %%mm5 \n\t" | 426 "pand %%mm6, %%mm5 \n\t" |
420 "psubb %%mm6, %%mm7 \n\t" | 427 "psubb %%mm6, %%mm7 \n\t" |
421 "movq (%2,%3), %%mm3 \n\t" | 428 "movq (%2,%3), %%mm3 \n\t" |