Mercurial > libavcodec.hg
comparison i386/h264dsp_mmx.c @ 4133:560ea2d5524e libavcodec
Move the luma tc0-related initialization into the inline asm.
This makes filter_mb_fast() 5% faster on a P3.
author | michael |
---|---|
date | Fri, 03 Nov 2006 14:28:30 +0000 |
parents | 1a8e384d0463 |
children | 3d2887b288f4 |
comparison
equal
deleted
inserted
replaced
4132:c9e0315f9954 | 4133:560ea2d5524e |
---|---|
375 "pminub "#tc0", "#q2" \n\t"\ | 375 "pminub "#tc0", "#q2" \n\t"\ |
376 "movq "#q2", "q1addr" \n\t" | 376 "movq "#q2", "q1addr" \n\t" |
377 | 377 |
378 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | 378 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) |
379 { | 379 { |
380 uint64_t tmp0; | 380 uint64_t tmp0[2]; |
381 uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101; | |
382 // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask | |
383 uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff }; | |
384 | 381 |
385 asm volatile( | 382 asm volatile( |
386 "movq (%1,%3), %%mm0 \n\t" //p1 | 383 "movq (%1,%3), %%mm0 \n\t" //p1 |
387 "movq (%1,%3,2), %%mm1 \n\t" //p0 | 384 "movq (%1,%3,2), %%mm1 \n\t" //p0 |
388 "movq (%2), %%mm2 \n\t" //q0 | 385 "movq (%2), %%mm2 \n\t" //q0 |
389 "movq (%2,%3), %%mm3 \n\t" //q1 | 386 "movq (%2,%3), %%mm3 \n\t" //q1 |
390 H264_DEBLOCK_MASK(%6, %7) | 387 H264_DEBLOCK_MASK(%6, %7) |
391 "pand %5, %%mm7 \n\t" | 388 |
392 "movq %%mm7, %0 \n\t" | 389 "movd %5, %%mm4 \n\t" |
| | 390 "punpcklbw %%mm4, %%mm4 \n\t" |
| | 391 "punpcklwd %%mm4, %%mm4 \n\t" |
| | 392 "pcmpeqb %%mm3, %%mm3 \n\t" |
| | 393 "movq %%mm4, %%mm6 \n\t" |
| | 394 "pcmpgtb %%mm3, %%mm4 \n\t" |
| | 395 "movq %%mm6, 8+%0 \n\t" |
| | 396 "pand %%mm4, %%mm7 \n\t" |
| | 397 "movq %%mm7, %0 \n\t" |
393 | 398 |
394 /* filter p1 */ | 399 /* filter p1 */ |
395 "movq (%1), %%mm3 \n\t" //p2 | 400 "movq (%1), %%mm3 \n\t" //p2 |
396 DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 | 401 DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 |
397 "pandn %%mm7, %%mm6 \n\t" | 402 "pandn %%mm7, %%mm6 \n\t" |
398 "pcmpeqb %%mm7, %%mm6 \n\t" | 403 "pcmpeqb %%mm7, %%mm6 \n\t" |
399 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta | 404 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta |
400 "pshufw $80, %4, %%mm4 \n\t" | 405 "movq 8+%0, %%mm4 \n\t" |
401 "pand %%mm7, %%mm4 \n\t" // mask & tc0 | 406 "pand %%mm7, %%mm4 \n\t" // mask & tc0 |
402 "movq %%mm4, %%mm7 \n\t" | 407 "movq %%mm4, %%mm7 \n\t" |
403 "psubb %%mm6, %%mm7 \n\t" | 408 "psubb %%mm6, %%mm7 \n\t" |
404 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 | 409 "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 |
405 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) | 410 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) |
408 "movq (%2,%3,2), %%mm4 \n\t" //q2 | 413 "movq (%2,%3,2), %%mm4 \n\t" //q2 |
409 DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 | 414 DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 |
410 "pandn %0, %%mm6 \n\t" | 415 "pandn %0, %%mm6 \n\t" |
411 "pcmpeqb %0, %%mm6 \n\t" | 416 "pcmpeqb %0, %%mm6 \n\t" |
412 "pand %0, %%mm6 \n\t" | 417 "pand %0, %%mm6 \n\t" |
413 "pshufw $80, %4, %%mm5 \n\t" | 418 "movq 8+%0, %%mm5 \n\t" |
414 "pand %%mm6, %%mm5 \n\t" | 419 "pand %%mm6, %%mm5 \n\t" |
415 "psubb %%mm6, %%mm7 \n\t" | 420 "psubb %%mm6, %%mm7 \n\t" |
416 "movq (%2,%3), %%mm3 \n\t" | 421 "movq (%2,%3), %%mm3 \n\t" |
417 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) | 422 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) |
418 | 423 |
419 /* filter p0, q0 */ | 424 /* filter p0, q0 */ |
420 H264_DEBLOCK_P0_Q0(%8, %9) | 425 H264_DEBLOCK_P0_Q0(%8, unused) |
421 "movq %%mm1, (%1,%3,2) \n\t" | 426 "movq %%mm1, (%1,%3,2) \n\t" |
422 "movq %%mm2, (%2) \n\t" | 427 "movq %%mm2, (%2) \n\t" |
423 | 428 |
424 : "=m"(tmp0) | 429 : "=m"(*tmp0) |
425 : "r"(pix-3*stride), "r"(pix), "r"((long)stride), | 430 : "r"(pix-3*stride), "r"(pix), "r"((long)stride), |
426 "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1), | 431 "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), |
427 "m"(mm_bone), "m"(ff_pb_3F) | 432 "m"(mm_bone) |
428 ); | 433 ); |
429 } | 434 } |
430 | 435 |
431 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 436 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
432 { | 437 { |