Mercurial > libavcodec.hg
comparison i386/idct_mmx.c @ 2754:a49f140179e9 libavcodec
sort H.264 mmx dsp functions into their own file
author | lorenm |
---|---|
date | Thu, 02 Jun 2005 20:45:35 +0000 |
parents | 23665209e823 |
children | b128802eb77b |
comparison
equal
deleted
inserted
replaced
2753:ba8ecddf5598 | 2754:a49f140179e9 |
---|---|
596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | 596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) |
597 | 597 |
598 declare_idct (ff_mmx_idct, mmx_table, | 598 declare_idct (ff_mmx_idct, mmx_table, |
599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | 599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) |
600 | 600 |
601 | |
602 /* SUMSUB_BA: in-place butterfly on two MMX registers of packed 16-bit words (a and b are register names). */ | |
603 /* in/out: a = a+b, b = b-a, both computed from the values a and b held on entry */ | |
604 #define SUMSUB_BA( a, b ) \ | |
605 "paddw "#b", "#a" \n\t"\ | |
606 "paddw "#b", "#b" \n\t"\ | |
607 "psubw "#a", "#b" \n\t" | |
608 /* SUMSUB_BADC: two butterflies with their instructions interleaved. in/out (entry values): a = a+b, b = b-a, c = c+d, d = d-c. */ | |
609 #define SUMSUB_BADC( a, b, c, d ) \ | |
610 "paddw "#b", "#a" \n\t"\ | |
611 "paddw "#d", "#c" \n\t"\ | |
612 "paddw "#b", "#b" \n\t"\ | |
613 "paddw "#d", "#d" \n\t"\ | |
614 "psubw "#a", "#b" \n\t"\ | |
615 "psubw "#c", "#d" \n\t" | |
616 /* SUMSUBD2_AB: in/out (entry values, arithmetic shifts): b = a + (b>>1), a = (a>>1) - b. t is overwritten with the entry value of b. */ | |
617 #define SUMSUBD2_AB( a, b, t ) \ | |
618 "movq "#b", "#t" \n\t"\ | |
619 "psraw $1 , "#b" \n\t"\ | |
620 "paddw "#a", "#b" \n\t"\ | |
621 "psraw $1 , "#a" \n\t"\ | |
622 "psubw "#t", "#a" \n\t" | |
623 /* IDCT4_1D: one 1-D pass over four registers. With entry values S02 = s02+d02, D02 = d02-s02, S13 = s13+(d13>>1), D13 = (s13>>1)-d13, the exit contents are: d13 = S02+S13, s13 = D02+D13, d02 = D02-D13, s02 = S02-S13. t is scratch. */ | |
624 #define IDCT4_1D( s02, s13, d02, d13, t ) \ | |
625 SUMSUB_BA ( s02, d02 )\ | |
626 SUMSUBD2_AB( s13, d13, t )\ | |
627 SUMSUB_BADC( d13, s02, s13, d02 ) | |
628 /* SBUTTERFLY: t = copy of a, then a = punpckl<n>(a, b) and t = punpckh<n>(t, b), so a gets the interleaved low halves and t the interleaved high halves. b is only read. n is the unpack suffix (wd or dq in TRANSPOSE4). */ | |
629 #define SBUTTERFLY( a, b, t, n ) \ | |
630 "movq "#a", "#t" \n\t" /* abcd */\ | |
631 "punpckl"#n" "#b", "#a" \n\t" /* aebf */\ | |
632 "punpckh"#n" "#b", "#t" \n\t" /* cgdh */ | |
633 /* TRANSPOSE4: transposes the 4x4 matrix of 16-bit words whose rows are a, b, c, d. The transposed rows are left in a, d, t, c (in that order); b is overwritten as scratch. */ | |
634 #define TRANSPOSE4( a, b, c, d, t ) \ | |
635 SBUTTERFLY( a, b, t, wd ) /* a=aebf t=cgdh */\ | |
636 SBUTTERFLY( c, d, b, wd ) /* c=imjn b=kolp */\ | |
637 SBUTTERFLY( a, c, d, dq ) /* a=aeim d=bfjn */\ | |
638 SBUTTERFLY( t, b, c, dq ) /* t=cgko c=dhlp */ | |
639 /* STORE_DIFF_4P: p >>= 6 (arithmetic); loads 4 bytes from (%0) into t and interleaves them with z to widen to words; adds them to p with signed saturation; packs p to bytes with unsigned saturation and stores 4 bytes back to (%0). z is expected to hold zero; t is scratch; %0 is operand 0 of the enclosing asm statement. */ | |
640 #define STORE_DIFF_4P( p, t, z ) \ | |
641 "psraw $6, "#p" \n\t"\ | |
642 "movd (%0), "#t" \n\t"\ | |
643 "punpcklbw "#z", "#t" \n\t"\ | |
644 "paddsw "#t", "#p" \n\t"\ | |
645 "packuswb "#z", "#p" \n\t"\ | |
646 "movd "#p", (%0) \n\t" | |
647 /* Four packed 16-bit words, each equal to 32 (0x0020). ff_h264_idct_add_mmx2 adds it to one register before its second IDCT4_1D pass, ahead of the >>6 in STORE_DIFF_4P; presumably the rounding term for that shift (32 = 1<<5). */ | |
648 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; | |
649 /* Runs two IDCT4_1D passes (with a TRANSPOSE4 in between) over the four 4x16-bit coefficient rows read from block, then adds the >>6 result to four rows of 4 pixels starting at dst, consecutive rows being stride bytes apart. Uses mm0-mm7. */ | |
650 void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
651 { | |
652 /* Load the four coefficient rows (8 bytes each, at offsets 0, 8, 16, 24) into mm0..mm3 */ | |
653 asm volatile( | |
654 "movq (%0), %%mm0 \n\t" | |
655 "movq 8(%0), %%mm1 \n\t" | |
656 "movq 16(%0), %%mm2 \n\t" | |
657 "movq 24(%0), %%mm3 \n\t" | |
658 :: "r"(block) ); | |
659 /* NOTE(review): MMX register contents are carried across three separate asm statements and none of them lists clobbers (neither the MMX registers nor memory); this relies on the compiler leaving mm0-mm7 untouched in between - confirm this is acceptable. */ | |
660 asm volatile( | |
661 /* on exit: mm3=s02+s13 mm2=s02-s13 mm1=d02+d13 mm0=d02-d13, i.e. output rows 0..3 are mm3, mm1, mm0, mm2 */ | |
662 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) | |
663 /* mm6 = ff_pw_32 (four words of 32) */ | |
664 "movq %0, %%mm6 \n\t" | |
665 /* in (rows 0..3): mm3,mm1,mm0,mm2 out (rows 0..3): mm3,mm2,mm4,mm0; mm1 is left as scratch */ | |
666 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) | |
667 /* add 32 to each word of mm3 (row 0 of the transposed block) ahead of the final >>6 */ | |
668 "paddw %%mm6, %%mm3 \n\t" | |
669 | |
670 /* on exit: mm0=s02+s13 mm4=s02-s13 mm2=d02+d13 mm3=d02-d13, i.e. output rows 0..3 are mm0, mm2, mm3, mm4 */ | |
671 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) | |
672 /* mm7 = 0, the zero operand for STORE_DIFF_4P */ | |
673 "pxor %%mm7, %%mm7 \n\t" | |
674 :: "m"(ff_pw_32)); | |
675 /* Add rows mm0, mm2, mm3, mm4 to the pixels at dst (operand 0), advancing dst by stride (operand 1) between rows; mm1 is scratch. */ | |
676 asm volatile( | |
677 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) | |
678 "add %1, %0 \n\t" | |
679 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) | |
680 "add %1, %0 \n\t" | |
681 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) | |
682 "add %1, %0 \n\t" | |
683 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) | |
684 : "+r"(dst) | |
685 : "r" ((long)stride) | |
686 ); | |
687 } | |