comparison i386/idct_mmx.c @ 2754:a49f140179e9 libavcodec

sort H.264 mmx dsp functions into their own file
author lorenm
date Thu, 02 Jun 2005 20:45:35 +0000
parents 23665209e823
children b128802eb77b
comparison
equal deleted inserted replaced
2753:ba8ecddf5598 2754:a49f140179e9
596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) 596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
597 597
598 declare_idct (ff_mmx_idct, mmx_table, 598 declare_idct (ff_mmx_idct, mmx_table,
599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) 599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
600 600
601
602
603 /* in/out: mma=mma+mmb, mmb=mmb-mma */
/* Packed-word butterfly on two MMX registers, in place.
 * With A,B the values on entry: a' = A+B, b' = B-A.
 * The middle paddw doubles b so that b = 2B - (A+B) = B-A can be
 * formed by the final psubw without needing a scratch register. */
604 #define SUMSUB_BA( a, b ) \
605 "paddw "#b", "#a" \n\t"\
606 "paddw "#b", "#b" \n\t"\
607 "psubw "#a", "#b" \n\t"
608
/* Two independent packed-word butterflies, in place:
 * a' = a+b, b' = b-a  and  c' = c+d, d' = d-c (inputs on entry).
 * Same temp-free trick as SUMSUB_BA; the two butterflies are
 * interleaved, presumably to expose instruction-level parallelism
 * on the MMX pipelines. */
609 #define SUMSUB_BADC( a, b, c, d ) \
610 "paddw "#b", "#a" \n\t"\
611 "paddw "#d", "#c" \n\t"\
612 "paddw "#b", "#b" \n\t"\
613 "paddw "#d", "#d" \n\t"\
614 "psubw "#a", "#b" \n\t"\
615 "psubw "#c", "#d" \n\t"
616
/* Half-butterfly with arithmetic right shifts, in place.
 * With A,B the values on entry and t a scratch register:
 *   b' = A + (B>>1),  a' = (A>>1) - B.
 * This is the odd-part rotation of the H.264 4-point inverse
 * transform (coefficients 1 and 1/2). */
617 #define SUMSUBD2_AB( a, b, t ) \
618 "movq "#b", "#t" \n\t"\
619 "psraw $1 , "#b" \n\t"\
620 "paddw "#a", "#b" \n\t"\
621 "psraw $1 , "#a" \n\t"\
622 "psubw "#t", "#a" \n\t"
623
/* One 1-D pass of the H.264 4-point inverse transform over four
 * packed-word registers (four columns at once).
 * Even part: SUMSUB_BA on (s02,d02); odd part: SUMSUBD2_AB on
 * (s13,d13); final SUMSUB_BADC combines even and odd halves.
 * t is a scratch register.  Note the outputs land in permuted
 * registers — see the register-mapping comments at the call sites. */
624 #define IDCT4_1D( s02, s13, d02, d13, t ) \
625 SUMSUB_BA ( s02, d02 )\
626 SUMSUBD2_AB( s13, d13, t )\
627 SUMSUB_BADC( d13, s02, s13, d02 )
628
/* Interleave (butterfly) step for the transpose below.
 * n selects element width: "wd" (words) or "dq" (dwords).
 * a' = low halves of a,b interleaved; t = high halves interleaved
 * (a is first copied to t so its high half survives the punpckl). */
629 #define SBUTTERFLY( a, b, t, n ) \
630 "movq "#a", "#t" \n\t" /* abcd */\
631 "punpckl"#n" "#b", "#a" \n\t" /* aebf */\
632 "punpckh"#n" "#b", "#t" \n\t" /* cgdh */
633
/* Transpose a 4x4 matrix of words held in registers a,b,c,d (one
 * row per register), using t as scratch: two rounds of word-wide
 * then dword-wide butterflies.  The rows come out in permuted
 * registers — the per-step comments track which register holds
 * which elements; callers must account for the final mapping. */
634 #define TRANSPOSE4( a, b, c, d, t ) \
635 SBUTTERFLY( a, b, t, wd ) /* a=aebf t=cgdh */\
636 SBUTTERFLY( c, d, b, wd ) /* c=imjn b=kolp */\
637 SBUTTERFLY( a, c, d, dq ) /* a=aeim d=bfjn */\
638 SBUTTERFLY( t, b, c, dq ) /* t=cgko c=dhlp */
639
/* Add one row of four residuals (packed words in p, already
 * including the +32 rounding bias) to the four pixels at (%0):
 * scale by >>6, load and zero-extend the pixels (z must hold zero,
 * t is scratch), add with signed saturation, pack back to bytes
 * with unsigned saturation, and store.  Expects %0 = dst pointer
 * in the enclosing asm. */
640 #define STORE_DIFF_4P( p, t, z ) \
641 "psraw $6, "#p" \n\t"\
642 "movd (%0), "#t" \n\t"\
643 "punpcklbw "#z", "#t" \n\t"\
644 "paddsw "#t", "#p" \n\t"\
645 "packuswb "#z", "#p" \n\t"\
646 "movd "#p", (%0) \n\t"
647
/* Four packed words of value 32: rounding bias added before the
 * final >>6 in STORE_DIFF_4P (round-to-nearest).  8-byte aligned
 * for the movq that loads it. */
648 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
649
/* H.264 4x4 inverse transform of the coefficients in `block`
 * (16 int16_t), result added with unsigned saturation to the 4x4
 * pixel area at `dst` (rows `stride` bytes apart).
 * Implemented as column pass -> transpose -> row pass -> store,
 * entirely in MMX registers.  NOTE(review): the three asm
 * statements deliberately share MMX register state across
 * statement boundaries; they must not be reordered, merged with
 * other code, or separated by anything that clobbers MMX regs.
 * No emms here — presumably the caller handles FPU state; verify
 * against the call sites. */
650 void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
651 {
652 /* Load dct coeffs */
653 asm volatile(
654 "movq (%0), %%mm0 \n\t"
655 "movq 8(%0), %%mm1 \n\t"
656 "movq 16(%0), %%mm2 \n\t"
657 "movq 24(%0), %%mm3 \n\t"
658 :: "r"(block) );
659
660 asm volatile(
/* First (vertical) pass over the four columns. */
661 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
662 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
663
/* mm6 = ff_pw_32: the +32 rounding bias for the final >>6. */
664 "movq %0, %%mm6 \n\t"
665 /* in: 1,4,0,2 out: 1,2,3,0 */
666 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
667
/* Add the bias to one input row of the second pass so it
 * propagates to every output sample through the transform. */
668 "paddw %%mm6, %%mm3 \n\t"
669
/* Second (horizontal) pass. */
670 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
671 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
672
/* mm7 := 0, used as the zero register by STORE_DIFF_4P below. */
673 "pxor %%mm7, %%mm7 \n\t"
674 :: "m"(ff_pw_32));
675
/* Add the four result rows (mm0,mm2,mm3,mm4) to the destination,
 * one row per STORE_DIFF_4P, stepping dst by stride in between.
 * mm1 is scratch; mm7 holds zero. */
676 asm volatile(
677 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
678 "add %1, %0 \n\t"
679 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
680 "add %1, %0 \n\t"
681 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
682 "add %1, %0 \n\t"
683 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
684 : "+r"(dst)
685 : "r" ((long)stride)
686 );
687 }