Mercurial > mplayer.hg
comparison liba52/imdct.c @ 16173:d6219ce521e9
liba52 asm optimizations ported to amd64
field | value |
---|---|
author | aurel |
date | Fri, 05 Aug 2005 13:33:50 +0000 |
parents | 130dd060f723 |
children | 72764c0dad8a |
comparison
Legend: equal, deleted, inserted, replaced
16172:ac70488d1f3e | 16173:d6219ce521e9 |
---|---|
99 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, | 99 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, |
100 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, | 100 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, |
101 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, | 101 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, |
102 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | 102 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; |
103 | 103 |
104 #ifdef ARCH_X86 | 104 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
105 // NOTE: SSE needs 16byte alignment or it will segfault | 105 // NOTE: SSE needs 16byte alignment or it will segfault |
106 // | 106 // |
107 static complex_t __attribute__((aligned(16))) buf[128]; | 107 static complex_t __attribute__((aligned(16))) buf[128]; |
108 static float __attribute__((aligned(16))) sseSinCos1c[256]; | 108 static float __attribute__((aligned(16))) sseSinCos1c[256]; |
109 static float __attribute__((aligned(16))) sseSinCos1d[256]; | 109 static float __attribute__((aligned(16))) sseSinCos1d[256]; |
440 { | 440 { |
441 int i; | 441 int i; |
442 int k; | 442 int k; |
443 int p,q; | 443 int p,q; |
444 int m; | 444 int m; |
445 int two_m; | 445 long two_m; |
446 int two_m_plus_one; | 446 long two_m_plus_one; |
447 | 447 |
448 sample_t tmp_b_i; | 448 sample_t tmp_b_i; |
449 sample_t tmp_b_r; | 449 sample_t tmp_b_r; |
450 sample_t tmp_a_i; | 450 sample_t tmp_a_i; |
451 sample_t tmp_a_r; | 451 sample_t tmp_a_r; |
745 #endif | 745 #endif |
746 | 746 |
747 | 747 |
748 // Stuff below this line is borrowed from libac3 | 748 // Stuff below this line is borrowed from libac3 |
749 #include "srfftp.h" | 749 #include "srfftp.h" |
750 #ifdef ARCH_X86 | 750 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
751 #ifndef HAVE_3DNOW | 751 #ifndef HAVE_3DNOW |
752 #define HAVE_3DNOW 1 | 752 #define HAVE_3DNOW 1 |
753 #endif | 753 #endif |
754 #include "srfftp_3dnow.h" | 754 #include "srfftp_3dnow.h" |
755 | 755 |
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) | 766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) |
767 { | 767 { |
768 /* int i,k; | 768 /* int i,k; |
769 int p,q;*/ | 769 int p,q;*/ |
770 int m; | 770 int m; |
771 int two_m; | 771 long two_m; |
772 int two_m_plus_one; | 772 long two_m_plus_one; |
773 int two_m_plus_one_shl3; | 773 long two_m_plus_one_shl3; |
774 complex_t *buf_offset; | 774 complex_t *buf_offset; |
775 | 775 |
776 /* sample_t tmp_a_i; | 776 /* sample_t tmp_a_i; |
777 sample_t tmp_a_r; | 777 sample_t tmp_a_r; |
778 sample_t tmp_b_i; | 778 sample_t tmp_b_i; |
786 /* see the c version (dct_do_512()), its allmost identical, just in C */ | 786 /* see the c version (dct_do_512()), its allmost identical, just in C */ |
787 | 787 |
788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | 788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ |
789 /* Bit reversed shuffling */ | 789 /* Bit reversed shuffling */ |
790 asm volatile( | 790 asm volatile( |
791 "xorl %%esi, %%esi \n\t" | 791 "xor %%"REG_S", %%"REG_S" \n\t" |
792 "leal "MANGLE(bit_reverse_512)", %%eax \n\t" | 792 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t" |
793 "movl $1008, %%edi \n\t" | 793 "mov $1008, %%"REG_D" \n\t" |
794 "pushl %%ebp \n\t" //use ebp without telling gcc | 794 "push %%"REG_BP" \n\t" //use ebp without telling gcc |
795 ".balign 16 \n\t" | 795 ".balign 16 \n\t" |
796 "1: \n\t" | 796 "1: \n\t" |
797 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI | 797 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI |
798 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI | 798 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI |
799 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi | 799 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi |
800 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi | 800 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi |
801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR | 801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR |
802 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" | 802 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t" |
803 "mulps %%xmm0, %%xmm2 \n\t" | 803 "mulps %%xmm0, %%xmm2 \n\t" |
804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI | 804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI |
805 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | 805 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" |
806 "subps %%xmm0, %%xmm2 \n\t" | 806 "subps %%xmm0, %%xmm2 \n\t" |
807 "movzbl (%%eax), %%edx \n\t" | 807 "movzb (%%"REG_a"), %%"REG_d" \n\t" |
808 "movzbl 1(%%eax), %%ebp \n\t" | 808 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t" |
809 "movlps %%xmm2, (%1, %%edx,8) \n\t" | 809 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t" |
810 "movhps %%xmm2, (%1, %%ebp,8) \n\t" | 810 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t" |
811 "addl $16, %%esi \n\t" | 811 "add $16, %%"REG_S" \n\t" |
812 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap | 812 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap |
813 "subl $16, %%edi \n\t" | 813 "sub $16, %%"REG_D" \n\t" |
814 " jnc 1b \n\t" | 814 "jnc 1b \n\t" |
815 "popl %%ebp \n\t"//no we didnt touch ebp *g* | 815 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g* |
816 :: "b" (data), "c" (buf) | 816 :: "r" (data), "r" (buf) |
817 : "%esi", "%edi", "%eax", "%edx" | 817 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d |
818 ); | 818 ); |
819 | 819 |
820 | 820 |
821 /* FFT Merge */ | 821 /* FFT Merge */ |
822 /* unoptimized variant | 822 /* unoptimized variant |
848 /* 1. iteration */ | 848 /* 1. iteration */ |
849 // Note w[0][0]={1,0} | 849 // Note w[0][0]={1,0} |
850 asm volatile( | 850 asm volatile( |
851 "xorps %%xmm1, %%xmm1 \n\t" | 851 "xorps %%xmm1, %%xmm1 \n\t" |
852 "xorps %%xmm2, %%xmm2 \n\t" | 852 "xorps %%xmm2, %%xmm2 \n\t" |
853 "movl %0, %%esi \n\t" | 853 "mov %0, %%"REG_S" \n\t" |
854 ".balign 16 \n\t" | 854 ".balign 16 \n\t" |
855 "1: \n\t" | 855 "1: \n\t" |
856 "movlps (%%esi), %%xmm0 \n\t" //buf[p] | 856 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p] |
857 "movlps 8(%%esi), %%xmm1\n\t" //buf[q] | 857 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q] |
858 "movhps (%%esi), %%xmm0 \n\t" //buf[p] | 858 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p] |
859 "movhps 8(%%esi), %%xmm2\n\t" //buf[q] | 859 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q] |
860 "addps %%xmm1, %%xmm0 \n\t" | 860 "addps %%xmm1, %%xmm0 \n\t" |
861 "subps %%xmm2, %%xmm0 \n\t" | 861 "subps %%xmm2, %%xmm0 \n\t" |
862 "movaps %%xmm0, (%%esi) \n\t" | 862 "movaps %%xmm0, (%%"REG_S")\n\t" |
863 "addl $16, %%esi \n\t" | 863 "add $16, %%"REG_S" \n\t" |
864 "cmpl %1, %%esi \n\t" | 864 "cmp %1, %%"REG_S" \n\t" |
865 " jb 1b \n\t" | 865 " jb 1b \n\t" |
866 :: "g" (buf), "r" (buf + 128) | 866 :: "g" (buf), "r" (buf + 128) |
867 : "%esi" | 867 : "%"REG_S |
868 ); | 868 ); |
869 | 869 |
870 /* 2. iteration */ | 870 /* 2. iteration */ |
871 // Note w[1]={{1,0}, {0,-1}} | 871 // Note w[1]={{1,0}, {0,-1}} |
872 asm volatile( | 872 asm volatile( |
873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 | 873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 |
874 "movl %0, %%esi \n\t" | 874 "mov %0, %%"REG_S" \n\t" |
875 ".balign 16 \n\t" | 875 ".balign 16 \n\t" |
876 "1: \n\t" | 876 "1: \n\t" |
877 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 | 877 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3 |
878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | 878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 |
879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | 879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 |
880 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | 880 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 |
881 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 | 881 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1 |
882 "addps %%xmm2, %%xmm0 \n\t" | 882 "addps %%xmm2, %%xmm0 \n\t" |
883 "subps %%xmm2, %%xmm1 \n\t" | 883 "subps %%xmm2, %%xmm1 \n\t" |
884 "movaps %%xmm0, (%%esi) \n\t" | 884 "movaps %%xmm0, (%%"REG_S") \n\t" |
885 "movaps %%xmm1, 16(%%esi) \n\t" | 885 "movaps %%xmm1, 16(%%"REG_S") \n\t" |
886 "addl $32, %%esi \n\t" | 886 "add $32, %%"REG_S" \n\t" |
887 "cmpl %1, %%esi \n\t" | 887 "cmp %1, %%"REG_S" \n\t" |
888 " jb 1b \n\t" | 888 " jb 1b \n\t" |
889 :: "g" (buf), "r" (buf + 128) | 889 :: "g" (buf), "r" (buf + 128) |
890 : "%esi" | 890 : "%"REG_S |
891 ); | 891 ); |
892 | 892 |
893 /* 3. iteration */ | 893 /* 3. iteration */ |
894 /* | 894 /* |
895 Note sseW2+0={1,1,sqrt(2),sqrt(2)) | 895 Note sseW2+0={1,1,sqrt(2),sqrt(2)) |
900 asm volatile( | 900 asm volatile( |
901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" | 901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" |
902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" | 902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" |
903 "xorps %%xmm5, %%xmm5 \n\t" | 903 "xorps %%xmm5, %%xmm5 \n\t" |
904 "xorps %%xmm2, %%xmm2 \n\t" | 904 "xorps %%xmm2, %%xmm2 \n\t" |
905 "movl %0, %%esi \n\t" | 905 "mov %0, %%"REG_S" \n\t" |
906 ".balign 16 \n\t" | 906 ".balign 16 \n\t" |
907 "1: \n\t" | 907 "1: \n\t" |
908 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 | 908 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5 |
909 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 | 909 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7 |
910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 | 910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 |
911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 | 911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 |
912 "mulps %%xmm2, %%xmm4 \n\t" | 912 "mulps %%xmm2, %%xmm4 \n\t" |
913 "mulps %%xmm3, %%xmm5 \n\t" | 913 "mulps %%xmm3, %%xmm5 \n\t" |
914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 | 914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 |
915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | 915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 |
916 "mulps %%xmm6, %%xmm3 \n\t" | 916 "mulps %%xmm6, %%xmm3 \n\t" |
917 "mulps %%xmm7, %%xmm2 \n\t" | 917 "mulps %%xmm7, %%xmm2 \n\t" |
918 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | 918 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1 |
919 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | 919 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3 |
920 "addps %%xmm4, %%xmm2 \n\t" | 920 "addps %%xmm4, %%xmm2 \n\t" |
921 "addps %%xmm5, %%xmm3 \n\t" | 921 "addps %%xmm5, %%xmm3 \n\t" |
922 "movaps %%xmm2, %%xmm4 \n\t" | 922 "movaps %%xmm2, %%xmm4 \n\t" |
923 "movaps %%xmm3, %%xmm5 \n\t" | 923 "movaps %%xmm3, %%xmm5 \n\t" |
924 "addps %%xmm0, %%xmm2 \n\t" | 924 "addps %%xmm0, %%xmm2 \n\t" |
925 "addps %%xmm1, %%xmm3 \n\t" | 925 "addps %%xmm1, %%xmm3 \n\t" |
926 "subps %%xmm4, %%xmm0 \n\t" | 926 "subps %%xmm4, %%xmm0 \n\t" |
927 "subps %%xmm5, %%xmm1 \n\t" | 927 "subps %%xmm5, %%xmm1 \n\t" |
928 "movaps %%xmm2, (%%esi) \n\t" | 928 "movaps %%xmm2, (%%"REG_S") \n\t" |
929 "movaps %%xmm3, 16(%%esi) \n\t" | 929 "movaps %%xmm3, 16(%%"REG_S") \n\t" |
930 "movaps %%xmm0, 32(%%esi) \n\t" | 930 "movaps %%xmm0, 32(%%"REG_S") \n\t" |
931 "movaps %%xmm1, 48(%%esi) \n\t" | 931 "movaps %%xmm1, 48(%%"REG_S") \n\t" |
932 "addl $64, %%esi \n\t" | 932 "add $64, %%"REG_S" \n\t" |
933 "cmpl %1, %%esi \n\t" | 933 "cmp %1, %%"REG_S" \n\t" |
934 " jb 1b \n\t" | 934 " jb 1b \n\t" |
935 :: "g" (buf), "r" (buf + 128) | 935 :: "g" (buf), "r" (buf + 128) |
936 : "%esi" | 936 : "%"REG_S |
937 ); | 937 ); |
938 | 938 |
939 /* 4-7. iterations */ | 939 /* 4-7. iterations */ |
940 for (m=3; m < 7; m++) { | 940 for (m=3; m < 7; m++) { |
941 two_m = (1 << m); | 941 two_m = (1 << m); |
942 two_m_plus_one = two_m<<1; | 942 two_m_plus_one = two_m<<1; |
943 two_m_plus_one_shl3 = (two_m_plus_one<<3); | 943 two_m_plus_one_shl3 = (two_m_plus_one<<3); |
944 buf_offset = buf+128; | 944 buf_offset = buf+128; |
945 asm volatile( | 945 asm volatile( |
946 "movl %0, %%esi \n\t" | 946 "mov %0, %%"REG_S" \n\t" |
947 ".balign 16 \n\t" | 947 ".balign 16 \n\t" |
948 "1: \n\t" | 948 "1: \n\t" |
949 "xorl %%edi, %%edi \n\t" // k | 949 "xor %%"REG_D", %%"REG_D" \n\t" // k |
950 "leal (%%esi, %3), %%edx \n\t" | 950 "lea (%%"REG_S", %3), %%"REG_d" \n\t" |
951 "2: \n\t" | 951 "2: \n\t" |
952 "movaps (%%edx, %%edi), %%xmm1 \n\t" | 952 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t" |
953 "movaps (%4, %%edi, 2), %%xmm2 \n\t" | 953 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t" |
954 "mulps %%xmm1, %%xmm2 \n\t" | 954 "mulps %%xmm1, %%xmm2 \n\t" |
955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t" | 955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t" |
956 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" | 956 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t" |
957 "movaps (%%esi, %%edi), %%xmm0 \n\t" | 957 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t" |
958 "addps %%xmm2, %%xmm1 \n\t" | 958 "addps %%xmm2, %%xmm1 \n\t" |
959 "movaps %%xmm1, %%xmm2 \n\t" | 959 "movaps %%xmm1, %%xmm2 \n\t" |
960 "addps %%xmm0, %%xmm1 \n\t" | 960 "addps %%xmm0, %%xmm1 \n\t" |
961 "subps %%xmm2, %%xmm0 \n\t" | 961 "subps %%xmm2, %%xmm0 \n\t" |
962 "movaps %%xmm1, (%%esi, %%edi) \n\t" | 962 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t" |
963 "movaps %%xmm0, (%%edx, %%edi) \n\t" | 963 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t" |
964 "addl $16, %%edi \n\t" | 964 "add $16, %%"REG_D" \n\t" |
965 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 | 965 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0 |
966 " jb 2b \n\t" | 966 "jb 2b \n\t" |
967 "addl %2, %%esi \n\t" | 967 "add %2, %%"REG_S" \n\t" |
968 "cmpl %1, %%esi \n\t" | 968 "cmp %1, %%"REG_S" \n\t" |
969 " jb 1b \n\t" | 969 " jb 1b \n\t" |
970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), | 970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), |
971 "r" (sseW[m]) | 971 "r" (sseW[m]) |
972 : "%esi", "%edi", "%edx" | 972 : "%"REG_S, "%"REG_D, "%"REG_d |
973 ); | 973 ); |
974 } | 974 } |
975 | 975 |
976 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | 976 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
977 asm volatile( | 977 asm volatile( |
978 "movl $-1024, %%esi \n\t" | 978 "mov $-1024, %%"REG_S" \n\t" |
979 ".balign 16 \n\t" | 979 ".balign 16 \n\t" |
980 "1: \n\t" | 980 "1: \n\t" |
981 "movaps (%0, %%esi), %%xmm0 \n\t" | 981 "movaps (%0, %%"REG_S"), %%xmm0 \n\t" |
982 "movaps (%0, %%esi), %%xmm1 \n\t" | 982 "movaps (%0, %%"REG_S"), %%xmm1 \n\t" |
983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | 983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" |
984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" | 984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t" |
985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | 985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t" |
986 "addps %%xmm1, %%xmm0 \n\t" | 986 "addps %%xmm1, %%xmm0 \n\t" |
987 "movaps %%xmm0, (%0, %%esi) \n\t" | 987 "movaps %%xmm0, (%0, %%"REG_S") \n\t" |
988 "addl $16, %%esi \n\t" | 988 "add $16, %%"REG_S" \n\t" |
989 " jnz 1b \n\t" | 989 " jnz 1b \n\t" |
990 :: "r" (buf+128) | 990 :: "r" (buf+128) |
991 : "%esi" | 991 : "%"REG_S |
992 ); | 992 ); |
993 | 993 |
994 | 994 |
995 data_ptr = data; | 995 data_ptr = data; |
996 delay_ptr = delay; | 996 delay_ptr = delay; |
997 window_ptr = imdct_window; | 997 window_ptr = imdct_window; |
998 | 998 |
999 /* Window and convert to real valued signal */ | 999 /* Window and convert to real valued signal */ |
1000 asm volatile( | 1000 asm volatile( |
1001 "xorl %%edi, %%edi \n\t" // 0 | 1001 "xor %%"REG_D", %%"REG_D" \n\t" // 0 |
1002 "xorl %%esi, %%esi \n\t" // 0 | 1002 "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
1003 "movss %3, %%xmm2 \n\t" // bias | 1003 "movss %3, %%xmm2 \n\t" // bias |
1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | 1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... |
1005 ".balign 16 \n\t" | 1005 ".balign 16 \n\t" |
1006 "1: \n\t" | 1006 "1: \n\t" |
1007 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | 1007 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? |
1008 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | 1008 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? |
1009 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | 1009 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? |
1010 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | 1010 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? |
1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | 1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A |
1012 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | 1012 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" |
1013 "addps (%2, %%esi), %%xmm0 \n\t" | 1013 "addps (%2, %%"REG_S"), %%xmm0 \n\t" |
1014 "addps %%xmm2, %%xmm0 \n\t" | 1014 "addps %%xmm2, %%xmm0 \n\t" |
1015 "movaps %%xmm0, (%1, %%esi) \n\t" | 1015 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
1016 "addl $16, %%esi \n\t" | 1016 "add $16, %%"REG_S" \n\t" |
1017 "subl $16, %%edi \n\t" | 1017 "sub $16, %%"REG_D" \n\t" |
1018 "cmpl $512, %%esi \n\t" | 1018 "cmp $512, %%"REG_S" \n\t" |
1019 " jb 1b \n\t" | 1019 " jb 1b \n\t" |
1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | 1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) |
1021 : "%esi", "%edi" | 1021 : "%"REG_S, "%"REG_D |
1022 ); | 1022 ); |
1023 data_ptr+=128; | 1023 data_ptr+=128; |
1024 delay_ptr+=128; | 1024 delay_ptr+=128; |
1025 // window_ptr+=128; | 1025 // window_ptr+=128; |
1026 | 1026 |
1027 asm volatile( | 1027 asm volatile( |
1028 "movl $1024, %%edi \n\t" // 512 | 1028 "mov $1024, %%"REG_D" \n\t" // 512 |
1029 "xorl %%esi, %%esi \n\t" // 0 | 1029 "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
1030 "movss %3, %%xmm2 \n\t" // bias | 1030 "movss %3, %%xmm2 \n\t" // bias |
1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | 1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... |
1032 ".balign 16 \n\t" | 1032 ".balign 16 \n\t" |
1033 "1: \n\t" | 1033 "1: \n\t" |
1034 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | 1034 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A |
1035 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | 1035 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C |
1036 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | 1036 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C |
1037 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | 1037 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A |
1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | 1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A |
1039 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | 1039 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" |
1040 "addps (%2, %%esi), %%xmm0 \n\t" | 1040 "addps (%2, %%"REG_S"), %%xmm0 \n\t" |
1041 "addps %%xmm2, %%xmm0 \n\t" | 1041 "addps %%xmm2, %%xmm0 \n\t" |
1042 "movaps %%xmm0, (%1, %%esi) \n\t" | 1042 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
1043 "addl $16, %%esi \n\t" | 1043 "add $16, %%"REG_S" \n\t" |
1044 "subl $16, %%edi \n\t" | 1044 "sub $16, %%"REG_D" \n\t" |
1045 "cmpl $512, %%esi \n\t" | 1045 "cmp $512, %%"REG_S" \n\t" |
1046 " jb 1b \n\t" | 1046 " jb 1b \n\t" |
1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | 1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) |
1048 : "%esi", "%edi" | 1048 : "%"REG_S, "%"REG_D |
1049 ); | 1049 ); |
1050 data_ptr+=128; | 1050 data_ptr+=128; |
1051 // window_ptr+=128; | 1051 // window_ptr+=128; |
1052 | 1052 |
1053 /* The trailing edge of the window goes into the delay line */ | 1053 /* The trailing edge of the window goes into the delay line */ |
1054 delay_ptr = delay; | 1054 delay_ptr = delay; |
1055 | 1055 |
1056 asm volatile( | 1056 asm volatile( |
1057 "xorl %%edi, %%edi \n\t" // 0 | 1057 "xor %%"REG_D", %%"REG_D" \n\t" // 0 |
1058 "xorl %%esi, %%esi \n\t" // 0 | 1058 "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
1059 ".balign 16 \n\t" | 1059 ".balign 16 \n\t" |
1060 "1: \n\t" | 1060 "1: \n\t" |
1061 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | 1061 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A |
1062 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | 1062 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C |
1063 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | 1063 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C |
1064 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | 1064 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A |
1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | 1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A |
1066 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | 1066 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" |
1067 "movaps %%xmm0, (%1, %%esi) \n\t" | 1067 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
1068 "addl $16, %%esi \n\t" | 1068 "add $16, %%"REG_S" \n\t" |
1069 "subl $16, %%edi \n\t" | 1069 "sub $16, %%"REG_D" \n\t" |
1070 "cmpl $512, %%esi \n\t" | 1070 "cmp $512, %%"REG_S" \n\t" |
1071 " jb 1b \n\t" | 1071 " jb 1b \n\t" |
1072 :: "r" (buf+64), "r" (delay_ptr) | 1072 :: "r" (buf+64), "r" (delay_ptr) |
1073 : "%esi", "%edi" | 1073 : "%"REG_S, "%"REG_D |
1074 ); | 1074 ); |
1075 delay_ptr+=128; | 1075 delay_ptr+=128; |
1076 // window_ptr-=128; | 1076 // window_ptr-=128; |
1077 | 1077 |
1078 asm volatile( | 1078 asm volatile( |
1079 "movl $1024, %%edi \n\t" // 1024 | 1079 "mov $1024, %%"REG_D" \n\t" // 1024 |
1080 "xorl %%esi, %%esi \n\t" // 0 | 1080 "xor %%"REG_S", %%"REG_S" \n\t" // 0 |
1081 ".balign 16 \n\t" | 1081 ".balign 16 \n\t" |
1082 "1: \n\t" | 1082 "1: \n\t" |
1083 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | 1083 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ? |
1084 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | 1084 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ? |
1085 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | 1085 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ? |
1086 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | 1086 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ? |
1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | 1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A |
1088 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | 1088 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t" |
1089 "movaps %%xmm0, (%1, %%esi) \n\t" | 1089 "movaps %%xmm0, (%1, %%"REG_S") \n\t" |
1090 "addl $16, %%esi \n\t" | 1090 "add $16, %%"REG_S" \n\t" |
1091 "subl $16, %%edi \n\t" | 1091 "sub $16, %%"REG_D" \n\t" |
1092 "cmpl $512, %%esi \n\t" | 1092 "cmp $512, %%"REG_S" \n\t" |
1093 " jb 1b \n\t" | 1093 " jb 1b \n\t" |
1094 :: "r" (buf), "r" (delay_ptr) | 1094 :: "r" (buf), "r" (delay_ptr) |
1095 : "%esi", "%edi" | 1095 : "%"REG_S, "%"REG_D |
1096 ); | 1096 ); |
1097 } | 1097 } |
1098 #endif //arch_x86 | 1098 #endif // ARCH_X86 || ARCH_X86_64 |
1099 | 1099 |
1100 void | 1100 void |
1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | 1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) |
1102 { | 1102 { |
1103 int i,k; | 1103 int i,k; |
1240 /* Twiddle factors to turn IFFT into IMDCT */ | 1240 /* Twiddle factors to turn IFFT into IMDCT */ |
1241 for (i = 0; i < 128; i++) { | 1241 for (i = 0; i < 128; i++) { |
1242 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | 1242 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); |
1243 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | 1243 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); |
1244 } | 1244 } |
1245 #ifdef ARCH_X86 | 1245 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1246 for (i = 0; i < 128; i++) { | 1246 for (i = 0; i < 128; i++) { |
1247 sseSinCos1c[2*i+0]= xcos1[i]; | 1247 sseSinCos1c[2*i+0]= xcos1[i]; |
1248 sseSinCos1c[2*i+1]= -xcos1[i]; | 1248 sseSinCos1c[2*i+1]= -xcos1[i]; |
1249 sseSinCos1d[2*i+0]= xsin1[i]; | 1249 sseSinCos1d[2*i+0]= xsin1[i]; |
1250 sseSinCos1d[2*i+1]= xsin1[i]; | 1250 sseSinCos1d[2*i+1]= xsin1[i]; |
1262 for (k = 0; k < j; k++) { | 1262 for (k = 0; k < j; k++) { |
1263 w[i][k].real = cos (-M_PI * k / j); | 1263 w[i][k].real = cos (-M_PI * k / j); |
1264 w[i][k].imag = sin (-M_PI * k / j); | 1264 w[i][k].imag = sin (-M_PI * k / j); |
1265 } | 1265 } |
1266 } | 1266 } |
1267 #ifdef ARCH_X86 | 1267 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1268 for (i = 1; i < 7; i++) { | 1268 for (i = 1; i < 7; i++) { |
1269 j = 1 << i; | 1269 j = 1 << i; |
1270 for (k = 0; k < j; k+=2) { | 1270 for (k = 0; k < j; k+=2) { |
1271 | 1271 |
1272 sseW[i][4*k + 0] = w[i][k+0].real; | 1272 sseW[i][4*k + 0] = w[i][k+0].real; |
1305 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | 1305 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; |
1306 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | 1306 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; |
1307 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | 1307 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; |
1308 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | 1308 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; |
1309 } | 1309 } |
1310 #endif // arch_x86 | 1310 #endif // ARCH_X86 || ARCH_X86_64 |
1311 | 1311 |
1312 imdct_512 = imdct_do_512; | 1312 imdct_512 = imdct_do_512; |
1313 #ifdef ARCH_X86 | 1313 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1314 if(mm_accel & MM_ACCEL_X86_SSE) | 1314 if(mm_accel & MM_ACCEL_X86_SSE) |
1315 { | 1315 { |
1316 fprintf (stderr, "Using SSE optimized IMDCT transform\n"); | 1316 fprintf (stderr, "Using SSE optimized IMDCT transform\n"); |
1317 imdct_512 = imdct_do_512_sse; | 1317 imdct_512 = imdct_do_512_sse; |
1318 } | 1318 } |
1327 { | 1327 { |
1328 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); | 1328 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); |
1329 imdct_512 = imdct_do_512_3dnow; | 1329 imdct_512 = imdct_do_512_3dnow; |
1330 } | 1330 } |
1331 else | 1331 else |
1332 #endif // arch_x86 | 1332 #endif // ARCH_X86 || ARCH_X86_64 |
1333 #ifdef HAVE_ALTIVEC | 1333 #ifdef HAVE_ALTIVEC |
1334 if (mm_accel & MM_ACCEL_PPC_ALTIVEC) | 1334 if (mm_accel & MM_ACCEL_PPC_ALTIVEC) |
1335 { | 1335 { |
1336 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); | 1336 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); |
1337 imdct_512 = imdct_do_512_altivec; | 1337 imdct_512 = imdct_do_512_altivec; |