comparison liba52/imdct.c @ 16173:d6219ce521e9

liba52 asm optimizations ported to amd64
author aurel
date Fri, 05 Aug 2005 13:33:50 +0000
parents 130dd060f723
children 72764c0dad8a
comparison
equal deleted inserted replaced
16172:ac70488d1f3e 16173:d6219ce521e9
99 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, 99 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
100 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, 100 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
101 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, 101 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
102 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; 102 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
103 103
104 #ifdef ARCH_X86 104 #if defined(ARCH_X86) || defined(ARCH_X86_64)
105 // NOTE: SSE needs 16byte alignment or it will segfault 105 // NOTE: SSE needs 16byte alignment or it will segfault
106 // 106 //
107 static complex_t __attribute__((aligned(16))) buf[128]; 107 static complex_t __attribute__((aligned(16))) buf[128];
108 static float __attribute__((aligned(16))) sseSinCos1c[256]; 108 static float __attribute__((aligned(16))) sseSinCos1c[256];
109 static float __attribute__((aligned(16))) sseSinCos1d[256]; 109 static float __attribute__((aligned(16))) sseSinCos1d[256];
440 { 440 {
441 int i; 441 int i;
442 int k; 442 int k;
443 int p,q; 443 int p,q;
444 int m; 444 int m;
445 int two_m; 445 long two_m;
446 int two_m_plus_one; 446 long two_m_plus_one;
447 447
448 sample_t tmp_b_i; 448 sample_t tmp_b_i;
449 sample_t tmp_b_r; 449 sample_t tmp_b_r;
450 sample_t tmp_a_i; 450 sample_t tmp_a_i;
451 sample_t tmp_a_r; 451 sample_t tmp_a_r;
745 #endif 745 #endif
746 746
747 747
748 // Stuff below this line is borrowed from libac3 748 // Stuff below this line is borrowed from libac3
749 #include "srfftp.h" 749 #include "srfftp.h"
750 #ifdef ARCH_X86 750 #if defined(ARCH_X86) || defined(ARCH_X86_64)
751 #ifndef HAVE_3DNOW 751 #ifndef HAVE_3DNOW
752 #define HAVE_3DNOW 1 752 #define HAVE_3DNOW 1
753 #endif 753 #endif
754 #include "srfftp_3dnow.h" 754 #include "srfftp_3dnow.h"
755 755
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) 766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
767 { 767 {
768 /* int i,k; 768 /* int i,k;
769 int p,q;*/ 769 int p,q;*/
770 int m; 770 int m;
771 int two_m; 771 long two_m;
772 int two_m_plus_one; 772 long two_m_plus_one;
773 int two_m_plus_one_shl3; 773 long two_m_plus_one_shl3;
774 complex_t *buf_offset; 774 complex_t *buf_offset;
775 775
776 /* sample_t tmp_a_i; 776 /* sample_t tmp_a_i;
777 sample_t tmp_a_r; 777 sample_t tmp_a_r;
778 sample_t tmp_b_i; 778 sample_t tmp_b_i;
786 /* see the c version (dct_do_512()), its allmost identical, just in C */ 786 /* see the c version (dct_do_512()), its allmost identical, just in C */
787 787
788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ 788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
789 /* Bit reversed shuffling */ 789 /* Bit reversed shuffling */
790 asm volatile( 790 asm volatile(
791 "xorl %%esi, %%esi \n\t" 791 "xor %%"REG_S", %%"REG_S" \n\t"
792 "leal "MANGLE(bit_reverse_512)", %%eax \n\t" 792 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
793 "movl $1008, %%edi \n\t" 793 "mov $1008, %%"REG_D" \n\t"
794 "pushl %%ebp \n\t" //use ebp without telling gcc 794 "push %%"REG_BP" \n\t" //use ebp without telling gcc
795 ".balign 16 \n\t" 795 ".balign 16 \n\t"
796 "1: \n\t" 796 "1: \n\t"
797 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI 797 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
798 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI 798 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
799 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi 799 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
800 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi 800 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR 801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
802 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" 802 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
803 "mulps %%xmm0, %%xmm2 \n\t" 803 "mulps %%xmm0, %%xmm2 \n\t"
804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI 804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
805 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" 805 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
806 "subps %%xmm0, %%xmm2 \n\t" 806 "subps %%xmm0, %%xmm2 \n\t"
807 "movzbl (%%eax), %%edx \n\t" 807 "movzb (%%"REG_a"), %%"REG_d" \n\t"
808 "movzbl 1(%%eax), %%ebp \n\t" 808 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
809 "movlps %%xmm2, (%1, %%edx,8) \n\t" 809 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
810 "movhps %%xmm2, (%1, %%ebp,8) \n\t" 810 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
811 "addl $16, %%esi \n\t" 811 "add $16, %%"REG_S" \n\t"
812 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap 812 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
813 "subl $16, %%edi \n\t" 813 "sub $16, %%"REG_D" \n\t"
814 " jnc 1b \n\t" 814 "jnc 1b \n\t"
815 "popl %%ebp \n\t"//no we didnt touch ebp *g* 815 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
816 :: "b" (data), "c" (buf) 816 :: "r" (data), "r" (buf)
817 : "%esi", "%edi", "%eax", "%edx" 817 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
818 ); 818 );
819 819
820 820
821 /* FFT Merge */ 821 /* FFT Merge */
822 /* unoptimized variant 822 /* unoptimized variant
848 /* 1. iteration */ 848 /* 1. iteration */
849 // Note w[0][0]={1,0} 849 // Note w[0][0]={1,0}
850 asm volatile( 850 asm volatile(
851 "xorps %%xmm1, %%xmm1 \n\t" 851 "xorps %%xmm1, %%xmm1 \n\t"
852 "xorps %%xmm2, %%xmm2 \n\t" 852 "xorps %%xmm2, %%xmm2 \n\t"
853 "movl %0, %%esi \n\t" 853 "mov %0, %%"REG_S" \n\t"
854 ".balign 16 \n\t" 854 ".balign 16 \n\t"
855 "1: \n\t" 855 "1: \n\t"
856 "movlps (%%esi), %%xmm0 \n\t" //buf[p] 856 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
857 "movlps 8(%%esi), %%xmm1\n\t" //buf[q] 857 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
858 "movhps (%%esi), %%xmm0 \n\t" //buf[p] 858 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
859 "movhps 8(%%esi), %%xmm2\n\t" //buf[q] 859 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
860 "addps %%xmm1, %%xmm0 \n\t" 860 "addps %%xmm1, %%xmm0 \n\t"
861 "subps %%xmm2, %%xmm0 \n\t" 861 "subps %%xmm2, %%xmm0 \n\t"
862 "movaps %%xmm0, (%%esi) \n\t" 862 "movaps %%xmm0, (%%"REG_S")\n\t"
863 "addl $16, %%esi \n\t" 863 "add $16, %%"REG_S" \n\t"
864 "cmpl %1, %%esi \n\t" 864 "cmp %1, %%"REG_S" \n\t"
865 " jb 1b \n\t" 865 " jb 1b \n\t"
866 :: "g" (buf), "r" (buf + 128) 866 :: "g" (buf), "r" (buf + 128)
867 : "%esi" 867 : "%"REG_S
868 ); 868 );
869 869
870 /* 2. iteration */ 870 /* 2. iteration */
871 // Note w[1]={{1,0}, {0,-1}} 871 // Note w[1]={{1,0}, {0,-1}}
872 asm volatile( 872 asm volatile(
873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
874 "movl %0, %%esi \n\t" 874 "mov %0, %%"REG_S" \n\t"
875 ".balign 16 \n\t" 875 ".balign 16 \n\t"
876 "1: \n\t" 876 "1: \n\t"
877 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 877 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
880 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 880 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
881 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 881 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
882 "addps %%xmm2, %%xmm0 \n\t" 882 "addps %%xmm2, %%xmm0 \n\t"
883 "subps %%xmm2, %%xmm1 \n\t" 883 "subps %%xmm2, %%xmm1 \n\t"
884 "movaps %%xmm0, (%%esi) \n\t" 884 "movaps %%xmm0, (%%"REG_S") \n\t"
885 "movaps %%xmm1, 16(%%esi) \n\t" 885 "movaps %%xmm1, 16(%%"REG_S") \n\t"
886 "addl $32, %%esi \n\t" 886 "add $32, %%"REG_S" \n\t"
887 "cmpl %1, %%esi \n\t" 887 "cmp %1, %%"REG_S" \n\t"
888 " jb 1b \n\t" 888 " jb 1b \n\t"
889 :: "g" (buf), "r" (buf + 128) 889 :: "g" (buf), "r" (buf + 128)
890 : "%esi" 890 : "%"REG_S
891 ); 891 );
892 892
893 /* 3. iteration */ 893 /* 3. iteration */
894 /* 894 /*
895 Note sseW2+0={1,1,sqrt(2),sqrt(2)) 895 Note sseW2+0={1,1,sqrt(2),sqrt(2))
900 asm volatile( 900 asm volatile(
901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" 901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" 902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
903 "xorps %%xmm5, %%xmm5 \n\t" 903 "xorps %%xmm5, %%xmm5 \n\t"
904 "xorps %%xmm2, %%xmm2 \n\t" 904 "xorps %%xmm2, %%xmm2 \n\t"
905 "movl %0, %%esi \n\t" 905 "mov %0, %%"REG_S" \n\t"
906 ".balign 16 \n\t" 906 ".balign 16 \n\t"
907 "1: \n\t" 907 "1: \n\t"
908 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 908 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
909 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 909 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
912 "mulps %%xmm2, %%xmm4 \n\t" 912 "mulps %%xmm2, %%xmm4 \n\t"
913 "mulps %%xmm3, %%xmm5 \n\t" 913 "mulps %%xmm3, %%xmm5 \n\t"
914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
916 "mulps %%xmm6, %%xmm3 \n\t" 916 "mulps %%xmm6, %%xmm3 \n\t"
917 "mulps %%xmm7, %%xmm2 \n\t" 917 "mulps %%xmm7, %%xmm2 \n\t"
918 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 918 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
919 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 919 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
920 "addps %%xmm4, %%xmm2 \n\t" 920 "addps %%xmm4, %%xmm2 \n\t"
921 "addps %%xmm5, %%xmm3 \n\t" 921 "addps %%xmm5, %%xmm3 \n\t"
922 "movaps %%xmm2, %%xmm4 \n\t" 922 "movaps %%xmm2, %%xmm4 \n\t"
923 "movaps %%xmm3, %%xmm5 \n\t" 923 "movaps %%xmm3, %%xmm5 \n\t"
924 "addps %%xmm0, %%xmm2 \n\t" 924 "addps %%xmm0, %%xmm2 \n\t"
925 "addps %%xmm1, %%xmm3 \n\t" 925 "addps %%xmm1, %%xmm3 \n\t"
926 "subps %%xmm4, %%xmm0 \n\t" 926 "subps %%xmm4, %%xmm0 \n\t"
927 "subps %%xmm5, %%xmm1 \n\t" 927 "subps %%xmm5, %%xmm1 \n\t"
928 "movaps %%xmm2, (%%esi) \n\t" 928 "movaps %%xmm2, (%%"REG_S") \n\t"
929 "movaps %%xmm3, 16(%%esi) \n\t" 929 "movaps %%xmm3, 16(%%"REG_S") \n\t"
930 "movaps %%xmm0, 32(%%esi) \n\t" 930 "movaps %%xmm0, 32(%%"REG_S") \n\t"
931 "movaps %%xmm1, 48(%%esi) \n\t" 931 "movaps %%xmm1, 48(%%"REG_S") \n\t"
932 "addl $64, %%esi \n\t" 932 "add $64, %%"REG_S" \n\t"
933 "cmpl %1, %%esi \n\t" 933 "cmp %1, %%"REG_S" \n\t"
934 " jb 1b \n\t" 934 " jb 1b \n\t"
935 :: "g" (buf), "r" (buf + 128) 935 :: "g" (buf), "r" (buf + 128)
936 : "%esi" 936 : "%"REG_S
937 ); 937 );
938 938
939 /* 4-7. iterations */ 939 /* 4-7. iterations */
940 for (m=3; m < 7; m++) { 940 for (m=3; m < 7; m++) {
941 two_m = (1 << m); 941 two_m = (1 << m);
942 two_m_plus_one = two_m<<1; 942 two_m_plus_one = two_m<<1;
943 two_m_plus_one_shl3 = (two_m_plus_one<<3); 943 two_m_plus_one_shl3 = (two_m_plus_one<<3);
944 buf_offset = buf+128; 944 buf_offset = buf+128;
945 asm volatile( 945 asm volatile(
946 "movl %0, %%esi \n\t" 946 "mov %0, %%"REG_S" \n\t"
947 ".balign 16 \n\t" 947 ".balign 16 \n\t"
948 "1: \n\t" 948 "1: \n\t"
949 "xorl %%edi, %%edi \n\t" // k 949 "xor %%"REG_D", %%"REG_D" \n\t" // k
950 "leal (%%esi, %3), %%edx \n\t" 950 "lea (%%"REG_S", %3), %%"REG_d" \n\t"
951 "2: \n\t" 951 "2: \n\t"
952 "movaps (%%edx, %%edi), %%xmm1 \n\t" 952 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
953 "movaps (%4, %%edi, 2), %%xmm2 \n\t" 953 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
954 "mulps %%xmm1, %%xmm2 \n\t" 954 "mulps %%xmm1, %%xmm2 \n\t"
955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t" 955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
956 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" 956 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
957 "movaps (%%esi, %%edi), %%xmm0 \n\t" 957 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
958 "addps %%xmm2, %%xmm1 \n\t" 958 "addps %%xmm2, %%xmm1 \n\t"
959 "movaps %%xmm1, %%xmm2 \n\t" 959 "movaps %%xmm1, %%xmm2 \n\t"
960 "addps %%xmm0, %%xmm1 \n\t" 960 "addps %%xmm0, %%xmm1 \n\t"
961 "subps %%xmm2, %%xmm0 \n\t" 961 "subps %%xmm2, %%xmm0 \n\t"
962 "movaps %%xmm1, (%%esi, %%edi) \n\t" 962 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
963 "movaps %%xmm0, (%%edx, %%edi) \n\t" 963 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
964 "addl $16, %%edi \n\t" 964 "add $16, %%"REG_D" \n\t"
965 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 965 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
966 " jb 2b \n\t" 966 "jb 2b \n\t"
967 "addl %2, %%esi \n\t" 967 "add %2, %%"REG_S" \n\t"
968 "cmpl %1, %%esi \n\t" 968 "cmp %1, %%"REG_S" \n\t"
969 " jb 1b \n\t" 969 " jb 1b \n\t"
970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3), 970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
971 "r" (sseW[m]) 971 "r" (sseW[m])
972 : "%esi", "%edi", "%edx" 972 : "%"REG_S, "%"REG_D, "%"REG_d
973 ); 973 );
974 } 974 }
975 975
976 /* Post IFFT complex multiply plus IFFT complex conjugate*/ 976 /* Post IFFT complex multiply plus IFFT complex conjugate*/
977 asm volatile( 977 asm volatile(
978 "movl $-1024, %%esi \n\t" 978 "mov $-1024, %%"REG_S" \n\t"
979 ".balign 16 \n\t" 979 ".balign 16 \n\t"
980 "1: \n\t" 980 "1: \n\t"
981 "movaps (%0, %%esi), %%xmm0 \n\t" 981 "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
982 "movaps (%0, %%esi), %%xmm1 \n\t" 982 "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" 983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" 984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" 985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
986 "addps %%xmm1, %%xmm0 \n\t" 986 "addps %%xmm1, %%xmm0 \n\t"
987 "movaps %%xmm0, (%0, %%esi) \n\t" 987 "movaps %%xmm0, (%0, %%"REG_S") \n\t"
988 "addl $16, %%esi \n\t" 988 "add $16, %%"REG_S" \n\t"
989 " jnz 1b \n\t" 989 " jnz 1b \n\t"
990 :: "r" (buf+128) 990 :: "r" (buf+128)
991 : "%esi" 991 : "%"REG_S
992 ); 992 );
993 993
994 994
995 data_ptr = data; 995 data_ptr = data;
996 delay_ptr = delay; 996 delay_ptr = delay;
997 window_ptr = imdct_window; 997 window_ptr = imdct_window;
998 998
999 /* Window and convert to real valued signal */ 999 /* Window and convert to real valued signal */
1000 asm volatile( 1000 asm volatile(
1001 "xorl %%edi, %%edi \n\t" // 0 1001 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1002 "xorl %%esi, %%esi \n\t" // 0 1002 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1003 "movss %3, %%xmm2 \n\t" // bias 1003 "movss %3, %%xmm2 \n\t" // bias
1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... 1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1005 ".balign 16 \n\t" 1005 ".balign 16 \n\t"
1006 "1: \n\t" 1006 "1: \n\t"
1007 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? 1007 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1008 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? 1008 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1009 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? 1009 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1010 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? 1010 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A 1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1012 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" 1012 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1013 "addps (%2, %%esi), %%xmm0 \n\t" 1013 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1014 "addps %%xmm2, %%xmm0 \n\t" 1014 "addps %%xmm2, %%xmm0 \n\t"
1015 "movaps %%xmm0, (%1, %%esi) \n\t" 1015 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1016 "addl $16, %%esi \n\t" 1016 "add $16, %%"REG_S" \n\t"
1017 "subl $16, %%edi \n\t" 1017 "sub $16, %%"REG_D" \n\t"
1018 "cmpl $512, %%esi \n\t" 1018 "cmp $512, %%"REG_S" \n\t"
1019 " jb 1b \n\t" 1019 " jb 1b \n\t"
1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) 1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1021 : "%esi", "%edi" 1021 : "%"REG_S, "%"REG_D
1022 ); 1022 );
1023 data_ptr+=128; 1023 data_ptr+=128;
1024 delay_ptr+=128; 1024 delay_ptr+=128;
1025 // window_ptr+=128; 1025 // window_ptr+=128;
1026 1026
1027 asm volatile( 1027 asm volatile(
1028 "movl $1024, %%edi \n\t" // 512 1028 "mov $1024, %%"REG_D" \n\t" // 512
1029 "xorl %%esi, %%esi \n\t" // 0 1029 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1030 "movss %3, %%xmm2 \n\t" // bias 1030 "movss %3, %%xmm2 \n\t" // bias
1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... 1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1032 ".balign 16 \n\t" 1032 ".balign 16 \n\t"
1033 "1: \n\t" 1033 "1: \n\t"
1034 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A 1034 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1035 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C 1035 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1036 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C 1036 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1037 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A 1037 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A 1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1039 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" 1039 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1040 "addps (%2, %%esi), %%xmm0 \n\t" 1040 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1041 "addps %%xmm2, %%xmm0 \n\t" 1041 "addps %%xmm2, %%xmm0 \n\t"
1042 "movaps %%xmm0, (%1, %%esi) \n\t" 1042 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1043 "addl $16, %%esi \n\t" 1043 "add $16, %%"REG_S" \n\t"
1044 "subl $16, %%edi \n\t" 1044 "sub $16, %%"REG_D" \n\t"
1045 "cmpl $512, %%esi \n\t" 1045 "cmp $512, %%"REG_S" \n\t"
1046 " jb 1b \n\t" 1046 " jb 1b \n\t"
1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) 1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1048 : "%esi", "%edi" 1048 : "%"REG_S, "%"REG_D
1049 ); 1049 );
1050 data_ptr+=128; 1050 data_ptr+=128;
1051 // window_ptr+=128; 1051 // window_ptr+=128;
1052 1052
1053 /* The trailing edge of the window goes into the delay line */ 1053 /* The trailing edge of the window goes into the delay line */
1054 delay_ptr = delay; 1054 delay_ptr = delay;
1055 1055
1056 asm volatile( 1056 asm volatile(
1057 "xorl %%edi, %%edi \n\t" // 0 1057 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1058 "xorl %%esi, %%esi \n\t" // 0 1058 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1059 ".balign 16 \n\t" 1059 ".balign 16 \n\t"
1060 "1: \n\t" 1060 "1: \n\t"
1061 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A 1061 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1062 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C 1062 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1063 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C 1063 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1064 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A 1064 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A 1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1066 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" 1066 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1067 "movaps %%xmm0, (%1, %%esi) \n\t" 1067 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1068 "addl $16, %%esi \n\t" 1068 "add $16, %%"REG_S" \n\t"
1069 "subl $16, %%edi \n\t" 1069 "sub $16, %%"REG_D" \n\t"
1070 "cmpl $512, %%esi \n\t" 1070 "cmp $512, %%"REG_S" \n\t"
1071 " jb 1b \n\t" 1071 " jb 1b \n\t"
1072 :: "r" (buf+64), "r" (delay_ptr) 1072 :: "r" (buf+64), "r" (delay_ptr)
1073 : "%esi", "%edi" 1073 : "%"REG_S, "%"REG_D
1074 ); 1074 );
1075 delay_ptr+=128; 1075 delay_ptr+=128;
1076 // window_ptr-=128; 1076 // window_ptr-=128;
1077 1077
1078 asm volatile( 1078 asm volatile(
1079 "movl $1024, %%edi \n\t" // 1024 1079 "mov $1024, %%"REG_D" \n\t" // 1024
1080 "xorl %%esi, %%esi \n\t" // 0 1080 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1081 ".balign 16 \n\t" 1081 ".balign 16 \n\t"
1082 "1: \n\t" 1082 "1: \n\t"
1083 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? 1083 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1084 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? 1084 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1085 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? 1085 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1086 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? 1086 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A 1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1088 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" 1088 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1089 "movaps %%xmm0, (%1, %%esi) \n\t" 1089 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1090 "addl $16, %%esi \n\t" 1090 "add $16, %%"REG_S" \n\t"
1091 "subl $16, %%edi \n\t" 1091 "sub $16, %%"REG_D" \n\t"
1092 "cmpl $512, %%esi \n\t" 1092 "cmp $512, %%"REG_S" \n\t"
1093 " jb 1b \n\t" 1093 " jb 1b \n\t"
1094 :: "r" (buf), "r" (delay_ptr) 1094 :: "r" (buf), "r" (delay_ptr)
1095 : "%esi", "%edi" 1095 : "%"REG_S, "%"REG_D
1096 ); 1096 );
1097 } 1097 }
1098 #endif //arch_x86 1098 #endif // ARCH_X86 || ARCH_X86_64
1099 1099
1100 void 1100 void
1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) 1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1102 { 1102 {
1103 int i,k; 1103 int i,k;
1240 /* Twiddle factors to turn IFFT into IMDCT */ 1240 /* Twiddle factors to turn IFFT into IMDCT */
1241 for (i = 0; i < 128; i++) { 1241 for (i = 0; i < 128; i++) {
1242 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); 1242 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1243 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); 1243 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
1244 } 1244 }
1245 #ifdef ARCH_X86 1245 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1246 for (i = 0; i < 128; i++) { 1246 for (i = 0; i < 128; i++) {
1247 sseSinCos1c[2*i+0]= xcos1[i]; 1247 sseSinCos1c[2*i+0]= xcos1[i];
1248 sseSinCos1c[2*i+1]= -xcos1[i]; 1248 sseSinCos1c[2*i+1]= -xcos1[i];
1249 sseSinCos1d[2*i+0]= xsin1[i]; 1249 sseSinCos1d[2*i+0]= xsin1[i];
1250 sseSinCos1d[2*i+1]= xsin1[i]; 1250 sseSinCos1d[2*i+1]= xsin1[i];
1262 for (k = 0; k < j; k++) { 1262 for (k = 0; k < j; k++) {
1263 w[i][k].real = cos (-M_PI * k / j); 1263 w[i][k].real = cos (-M_PI * k / j);
1264 w[i][k].imag = sin (-M_PI * k / j); 1264 w[i][k].imag = sin (-M_PI * k / j);
1265 } 1265 }
1266 } 1266 }
1267 #ifdef ARCH_X86 1267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1268 for (i = 1; i < 7; i++) { 1268 for (i = 1; i < 7; i++) {
1269 j = 1 << i; 1269 j = 1 << i;
1270 for (k = 0; k < j; k+=2) { 1270 for (k = 0; k < j; k+=2) {
1271 1271
1272 sseW[i][4*k + 0] = w[i][k+0].real; 1272 sseW[i][4*k + 0] = w[i][k+0].real;
1305 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; 1305 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
1306 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; 1306 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
1307 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; 1307 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
1308 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; 1308 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
1309 } 1309 }
1310 #endif // arch_x86 1310 #endif // ARCH_X86 || ARCH_X86_64
1311 1311
1312 imdct_512 = imdct_do_512; 1312 imdct_512 = imdct_do_512;
1313 #ifdef ARCH_X86 1313 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1314 if(mm_accel & MM_ACCEL_X86_SSE) 1314 if(mm_accel & MM_ACCEL_X86_SSE)
1315 { 1315 {
1316 fprintf (stderr, "Using SSE optimized IMDCT transform\n"); 1316 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1317 imdct_512 = imdct_do_512_sse; 1317 imdct_512 = imdct_do_512_sse;
1318 } 1318 }
1327 { 1327 {
1328 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); 1328 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1329 imdct_512 = imdct_do_512_3dnow; 1329 imdct_512 = imdct_do_512_3dnow;
1330 } 1330 }
1331 else 1331 else
1332 #endif // arch_x86 1332 #endif // ARCH_X86 || ARCH_X86_64
1333 #ifdef HAVE_ALTIVEC 1333 #ifdef HAVE_ALTIVEC
1334 if (mm_accel & MM_ACCEL_PPC_ALTIVEC) 1334 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1335 { 1335 {
1336 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); 1336 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1337 imdct_512 = imdct_do_512_altivec; 1337 imdct_512 = imdct_do_512_altivec;