885
|
1 /*
|
|
2 * srfftp.h
|
|
3 *
|
|
4 * Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
|
|
5 *
|
|
6 * 64 and 128 point split radix fft for ac3dec
|
|
7 *
|
|
8 * The algorithm is described in the book:
|
|
9 * "Computational Frameworks of the Fast Fourier Transform".
|
|
10 *
|
|
11 * The ideas and the organization of code borrowed from djbfft written by
|
|
12 * D. J. Bernstein <djb@cr.yp.to>. djbfft can be found at
|
|
13 * http://cr.yp.to/djbfft.html.
|
|
14 *
|
|
15 * srfftp.h is free software; you can redistribute it and/or modify
|
|
16 * it under the terms of the GNU General Public License as published by
|
|
17 * the Free Software Foundation; either version 2, or (at your option)
|
|
18 * any later version.
|
|
19 *
|
|
20 * srfftp.h is distributed in the hope that it will be useful,
|
|
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
23 * GNU General Public License for more details.
|
|
24 *
|
|
25 * You should have received a copy of the GNU General Public License
|
|
26 * along with GNU Make; see the file COPYING. If not, write to
|
|
27 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
28 *
|
|
29 * Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations
|
|
30 * by Nick Kurshev <nickols_k@mail.ru>
|
|
31 */
|
|
32
|
|
33 #ifndef SRFFTP_3DNOW_H__
|
|
34 #define SRFFTP_3DNOW_H__
|
|
35
|
890
|
36 static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
|
886
|
37
|
885
|
/*
 * TRANS_FILL_MM6_MM7_3DNOW() emits one asm statement that loads the sign
 * multipliers used by the TRANS*_3DNOW macros (written low half | high half):
 *
 *     mm7 = -1.0 |  1.0
 *     mm6 =  1.0 | -1.0
 *
 * eax is used as scratch and is listed as clobbered.  The expansion already
 * ends with ';', so it is invoked without a trailing semicolon.
 *
 * The first five instructions are the same for both CPU flavours and live in
 * TRANS_FILL_COMMON_3DNOW; only the final conversion/swap step depends on
 * HAVE_3DNOWEX (pswapd is available only there).
 */
#define TRANS_FILL_COMMON_3DNOW \
	"movl $-1, %%eax\n\t"\
	"movd %%eax, %%mm7\n\t"\
	"negl %%eax\n\t"\
	"movd %%eax, %%mm6\n\t"\
	"punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */

#ifdef HAVE_3DNOWEX
/* Convert mm7 to float, then derive mm6 as its half-swapped copy. */
#define TRANS_FILL_FINISH_3DNOW \
	"pi2fd %%mm7, %%mm7\n\t"\
	"pswapd %%mm7, %%mm6\n\t"/* 1.0 | -1.0 */
#else
/* No pswapd: build the integer pair for mm6 too, then convert both. */
#define TRANS_FILL_FINISH_3DNOW \
	"punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\
	"pi2fd %%mm7, %%mm7\n\t"\
	"pi2fd %%mm6, %%mm6\n\t"
#endif

#define TRANS_FILL_MM6_MM7_3DNOW()\
    __asm__ __volatile__(\
	TRANS_FILL_COMMON_3DNOW\
	TRANS_FILL_FINISH_3DNOW\
	:::"eax","memory");
|
|
62
|
|
/*
 * PSWAP_MM(mm_base, mm_hlp) expands to assembler text (a string literal) that
 * exchanges the low and high 32-bit halves of the MMX register mm_base.
 *
 * Both arguments must be string literals naming a register, e.g. "%%mm4".
 * The fragments are joined by ordinary adjacent-string-literal concatenation.
 * The '##' operator must not be used for this: pasting two string literals
 * does not produce a valid preprocessing token and current compilers reject
 * it.
 *
 * With HAVE_3DNOWEX a single pswapd is emitted and mm_hlp is not touched.
 * Otherwise mm_hlp is overwritten as scratch:
 *     mm_hlp  = mm_base
 *     mm_base = mm_base >> 32          (old high half now in the low half)
 *     mm_base = low(mm_base) | low(mm_hlp)
 */
#ifdef HAVE_3DNOWEX
#define PSWAP_MM(mm_base,mm_hlp) "pswapd " mm_base "," mm_base "\n\t"
#else
#define PSWAP_MM(mm_base,mm_hlp)\
	"movq " mm_base "," mm_hlp "\n\t"\
	"psrlq $32, " mm_base "\n\t"\
	"punpckldq " mm_hlp "," mm_base "\n\t"
#endif
|
|
71
|
|
/*
 * TRANSZERO_3DNOW(A0, A4, A8, A12)
 *
 * Packed (two floats per MMX register) butterfly step.  Besides its four
 * arguments, which must be lvalues addressable as 8-byte memory operands,
 * the macro reads wTB[0] and wTB[k*2]; 'wTB' and 'k' must be visible at
 * the point of expansion.
 *
 * With .re in the low half and .im in the high half it computes
 *
 *     u    = wTB[0] + wTB[k*2]
 *     v.re = wTB[k*2].im - wTB[0].im
 *     v.im = wTB[0].re   - wTB[k*2].re
 *
 *     A8  = A0 - u        A0 = A0 + u
 *     A12 = A4 + v        A4 = A4 - v
 *
 * A0 and A4 are read before any result is stored.  They are passed as plain
 * "m" inputs: a digit matching constraint cannot be tied to a memory-only
 * "=m" output.  Operand numbers are unchanged (%6 = A0, %7 = A4).
 *
 * femms is issued before and after the MMX/3DNow! section.  mm0-mm7 are
 * overwritten.
 */
#define TRANSZERO_3DNOW(A0,A4,A8,A12) \
{ \
    __asm__ __volatile__("femms":::"memory");\
    TRANS_FILL_MM6_MM7_3DNOW()\
    __asm__ __volatile__(\
	"movq %4, %%mm0\n\t" /* mm0 = wTB[0] */\
	"movq %5, %%mm1\n\t" /* mm1 = wTB[k*2] */\
	"movq %%mm0, %%mm5\n\t"/* u.re = wTB[0].re + wTB[k*2].re; */\
	"pfadd %%mm1, %%mm5\n\t"/* u.im = wTB[0].im + wTB[k*2].im; mm5 = u */\
	"pfmul %%mm6, %%mm0\n\t"/* mm0 = wTB[0].re | -wTB[0].im */\
	"pfmul %%mm7, %%mm1\n\t"/* mm1 = -wTB[k*2].re | wTB[k*2].im */\
	"pfadd %%mm1, %%mm0\n\t"/* v.im = wTB[0].re - wTB[k*2].re; */\
	"movq %%mm0, %%mm4\n\t"/* v.re = -wTB[0].im + wTB[k*2].im; */\
	PSWAP_MM("%%mm4","%%mm2")/* mm4 = v (mm2 is scratch here) */\
	"movq %6, %%mm0\n\t" /* mm0 = A0 */\
	"movq %7, %%mm2\n\t" /* mm2 = A4 */\
	"movq %%mm0, %%mm1\n\t" /* mm1 = A0 */\
	"movq %%mm2, %%mm3\n\t" /* mm3 = A4 */\
	"pfadd %%mm5, %%mm0\n\t" /* A0  = A0 + u; */\
	"pfadd %%mm4, %%mm2\n\t" /* A12 = A4 + v; */\
	"pfsub %%mm5, %%mm1\n\t" /* A8  = A0 - u; */\
	"pfsub %%mm4, %%mm3\n\t" /* A4  = A4 - v; */\
	"movq %%mm0, %0\n\t"\
	"movq %%mm2, %3\n\t"\
	"movq %%mm1, %1\n\t"\
	"movq %%mm3, %2"\
	:"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\
	:"m"(wTB[0]), "m"(wTB[k*2]), "m"(A0), "m"(A4)\
	:"memory");\
    __asm__ __volatile__("femms":::"memory");\
}
|
|
103
|
886
|
/*
 * TRANSHALF_16_3DNOW(A2, A6, A10, A14)
 *
 * Packed (two floats per MMX register) butterfly step.  Besides its four
 * arguments, which must be lvalues addressable as 8-byte memory operands,
 * the macro reads wTB[2], wTB[6] and HSQRT2_3DNOW; 'wTB' must be visible at
 * the point of expansion.
 *
 * With .re in the low half and .im in the high half it computes
 *
 *     u0   = ( wTB[2].re + wTB[2].im,  wTB[2].im - wTB[2].re )
 *     a    = ( wTB[6].im - wTB[6].re,  wTB[6].im + wTB[6].re )
 *     v    = ( u0.im + a.im,  u0.re - a.re ) * HSQRT2_3DNOW
 *     u    = ( u0.re + a.re,  u0.im - a.im ) * HSQRT2_3DNOW
 *
 *     A10 = A2 - u                    A2 = A2 + u
 *     A14 = ( A6.re - v.re,  A6.im + v.im )
 *     A6  = ( A6.re + v.re,  A6.im - v.im )
 *
 * A2 is read before A2/A10 are stored and A6 before A6/A14 are stored.  They
 * are passed as plain "m" inputs: a digit matching constraint cannot be tied
 * to a memory-only "=m" output.  Operand numbers are unchanged
 * (%6 = A2, %7 = A6, %8 = HSQRT2_3DNOW).
 *
 * femms is issued before and after the MMX/3DNow! section.  mm0-mm4, mm6 and
 * mm7 are overwritten.
 */
#define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\
{\
    __asm__ __volatile__("femms":::"memory");\
    TRANS_FILL_MM6_MM7_3DNOW()\
    __asm__ __volatile__(\
	"movq %4, %%mm0\n\t"/* u.re = wTB[2].im + wTB[2].re; */\
	"movq %%mm0, %%mm1\n\t"\
	"pfmul %%mm7, %%mm1\n\t"\
	"pfacc %%mm1, %%mm0\n\t"/* u.im = wTB[2].im - wTB[2].re; mm0 = u */\
	"movq %5, %%mm1\n\t" /* a.re = wTB[6].im - wTB[6].re; */\
	"movq %%mm1, %%mm2\n\t"\
	"pfmul %%mm7, %%mm1\n\t"\
	"pfacc %%mm2, %%mm1\n\t"/* a.im = wTB[6].im + wTB[6].re; mm1 = a */\
	"movq %%mm1, %%mm2\n\t"\
	"pfmul %%mm7, %%mm2\n\t"/* v.im = u.re - a.re; */\
	"movq %%mm0, %%mm3\n\t"/* v.re = u.im + a.im; */\
	"pfadd %%mm2, %%mm3\n\t"\
	PSWAP_MM("%%mm3","%%mm2")/* mm3 = v (mm2 is scratch here) */\
	"pfmul %%mm6, %%mm1\n\t"/* u.re = u.re + a.re; */\
	"pfadd %%mm1, %%mm0\n\t"/* u.im = u.im - a.im; mm0 = u */\
	"movq %8, %%mm2\n\t" /* mm2 = HSQRT2_3DNOW */\
	"pfmul %%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\
	"pfmul %%mm2, %%mm0\n\t" /* u *= HSQRT2_3DNOW; */\
	"movq %6, %%mm1\n\t" /* a1 = A2; */\
	"movq %%mm1, %%mm2\n\t"\
	"pfadd %%mm0, %%mm1\n\t" /* A2  = a1 + u; */\
	"pfsub %%mm0, %%mm2\n\t" /* A10 = a1 - u; */\
	"movq %%mm1, %0\n\t"\
	"movq %%mm2, %1\n\t"\
	"movq %7, %%mm1\n\t" /* a1 = A6; */\
	"movq %%mm1, %%mm2\n\t"\
	"movq %%mm3, %%mm4\n\t"\
	"pfmul %%mm6, %%mm4\n\t"/* A6.re = a1.re + v.re; */\
	"pfadd %%mm4, %%mm1\n\t"/* A6.im = a1.im - v.im; */\
	"pfmul %%mm7, %%mm3\n\t"/* A14.re = a1.re - v.re; */\
	"pfadd %%mm3, %%mm2\n\t"/* A14.im = a1.im + v.im; */\
	"movq %%mm1, %2\n\t"\
	"movq %%mm2, %3"\
	:"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\
	:"m"(wTB[2]), "m"(wTB[6]), "m"(A2), "m"(A6), "m"(HSQRT2_3DNOW)\
	:"memory");\
    __asm__ __volatile__("femms":::"memory");\
}
|
|
147
|
885
|
148 #endif
|