Mercurial > mplayer.hg
annotate liba52/imdct.c @ 3899:0d1457cdde44
mp3lameless build fixed
author | arpi |
---|---|
date | Sun, 30 Dec 2001 16:52:58 +0000 |
parents | 0410677eda4a |
children | 0cc94b1eec0f |
rev | line source |
---|---|
3394 | 1 /* |
2 * imdct.c | |
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org> | |
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
5 * | |
6 * This file is part of a52dec, a free ATSC A-52 stream decoder. | |
7 * See http://liba52.sourceforge.net/ for updates. | |
8 * | |
9 * a52dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * a52dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
3579 | 22 * |
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | |
3884 | 24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru> |
25 * michael did port them from libac3 (untested, perhaps totally broken) | |
3394 | 26 */ |
27 | |
28 #include "config.h" | |
3579 | 29 #include "../cpudetect.h" |
3394 | 30 |
31 #include <math.h> | |
32 #include <stdio.h> | |
33 #ifndef M_PI | |
34 #define M_PI 3.1415926535897932384626433832795029 | |
35 #endif | |
36 #include <inttypes.h> | |
37 | |
38 #include "a52.h" | |
39 #include "a52_internal.h" | |
40 #include "mm_accel.h" | |
41 | |
3884 | 42 #ifdef RUNTIME_CPUDETECT |
43 #undef HAVE_3DNOWEX | |
44 #endif | |
45 | |
46 #define USE_AC3_C | |
47 | |
3394 | 48 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias); |
49 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias); | |
50 | |
51 typedef struct complex_s { | |
52 sample_t real; | |
53 sample_t imag; | |
54 } complex_t; | |
55 | |
3884 | 56 static void fft_128p(complex_t *a); |
57 static void fft_128p_3dnow(complex_t *a); | |
58 | |
59 static const int pm128[128] __attribute__((aligned(16))) = | |
60 { | |
61 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, | |
62 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, | |
63 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, | |
64 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, | |
65 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, | |
66 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, | |
67 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, | |
68 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 | |
69 }; | |
3394 | 70 |
71 /* 128 point bit-reverse LUT */ | |
72 static uint8_t bit_reverse_512[] = { | |
73 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, | |
74 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, | |
75 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, | |
76 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c, | |
77 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72, | |
78 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a, | |
79 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76, | |
80 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e, | |
81 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71, | |
82 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79, | |
83 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75, | |
84 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d, | |
85 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73, | |
86 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b, | |
87 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77, | |
88 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f}; | |
89 | |
90 static uint8_t bit_reverse_256[] = { | |
91 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38, | |
92 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c, | |
93 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a, | |
94 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e, | |
95 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, | |
96 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d, | |
97 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, | |
98 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | |
99 | |
3579 | 100 #ifdef ARCH_X86 |
3508 | 101 // NOTE: SSE needs 16byte alignment or it will segfault |
3581 | 102 // |
3508 | 103 static complex_t __attribute__((aligned(16))) buf[128]; |
3581 | 104 static float __attribute__((aligned(16))) sseSinCos1c[256]; |
105 static float __attribute__((aligned(16))) sseSinCos1d[256]; | |
3512 | 106 static float __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; |
3534 | 107 //static float __attribute__((aligned(16))) sseW0[4]; |
108 static float __attribute__((aligned(16))) sseW1[8]; | |
109 static float __attribute__((aligned(16))) sseW2[16]; | |
110 static float __attribute__((aligned(16))) sseW3[32]; | |
111 static float __attribute__((aligned(16))) sseW4[64]; | |
112 static float __attribute__((aligned(16))) sseW5[128]; | |
113 static float __attribute__((aligned(16))) sseW6[256]; | |
114 static float __attribute__((aligned(16))) *sseW[7]= | |
115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; | |
3553 | 116 static float __attribute__((aligned(16))) sseWindow[512]; |
3508 | 117 #else |
3394 | 118 static complex_t buf[128]; |
3508 | 119 #endif |
3394 | 120 |
121 /* Twiddle factor LUT */ | |
122 static complex_t w_1[1]; | |
123 static complex_t w_2[2]; | |
124 static complex_t w_4[4]; | |
125 static complex_t w_8[8]; | |
126 static complex_t w_16[16]; | |
127 static complex_t w_32[32]; | |
128 static complex_t w_64[64]; | |
129 static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; | |
130 | |
131 /* Twiddle factors for IMDCT */ | |
132 static sample_t xcos1[128]; | |
133 static sample_t xsin1[128]; | |
134 static sample_t xcos2[64]; | |
135 static sample_t xsin2[64]; | |
136 | |
137 /* Windowing function for Modified DCT - Thank you acroread */ | |
138 sample_t imdct_window[] = { | |
139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130, | |
140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443, | |
141 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061, | |
142 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121, | |
143 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770, | |
144 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153, | |
145 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389, | |
146 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563, | |
147 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699, | |
148 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757, | |
149 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626, | |
150 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126, | |
151 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019, | |
152 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031, | |
153 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873, | |
154 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269, | |
155 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981, | |
156 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831, | |
157 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716, | |
158 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610, | |
159 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560, | |
160 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674, | |
161 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099, | |
162 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994, | |
163 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513, | |
164 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788, | |
165 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919, | |
166 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974, | |
167 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993, | |
168 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999, | |
169 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, | |
170 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 }; | |
171 | |
172 | |
173 static inline void swap_cmplx(complex_t *a, complex_t *b) | |
174 { | |
175 complex_t tmp; | |
176 | |
177 tmp = *a; | |
178 *a = *b; | |
179 *b = tmp; | |
180 } | |
181 | |
182 | |
183 | |
184 static inline complex_t cmplx_mult(complex_t a, complex_t b) | |
185 { | |
186 complex_t ret; | |
187 | |
188 ret.real = a.real * b.real - a.imag * b.imag; | |
189 ret.imag = a.real * b.imag + a.imag * b.real; | |
190 | |
191 return ret; | |
192 } | |
193 | |
194 void | |
195 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias) | |
196 { | |
197 int i,k; | |
198 int p,q; | |
199 int m; | |
200 int two_m; | |
201 int two_m_plus_one; | |
202 | |
203 sample_t tmp_a_i; | |
204 sample_t tmp_a_r; | |
205 sample_t tmp_b_i; | |
206 sample_t tmp_b_r; | |
207 | |
208 sample_t *data_ptr; | |
209 sample_t *delay_ptr; | |
210 sample_t *window_ptr; | |
211 | |
212 /* 512 IMDCT with source and dest data in 'data' */ | |
213 | |
3884 | 214 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ |
3394 | 215 for( i=0; i < 128; i++) { |
216 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
3884 | 217 #ifdef USE_AC3_C |
218 int j= pm128[i]; | |
219 #else | |
220 int j= bit_reverse_512[i]; | |
221 #endif | |
222 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
223 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
3394 | 224 } |
225 | |
226 /* FFT Merge */ | |
3549 | 227 /* unoptimized variant |
228 for (m=1; m < 7; m++) { | |
229 if(m) | |
230 two_m = (1 << m); | |
231 else | |
232 two_m = 1; | |
233 | |
234 two_m_plus_one = (1 << (m+1)); | |
235 | |
236 for(i = 0; i < 128; i += two_m_plus_one) { | |
237 for(k = 0; k < two_m; k++) { | |
238 p = k + i; | |
239 q = p + two_m; | |
3508 | 240 tmp_a_r = buf[p].real; |
241 tmp_a_i = buf[p].imag; | |
3549 | 242 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; |
243 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
3508 | 244 buf[p].real = tmp_a_r + tmp_b_r; |
245 buf[p].imag = tmp_a_i + tmp_b_i; | |
246 buf[q].real = tmp_a_r - tmp_b_r; | |
247 buf[q].imag = tmp_a_i - tmp_b_i; | |
3549 | 248 } |
249 } | |
250 } | |
251 */ | |
3884 | 252 #ifdef USE_AC3_C |
253 fft_128p (&buf[0]); | |
254 #else | |
255 | |
3623 | 256 /* 1. iteration */ |
3579 | 257 for(i = 0; i < 128; i += 2) { |
258 tmp_a_r = buf[i].real; | |
259 tmp_a_i = buf[i].imag; | |
260 tmp_b_r = buf[i+1].real; | |
261 tmp_b_i = buf[i+1].imag; | |
262 buf[i].real = tmp_a_r + tmp_b_r; | |
263 buf[i].imag = tmp_a_i + tmp_b_i; | |
264 buf[i+1].real = tmp_a_r - tmp_b_r; | |
265 buf[i+1].imag = tmp_a_i - tmp_b_i; | |
266 } | |
267 | |
3623 | 268 /* 2. iteration */ |
3579 | 269 // Note w[1]={{1,0}, {0,-1}} |
270 for(i = 0; i < 128; i += 4) { | |
271 tmp_a_r = buf[i].real; | |
272 tmp_a_i = buf[i].imag; | |
273 tmp_b_r = buf[i+2].real; | |
274 tmp_b_i = buf[i+2].imag; | |
275 buf[i].real = tmp_a_r + tmp_b_r; | |
276 buf[i].imag = tmp_a_i + tmp_b_i; | |
277 buf[i+2].real = tmp_a_r - tmp_b_r; | |
278 buf[i+2].imag = tmp_a_i - tmp_b_i; | |
279 tmp_a_r = buf[i+1].real; | |
280 tmp_a_i = buf[i+1].imag; | |
281 tmp_b_r = buf[i+3].imag; | |
282 tmp_b_i = buf[i+3].real; | |
283 buf[i+1].real = tmp_a_r + tmp_b_r; | |
284 buf[i+1].imag = tmp_a_i - tmp_b_i; | |
285 buf[i+3].real = tmp_a_r - tmp_b_r; | |
286 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
287 } | |
288 | |
3623 | 289 /* 3. iteration */ |
3579 | 290 for(i = 0; i < 128; i += 8) { |
291 tmp_a_r = buf[i].real; | |
292 tmp_a_i = buf[i].imag; | |
293 tmp_b_r = buf[i+4].real; | |
294 tmp_b_i = buf[i+4].imag; | |
295 buf[i].real = tmp_a_r + tmp_b_r; | |
296 buf[i].imag = tmp_a_i + tmp_b_i; | |
297 buf[i+4].real = tmp_a_r - tmp_b_r; | |
298 buf[i+4].imag = tmp_a_i - tmp_b_i; | |
299 tmp_a_r = buf[1+i].real; | |
300 tmp_a_i = buf[1+i].imag; | |
301 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
302 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
303 buf[1+i].real = tmp_a_r + tmp_b_r; | |
304 buf[1+i].imag = tmp_a_i + tmp_b_i; | |
305 buf[i+5].real = tmp_a_r - tmp_b_r; | |
306 buf[i+5].imag = tmp_a_i - tmp_b_i; | |
307 tmp_a_r = buf[i+2].real; | |
308 tmp_a_i = buf[i+2].imag; | |
309 tmp_b_r = buf[i+6].imag; | |
310 tmp_b_i = - buf[i+6].real; | |
311 buf[i+2].real = tmp_a_r + tmp_b_r; | |
312 buf[i+2].imag = tmp_a_i + tmp_b_i; | |
313 buf[i+6].real = tmp_a_r - tmp_b_r; | |
314 buf[i+6].imag = tmp_a_i - tmp_b_i; | |
315 tmp_a_r = buf[i+3].real; | |
316 tmp_a_i = buf[i+3].imag; | |
317 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
318 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
319 buf[i+3].real = tmp_a_r + tmp_b_r; | |
320 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
321 buf[i+7].real = tmp_a_r - tmp_b_r; | |
322 buf[i+7].imag = tmp_a_i - tmp_b_i; | |
323 } | |
324 | |
3623 | 325 /* 4-7. iterations */ |
3579 | 326 for (m=3; m < 7; m++) { |
327 two_m = (1 << m); | |
328 | |
329 two_m_plus_one = two_m<<1; | |
330 | |
331 for(i = 0; i < 128; i += two_m_plus_one) { | |
332 for(k = 0; k < two_m; k++) { | |
333 int p = k + i; | |
334 int q = p + two_m; | |
335 tmp_a_r = buf[p].real; | |
336 tmp_a_i = buf[p].imag; | |
337 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
338 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
339 buf[p].real = tmp_a_r + tmp_b_r; | |
340 buf[p].imag = tmp_a_i + tmp_b_i; | |
341 buf[q].real = tmp_a_r - tmp_b_r; | |
342 buf[q].imag = tmp_a_i - tmp_b_i; | |
343 } | |
344 } | |
345 } | |
3884 | 346 #endif |
3579 | 347 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
348 for( i=0; i < 128; i++) { | |
349 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
350 tmp_a_r = buf[i].real; | |
351 tmp_a_i = -1.0 * buf[i].imag; | |
352 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); | |
353 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); | |
354 } | |
355 | |
356 data_ptr = data; | |
357 delay_ptr = delay; | |
358 window_ptr = imdct_window; | |
359 | |
360 /* Window and convert to real valued signal */ | |
361 for(i=0; i< 64; i++) { | |
362 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
363 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
364 } | |
365 | |
366 for(i=0; i< 64; i++) { | |
367 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
368 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
369 } | |
3884 | 370 |
3579 | 371 /* The trailing edge of the window goes into the delay line */ |
372 delay_ptr = delay; | |
373 | |
374 for(i=0; i< 64; i++) { | |
375 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
376 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
377 } | |
378 | |
379 for(i=0; i<64; i++) { | |
380 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
381 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
382 } | |
383 } | |
384 | |
385 #ifdef ARCH_X86 | |
3884 | 386 #include "srfftp_3dnow.h" |
387 | |
388 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = { 0x00000000UL, 0x80000000UL }; | |
389 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = { 0x80000000UL, 0x00000000UL }; | |
390 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; | |
391 | |
392 void | |
393 imdct_do_512_3dnow(sample_t data[],sample_t delay[], sample_t bias) | |
394 { | |
395 int i,k; | |
396 int p,q; | |
397 int m; | |
398 int two_m; | |
399 int two_m_plus_one; | |
400 | |
401 sample_t tmp_a_i; | |
402 sample_t tmp_a_r; | |
403 sample_t tmp_b_i; | |
404 sample_t tmp_b_r; | |
405 | |
406 sample_t *data_ptr; | |
407 sample_t *delay_ptr; | |
408 sample_t *window_ptr; | |
409 | |
410 /* 512 IMDCT with source and dest data in 'data' */ | |
411 | |
412 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ | |
413 #if 1 | |
414 __asm__ __volatile__ ( | |
415 "movq %0, %%mm7\n\t" | |
416 ::"m"(x_plus_minus_3dnow) | |
417 :"memory"); | |
418 for( i=0; i < 128; i++) { | |
419 int j = pm128[i]; | |
420 __asm__ __volatile__ ( | |
421 "movd %1, %%mm0\n\t" | |
422 "movd %3, %%mm1\n\t" | |
423 "punpckldq %2, %%mm0\n\t" /* mm0 = data[256-2*j-1] | data[2*j]*/ | |
424 "punpckldq %4, %%mm1\n\t" /* mm1 = xcos[j] | xsin[j] */ | |
425 "movq %%mm0, %%mm2\n\t" | |
426 "pfmul %%mm1, %%mm0\n\t" | |
427 #ifdef HAVE_3DNOWEX | |
428 "pswapd %%mm1, %%mm1\n\t" | |
429 #else | |
430 "movq %%mm1, %%mm5\n\t" | |
431 "psrlq $32, %%mm1\n\t" | |
432 "punpckldq %%mm5, %%mm1\n\t" | |
433 #endif | |
434 "pfmul %%mm1, %%mm2\n\t" | |
435 #ifdef HAVE_3DNOWEX | |
436 "pfpnacc %%mm2, %%mm0\n\t" | |
437 #else | |
438 "pxor %%mm7, %%mm0\n\t" | |
439 "pfacc %%mm2, %%mm0\n\t" | |
440 #endif | |
441 "pxor %%mm7, %%mm0\n\t" | |
442 "movq %%mm0, %0" | |
443 :"=m"(buf[i]) | |
444 :"m"(data[256-2*j-1]), "m"(data[2*j]), "m"(xcos1[j]), "m"(xsin1[j]) | |
445 :"memory" | |
446 ); | |
447 /* buf[i].re = (data[256-2*j-1] * xcos1[j] - data[2*j] * xsin1[j]); | |
448 buf[i].im = (data[256-2*j-1] * xsin1[j] + data[2*j] * xcos1[j])*(-1.0);*/ | |
449 } | |
450 #else | |
451 __asm__ __volatile__ ("femms":::"memory"); | |
452 for( i=0; i < 128; i++) { | |
453 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
454 int j= pm128[i]; | |
455 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
456 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
457 } | |
458 #endif | |
459 | |
460 /* FFT Merge */ | |
461 /* unoptimized variant | |
462 for (m=1; m < 7; m++) { | |
463 if(m) | |
464 two_m = (1 << m); | |
465 else | |
466 two_m = 1; | |
467 | |
468 two_m_plus_one = (1 << (m+1)); | |
469 | |
470 for(i = 0; i < 128; i += two_m_plus_one) { | |
471 for(k = 0; k < two_m; k++) { | |
472 p = k + i; | |
473 q = p + two_m; | |
474 tmp_a_r = buf[p].real; | |
475 tmp_a_i = buf[p].imag; | |
476 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
477 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
478 buf[p].real = tmp_a_r + tmp_b_r; | |
479 buf[p].imag = tmp_a_i + tmp_b_i; | |
480 buf[q].real = tmp_a_r - tmp_b_r; | |
481 buf[q].imag = tmp_a_i - tmp_b_i; | |
482 } | |
483 } | |
484 } | |
485 */ | |
486 | |
487 fft_128p_3dnow (&buf[0]); | |
488 // asm volatile ("femms \n\t":::"memory"); | |
489 | |
490 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
491 #if 1 | |
492 __asm__ __volatile__ ( | |
493 "movq %0, %%mm7\n\t" | |
494 "movq %1, %%mm6\n\t" | |
495 ::"m"(x_plus_minus_3dnow), | |
496 "m"(x_minus_plus_3dnow) | |
497 :"eax","memory"); | |
498 for (i=0; i < 128; i++) { | |
499 __asm__ __volatile__ ( | |
500 "movq %1, %%mm0\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
501 "movq %%mm0, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
502 #ifndef HAVE_3DNOWEX | |
503 "movq %%mm1, %%mm2\n\t" | |
504 "psrlq $32, %%mm1\n\t" | |
505 "punpckldq %%mm2, %%mm1\n\t" | |
506 #else | |
507 "pswapd %%mm1, %%mm1\n\t" /* ac3_buf[i].re | ac3_buf[i].im */ | |
508 #endif | |
509 "movd %3, %%mm3\n\t" /* ac3_xsin[i] */ | |
510 "punpckldq %2, %%mm3\n\t" /* ac3_xsin[i] | ac3_xcos[i] */ | |
511 "pfmul %%mm3, %%mm0\n\t" | |
512 "pfmul %%mm3, %%mm1\n\t" | |
513 #ifndef HAVE_3DNOWEX | |
514 "pxor %%mm7, %%mm0\n\t" | |
515 "pfacc %%mm1, %%mm0\n\t" | |
516 "movd %%mm0, 4%0\n\t" | |
517 "psrlq $32, %%mm0\n\t" | |
518 "movd %%mm0, %0\n\t" | |
519 #else | |
520 "pfpnacc %%mm1, %%mm0\n\t" /* mm0 = mm0[0] - mm0[1] | mm1[0] + mm1[1] */ | |
521 "pswapd %%mm0, %%mm0\n\t" | |
522 "movq %%mm0, %0" | |
523 #endif | |
524 :"=m"(buf[i]) | |
525 :"m"(buf[i]),"m"(xcos1[i]),"m"(xsin1[i]) | |
526 :"memory"); | |
527 /* ac3_buf[i].re =(tmp_a_r * ac3_xcos1[i]) + (tmp_a_i * ac3_xsin1[i]); | |
528 ac3_buf[i].im =(tmp_a_r * ac3_xsin1[i]) - (tmp_a_i * ac3_xcos1[i]);*/ | |
529 } | |
530 #else | |
531 __asm__ __volatile__ ("femms":::"memory"); | |
532 for( i=0; i < 128; i++) { | |
533 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
534 tmp_a_r = buf[i].real; | |
535 tmp_a_i = -1.0 * buf[i].imag; | |
536 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]); | |
537 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]); | |
538 } | |
539 #endif | |
540 | |
541 data_ptr = data; | |
542 delay_ptr = delay; | |
543 window_ptr = imdct_window; | |
544 | |
545 /* Window and convert to real valued signal */ | |
546 #if 1 | |
547 asm volatile ( | |
548 "movd (%0), %%mm3 \n\t" | |
549 "punpckldq %%mm3, %%mm3 \n\t" | |
550 :: "r" (&bias) | |
551 ); | |
552 for (i=0; i< 64; i++) { | |
553 /* merge two loops in one to enable working of 2 decoders */ | |
554 __asm__ __volatile__ ( | |
555 "movd 516(%1), %%mm0\n\t" | |
556 "movd (%1), %%mm1\n\t" /**data_ptr++=-buf[64+i].im**window_ptr+++*delay_ptr++;*/ | |
557 "punpckldq (%2), %%mm0\n\t"/*data_ptr[128]=-buf[i].re*window_ptr[128]+delay_ptr[128];*/ | |
558 "punpckldq 516(%2), %%mm1\n\t" | |
559 "pfmul (%3), %%mm0\n\t"/**data_ptr++=buf[64-i-1].re**window_ptr+++*delay_ptr++;*/ | |
560 "pfmul 512(%3), %%mm1\n\t" | |
561 "pxor %%mm6, %%mm0\n\t"/*data_ptr[128]=buf[128-i-1].im*window_ptr[128]+delay_ptr[128];*/ | |
562 "pxor %%mm6, %%mm1\n\t" | |
563 "pfadd (%4), %%mm0\n\t" | |
564 "pfadd 512(%4), %%mm1\n\t" | |
565 "pfadd %%mm3, %%mm0\n\t" | |
566 "pfadd %%mm3, %%mm1\n\t" | |
567 "movq %%mm0, (%0)\n\t" | |
568 "movq %%mm1, 512(%0)" | |
569 :"=r"(data_ptr) | |
570 :"r"(&buf[i].real), "r"(&buf[64-i-1].real), "r"(window_ptr), "r"(delay_ptr), "0"(data_ptr) | |
571 :"memory"); | |
572 data_ptr += 2; | |
573 window_ptr += 2; | |
574 delay_ptr += 2; | |
575 } | |
576 window_ptr += 128; | |
577 #else | |
578 __asm__ __volatile__ ("femms":::"memory"); | |
579 for(i=0; i< 64; i++) { | |
580 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
581 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
582 } | |
583 | |
584 for(i=0; i< 64; i++) { | |
585 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
586 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
587 } | |
588 #endif | |
589 | |
590 /* The trailing edge of the window goes into the delay line */ | |
591 delay_ptr = delay; | |
592 #if 1 | |
593 for(i=0; i< 64; i++) { | |
594 /* merge two loops in one to enable working of 2 decoders */ | |
595 window_ptr -=2; | |
596 __asm__ __volatile__( | |
597 "movd 508(%1), %%mm0\n\t" | |
598 "movd (%1), %%mm1\n\t" | |
599 "punpckldq (%2), %%mm0\n\t" | |
600 "punpckldq 508(%2), %%mm1\n\t" | |
601 #ifdef HAVE_3DNOWEX | |
602 "pswapd (%3), %%mm3\n\t" | |
603 "pswapd -512(%3), %%mm4\n\t" | |
604 #else | |
605 "movq (%3), %%mm3\n\t"/**delay_ptr++=-buf[64+i].re**--window_ptr;*/ | |
606 "movq -512(%3), %%mm4\n\t" | |
607 "psrlq $32, %%mm3\n\t"/*delay_ptr[128]=buf[i].im**window_ptr[-512];*/ | |
608 "psrlq $32, %%mm4\n\t"/**delay_ptr++=buf[64-i-1].im**--window_ptr;*/ | |
609 "punpckldq (%3), %%mm3\n\t"/*delay_ptr[128]=-buf[128-i-1].re**window_ptr[-512];*/ | |
610 "punpckldq -512(%3), %%mm4\n\t" | |
611 #endif | |
612 "pfmul %%mm3, %%mm0\n\t" | |
613 "pfmul %%mm4, %%mm1\n\t" | |
614 "pxor %%mm6, %%mm0\n\t" | |
615 "pxor %%mm7, %%mm1\n\t" | |
616 "movq %%mm0, (%0)\n\t" | |
617 "movq %%mm1, 512(%0)" | |
618 :"=r"(delay_ptr) | |
619 :"r"(&buf[i].imag), "r"(&buf[64-i-1].imag), "r"(window_ptr), "0"(delay_ptr) | |
620 :"memory"); | |
621 delay_ptr += 2; | |
622 } | |
623 __asm__ __volatile__ ("femms":::"memory"); | |
624 #else | |
625 __asm__ __volatile__ ("femms":::"memory"); | |
626 for(i=0; i< 64; i++) { | |
627 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
628 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
629 } | |
630 | |
631 for(i=0; i<64; i++) { | |
632 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
633 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
634 } | |
635 #endif | |
636 } | |
637 | |
638 | |
3579 | 639 void |
640 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) | |
641 { | |
642 int i,k; | |
643 int p,q; | |
644 int m; | |
645 int two_m; | |
646 int two_m_plus_one; | |
647 | |
648 sample_t tmp_a_i; | |
649 sample_t tmp_a_r; | |
650 sample_t tmp_b_i; | |
651 sample_t tmp_b_r; | |
652 | |
653 sample_t *data_ptr; | |
654 sample_t *delay_ptr; | |
655 sample_t *window_ptr; | |
656 | |
657 /* 512 IMDCT with source and dest data in 'data' */ | |
3623 | 658 /* see the c version (dct_do_512()), its allmost identical, just in C */ |
659 | |
3579 | 660 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ |
661 /* Bit reversed shuffling */ | |
662 asm volatile( | |
663 "xorl %%esi, %%esi \n\t" | |
664 "leal bit_reverse_512, %%eax \n\t" | |
665 "movl $1008, %%edi \n\t" | |
666 "pushl %%ebp \n\t" //use ebp without telling gcc | |
667 ".balign 16 \n\t" | |
668 "1: \n\t" | |
3584 | 669 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI |
670 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI | |
671 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi | |
672 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi | |
673 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR | |
674 "movaps sseSinCos1c(%%esi), %%xmm2 \n\t" | |
675 "mulps %%xmm0, %%xmm2 \n\t" | |
676 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI | |
677 "mulps sseSinCos1d(%%esi), %%xmm0 \n\t" | |
678 "subps %%xmm0, %%xmm2 \n\t" | |
3579 | 679 "movzbl (%%eax), %%edx \n\t" |
680 "movzbl 1(%%eax), %%ebp \n\t" | |
3584 | 681 "movlps %%xmm2, (%1, %%edx,8) \n\t" |
682 "movhps %%xmm2, (%1, %%ebp,8) \n\t" | |
3579 | 683 "addl $16, %%esi \n\t" |
684 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap | |
685 "subl $16, %%edi \n\t" | |
686 " jnc 1b \n\t" | |
687 "popl %%ebp \n\t"//no we didnt touch ebp *g* | |
688 :: "b" (data), "c" (buf) | |
689 : "%esi", "%edi", "%eax", "%edx" | |
690 ); | |
691 | |
692 | |
693 /* FFT Merge */ | |
694 /* unoptimized variant | |
695 for (m=1; m < 7; m++) { | |
696 if(m) | |
697 two_m = (1 << m); | |
698 else | |
699 two_m = 1; | |
700 | |
701 two_m_plus_one = (1 << (m+1)); | |
702 | |
703 for(i = 0; i < 128; i += two_m_plus_one) { | |
704 for(k = 0; k < two_m; k++) { | |
705 p = k + i; | |
706 q = p + two_m; | |
707 tmp_a_r = buf[p].real; | |
708 tmp_a_i = buf[p].imag; | |
709 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
710 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
711 buf[p].real = tmp_a_r + tmp_b_r; | |
712 buf[p].imag = tmp_a_i + tmp_b_i; | |
713 buf[q].real = tmp_a_r - tmp_b_r; | |
714 buf[q].imag = tmp_a_i - tmp_b_i; | |
715 } | |
716 } | |
717 } | |
718 */ | |
719 | |
3623 | 720 /* 1. iteration */ |
3549 | 721 // Note w[0][0]={1,0} |
3508 | 722 asm volatile( |
723 "xorps %%xmm1, %%xmm1 \n\t" | |
724 "xorps %%xmm2, %%xmm2 \n\t" | |
725 "movl %0, %%esi \n\t" | |
3529 | 726 ".balign 16 \n\t" |
3508 | 727 "1: \n\t" |
728 "movlps (%%esi), %%xmm0 \n\t" //buf[p] | |
729 "movlps 8(%%esi), %%xmm1\n\t" //buf[q] | |
730 "movhps (%%esi), %%xmm0 \n\t" //buf[p] | |
731 "movhps 8(%%esi), %%xmm2\n\t" //buf[q] | |
732 "addps %%xmm1, %%xmm0 \n\t" | |
733 "subps %%xmm2, %%xmm0 \n\t" | |
734 "movaps %%xmm0, (%%esi) \n\t" | |
735 "addl $16, %%esi \n\t" | |
736 "cmpl %1, %%esi \n\t" | |
737 " jb 1b \n\t" | |
738 :: "g" (buf), "r" (buf + 128) | |
739 : "%esi" | |
740 ); | |
3549 | 741 |
3623 | 742 /* 2. iteration */ |
3512 | 743 // Note w[1]={{1,0}, {0,-1}} |
744 asm volatile( | |
745 "movaps ps111_1, %%xmm7 \n\t" // 1,1,1,-1 | |
746 "movl %0, %%esi \n\t" | |
3529 | 747 ".balign 16 \n\t" |
3512 | 748 "1: \n\t" |
749 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 | |
750 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | |
751 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | |
752 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
753 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 | |
754 "addps %%xmm2, %%xmm0 \n\t" | |
755 "subps %%xmm2, %%xmm1 \n\t" | |
756 "movaps %%xmm0, (%%esi) \n\t" | |
757 "movaps %%xmm1, 16(%%esi) \n\t" | |
758 "addl $32, %%esi \n\t" | |
759 "cmpl %1, %%esi \n\t" | |
760 " jb 1b \n\t" | |
761 :: "g" (buf), "r" (buf + 128) | |
762 : "%esi" | |
763 ); | |
3549 | 764 |
3623 | 765 /* 3. iteration */ |
3534 | 766 /* |
767 Note sseW2+0={1,1,sqrt(2),sqrt(2)) | |
768 Note sseW2+16={0,0,sqrt(2),-sqrt(2)) | |
769 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) | |
770 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) | |
771 */ | |
772 asm volatile( | |
3537 | 773 "movaps 48+sseW2, %%xmm6 \n\t" |
3534 | 774 "movaps 16+sseW2, %%xmm7 \n\t" |
775 "xorps %%xmm5, %%xmm5 \n\t" | |
776 "xorps %%xmm2, %%xmm2 \n\t" | |
777 "movl %0, %%esi \n\t" | |
778 ".balign 16 \n\t" | |
779 "1: \n\t" | |
3537 | 780 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 |
3534 | 781 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 |
3537 | 782 "movaps sseW2, %%xmm4 \n\t" //r4,i4,r5,i5 |
783 "movaps 32+sseW2, %%xmm5 \n\t" //r6,i6,r7,i7 | |
784 "mulps %%xmm2, %%xmm4 \n\t" | |
785 "mulps %%xmm3, %%xmm5 \n\t" | |
3534 | 786 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 |
787 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | |
3537 | 788 "mulps %%xmm6, %%xmm3 \n\t" |
3534 | 789 "mulps %%xmm7, %%xmm2 \n\t" |
790 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
791 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | |
792 "addps %%xmm4, %%xmm2 \n\t" | |
793 "addps %%xmm5, %%xmm3 \n\t" | |
794 "movaps %%xmm2, %%xmm4 \n\t" | |
795 "movaps %%xmm3, %%xmm5 \n\t" | |
796 "addps %%xmm0, %%xmm2 \n\t" | |
797 "addps %%xmm1, %%xmm3 \n\t" | |
798 "subps %%xmm4, %%xmm0 \n\t" | |
799 "subps %%xmm5, %%xmm1 \n\t" | |
800 "movaps %%xmm2, (%%esi) \n\t" | |
801 "movaps %%xmm3, 16(%%esi) \n\t" | |
802 "movaps %%xmm0, 32(%%esi) \n\t" | |
803 "movaps %%xmm1, 48(%%esi) \n\t" | |
804 "addl $64, %%esi \n\t" | |
805 "cmpl %1, %%esi \n\t" | |
806 " jb 1b \n\t" | |
807 :: "g" (buf), "r" (buf + 128) | |
808 : "%esi" | |
809 ); | |
3508 | 810 |
3623 | 811 /* 4-7. iterations */ |
3546 | 812 for (m=3; m < 7; m++) { |
813 two_m = (1 << m); | |
814 two_m_plus_one = two_m<<1; | |
815 asm volatile( | |
816 "movl %0, %%esi \n\t" | |
817 ".balign 16 \n\t" | |
818 "1: \n\t" | |
819 "xorl %%edi, %%edi \n\t" // k | |
820 "leal (%%esi, %3), %%edx \n\t" | |
821 "2: \n\t" | |
822 "movaps (%%edx, %%edi), %%xmm1 \n\t" | |
823 "movaps (%4, %%edi, 2), %%xmm2 \n\t" | |
824 "mulps %%xmm1, %%xmm2 \n\t" | |
825 "shufps $0xB1, %%xmm1, %%xmm1 \n\t" | |
826 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" | |
827 "movaps (%%esi, %%edi), %%xmm0 \n\t" | |
828 "addps %%xmm2, %%xmm1 \n\t" | |
829 "movaps %%xmm1, %%xmm2 \n\t" | |
830 "addps %%xmm0, %%xmm1 \n\t" | |
831 "subps %%xmm2, %%xmm0 \n\t" | |
832 "movaps %%xmm1, (%%esi, %%edi) \n\t" | |
833 "movaps %%xmm0, (%%edx, %%edi) \n\t" | |
834 "addl $16, %%edi \n\t" | |
835 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 | |
836 " jb 2b \n\t" | |
837 "addl %2, %%esi \n\t" | |
838 "cmpl %1, %%esi \n\t" | |
839 " jb 1b \n\t" | |
840 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3), | |
841 "r" (sseW[m]) | |
842 : "%esi", "%edi", "%edx" | |
843 ); | |
844 } | |
845 | |
3623 | 846 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
3581 | 847 asm volatile( |
848 "movl $-1024, %%esi \n\t" | |
849 ".balign 16 \n\t" | |
850 "1: \n\t" | |
851 "movaps (%0, %%esi), %%xmm0 \n\t" | |
852 "movaps (%0, %%esi), %%xmm1 \n\t" | |
853 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | |
854 "mulps 1024+sseSinCos1c(%%esi), %%xmm1 \n\t" | |
855 "mulps 1024+sseSinCos1d(%%esi), %%xmm0 \n\t" | |
856 "addps %%xmm1, %%xmm0 \n\t" | |
857 "movaps %%xmm0, (%0, %%esi) \n\t" | |
858 "addl $16, %%esi \n\t" | |
859 " jnz 1b \n\t" | |
860 :: "r" (buf+128) | |
861 : "%esi" | |
862 ); | |
863 | |
3394 | 864 |
865 data_ptr = data; | |
866 delay_ptr = delay; | |
867 window_ptr = imdct_window; | |
868 | |
869 /* Window and convert to real valued signal */ | |
3552 | 870 asm volatile( |
871 "xorl %%edi, %%edi \n\t" // 0 | |
872 "xorl %%esi, %%esi \n\t" // 0 | |
873 "movss %3, %%xmm2 \n\t" // bias | |
874 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
875 ".balign 16 \n\t" | |
876 "1: \n\t" | |
877 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
878 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
879 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
880 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
881 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
882 "mulps sseWindow(%%esi), %%xmm0 \n\t" | |
883 "addps (%2, %%esi), %%xmm0 \n\t" | |
884 "addps %%xmm2, %%xmm0 \n\t" | |
885 "movaps %%xmm0, (%1, %%esi) \n\t" | |
886 "addl $16, %%esi \n\t" | |
887 "subl $16, %%edi \n\t" | |
888 "cmpl $512, %%esi \n\t" | |
889 " jb 1b \n\t" | |
890 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
891 : "%esi", "%edi" | |
892 ); | |
893 data_ptr+=128; | |
894 delay_ptr+=128; | |
3553 | 895 // window_ptr+=128; |
3579 | 896 |
3552 | 897 asm volatile( |
898 "movl $1024, %%edi \n\t" // 512 | |
899 "xorl %%esi, %%esi \n\t" // 0 | |
900 "movss %3, %%xmm2 \n\t" // bias | |
901 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
902 ".balign 16 \n\t" | |
903 "1: \n\t" | |
904 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
905 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
906 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
907 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
908 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
909 "mulps 512+sseWindow(%%esi), %%xmm0 \n\t" | |
910 "addps (%2, %%esi), %%xmm0 \n\t" | |
911 "addps %%xmm2, %%xmm0 \n\t" | |
912 "movaps %%xmm0, (%1, %%esi) \n\t" | |
913 "addl $16, %%esi \n\t" | |
914 "subl $16, %%edi \n\t" | |
915 "cmpl $512, %%esi \n\t" | |
916 " jb 1b \n\t" | |
917 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
918 : "%esi", "%edi" | |
919 ); | |
920 data_ptr+=128; | |
3553 | 921 // window_ptr+=128; |
3394 | 922 |
923 /* The trailing edge of the window goes into the delay line */ | |
924 delay_ptr = delay; | |
925 | |
3553 | 926 asm volatile( |
927 "xorl %%edi, %%edi \n\t" // 0 | |
928 "xorl %%esi, %%esi \n\t" // 0 | |
929 ".balign 16 \n\t" | |
930 "1: \n\t" | |
931 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
932 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
933 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
934 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
935 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
936 "mulps 1024+sseWindow(%%esi), %%xmm0 \n\t" | |
937 "movaps %%xmm0, (%1, %%esi) \n\t" | |
938 "addl $16, %%esi \n\t" | |
939 "subl $16, %%edi \n\t" | |
940 "cmpl $512, %%esi \n\t" | |
941 " jb 1b \n\t" | |
942 :: "r" (buf+64), "r" (delay_ptr) | |
943 : "%esi", "%edi" | |
944 ); | |
945 delay_ptr+=128; | |
946 // window_ptr-=128; | |
3579 | 947 |
3553 | 948 asm volatile( |
949 "movl $1024, %%edi \n\t" // 1024 | |
950 "xorl %%esi, %%esi \n\t" // 0 | |
951 ".balign 16 \n\t" | |
952 "1: \n\t" | |
953 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
954 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
955 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
956 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
957 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
958 "mulps 1536+sseWindow(%%esi), %%xmm0 \n\t" | |
959 "movaps %%xmm0, (%1, %%esi) \n\t" | |
960 "addl $16, %%esi \n\t" | |
961 "subl $16, %%edi \n\t" | |
962 "cmpl $512, %%esi \n\t" | |
963 " jb 1b \n\t" | |
964 :: "r" (buf), "r" (delay_ptr) | |
965 : "%esi", "%edi" | |
966 ); | |
3394 | 967 } |
3579 | 968 #endif //arch_x86 |
3394 | 969 |
970 void | |
971 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | |
972 { | |
973 int i,k; | |
974 int p,q; | |
975 int m; | |
976 int two_m; | |
977 int two_m_plus_one; | |
978 | |
979 sample_t tmp_a_i; | |
980 sample_t tmp_a_r; | |
981 sample_t tmp_b_i; | |
982 sample_t tmp_b_r; | |
983 | |
984 sample_t *data_ptr; | |
985 sample_t *delay_ptr; | |
986 sample_t *window_ptr; | |
987 | |
988 complex_t *buf_1, *buf_2; | |
989 | |
990 buf_1 = &buf[0]; | |
991 buf_2 = &buf[64]; | |
992 | |
993 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
994 for(k=0; k<64; k++) { | |
995 /* X1[k] = X[2*k] */ | |
996 /* X2[k] = X[2*k+1] */ | |
997 | |
998 p = 2 * (128-2*k-1); | |
999 q = 2 * (2 * k); | |
1000 | |
1001 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */ | |
1002 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k]; | |
1003 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]); | |
1004 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */ | |
1005 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k]; | |
1006 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]); | |
1007 } | |
1008 | |
1009 /* IFFT Bit reversed shuffling */ | |
1010 for(i=0; i<64; i++) { | |
1011 k = bit_reverse_256[i]; | |
1012 if (k < i) { | |
1013 swap_cmplx(&buf_1[i],&buf_1[k]); | |
1014 swap_cmplx(&buf_2[i],&buf_2[k]); | |
1015 } | |
1016 } | |
1017 | |
1018 /* FFT Merge */ | |
1019 for (m=0; m < 6; m++) { | |
1020 two_m = (1 << m); | |
1021 two_m_plus_one = (1 << (m+1)); | |
1022 | |
1023 /* FIXME */ | |
1024 if(m) | |
1025 two_m = (1 << m); | |
1026 else | |
1027 two_m = 1; | |
1028 | |
1029 for(k = 0; k < two_m; k++) { | |
1030 for(i = 0; i < 64; i += two_m_plus_one) { | |
1031 p = k + i; | |
1032 q = p + two_m; | |
1033 /* Do block 1 */ | |
1034 tmp_a_r = buf_1[p].real; | |
1035 tmp_a_i = buf_1[p].imag; | |
1036 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag; | |
1037 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag; | |
1038 buf_1[p].real = tmp_a_r + tmp_b_r; | |
1039 buf_1[p].imag = tmp_a_i + tmp_b_i; | |
1040 buf_1[q].real = tmp_a_r - tmp_b_r; | |
1041 buf_1[q].imag = tmp_a_i - tmp_b_i; | |
1042 | |
1043 /* Do block 2 */ | |
1044 tmp_a_r = buf_2[p].real; | |
1045 tmp_a_i = buf_2[p].imag; | |
1046 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag; | |
1047 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag; | |
1048 buf_2[p].real = tmp_a_r + tmp_b_r; | |
1049 buf_2[p].imag = tmp_a_i + tmp_b_i; | |
1050 buf_2[q].real = tmp_a_r - tmp_b_r; | |
1051 buf_2[q].imag = tmp_a_i - tmp_b_i; | |
1052 } | |
1053 } | |
1054 } | |
1055 | |
1056 /* Post IFFT complex multiply */ | |
1057 for( i=0; i < 64; i++) { | |
1058 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */ | |
1059 tmp_a_r = buf_1[i].real; | |
1060 tmp_a_i = -buf_1[i].imag; | |
1061 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); | |
1062 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); | |
1063 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */ | |
1064 tmp_a_r = buf_2[i].real; | |
1065 tmp_a_i = -buf_2[i].imag; | |
1066 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]); | |
1067 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]); | |
1068 } | |
1069 | |
1070 data_ptr = data; | |
1071 delay_ptr = delay; | |
1072 window_ptr = imdct_window; | |
1073 | |
1074 /* Window and convert to real valued signal */ | |
1075 for(i=0; i< 64; i++) { | |
1076 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1077 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
1078 } | |
1079 | |
1080 for(i=0; i< 64; i++) { | |
1081 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
1082 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
1083 } | |
1084 | |
1085 delay_ptr = delay; | |
1086 | |
1087 for(i=0; i< 64; i++) { | |
1088 *delay_ptr++ = -buf_2[i].real * *--window_ptr; | |
1089 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr; | |
1090 } | |
1091 | |
1092 for(i=0; i< 64; i++) { | |
1093 *delay_ptr++ = buf_2[i].imag * *--window_ptr; | |
1094 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr; | |
1095 } | |
1096 } | |
1097 | |
1098 void imdct_init (uint32_t mm_accel) | |
1099 { | |
1100 #ifdef LIBA52_MLIB | |
1101 if (mm_accel & MM_ACCEL_MLIB) { | |
1102 fprintf (stderr, "Using mlib for IMDCT transform\n"); | |
1103 imdct_512 = imdct_do_512_mlib; | |
1104 imdct_256 = imdct_do_256_mlib; | |
1105 } else | |
1106 #endif | |
1107 { | |
1108 int i, j, k; | |
1109 | |
3884 | 1110 if(gCpuCaps.hasSSE) fprintf (stderr, "Using SSE optimized IMDCT transform\n"); |
1111 else if(gCpuCaps.has3DNow) fprintf (stderr, "Using experimental 3DNow optimized IMDCT transform\n"); | |
1112 else fprintf (stderr, "No accelerated IMDCT transform found\n"); | |
3394 | 1113 |
1114 /* Twiddle factors to turn IFFT into IMDCT */ | |
1115 for (i = 0; i < 128; i++) { | |
1116 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | |
1117 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | |
1118 } | |
3579 | 1119 #ifdef ARCH_X86 |
3527 | 1120 for (i = 0; i < 128; i++) { |
3581 | 1121 sseSinCos1c[2*i+0]= xcos1[i]; |
1122 sseSinCos1c[2*i+1]= -xcos1[i]; | |
1123 sseSinCos1d[2*i+0]= xsin1[i]; | |
1124 sseSinCos1d[2*i+1]= xsin1[i]; | |
3527 | 1125 } |
1126 #endif | |
3394 | 1127 |
1128 /* More twiddle factors to turn IFFT into IMDCT */ | |
1129 for (i = 0; i < 64; i++) { | |
1130 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1)); | |
1131 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1)); | |
1132 } | |
1133 | |
1134 for (i = 0; i < 7; i++) { | |
1135 j = 1 << i; | |
1136 for (k = 0; k < j; k++) { | |
1137 w[i][k].real = cos (-M_PI * k / j); | |
1138 w[i][k].imag = sin (-M_PI * k / j); | |
1139 } | |
1140 } | |
3579 | 1141 #ifdef ARCH_X86 |
3534 | 1142 for (i = 1; i < 7; i++) { |
1143 j = 1 << i; | |
1144 for (k = 0; k < j; k+=2) { | |
1145 | |
1146 sseW[i][4*k + 0] = w[i][k+0].real; | |
1147 sseW[i][4*k + 1] = w[i][k+0].real; | |
1148 sseW[i][4*k + 2] = w[i][k+1].real; | |
1149 sseW[i][4*k + 3] = w[i][k+1].real; | |
1150 | |
1151 sseW[i][4*k + 4] = -w[i][k+0].imag; | |
1152 sseW[i][4*k + 5] = w[i][k+0].imag; | |
1153 sseW[i][4*k + 6] = -w[i][k+1].imag; | |
1154 sseW[i][4*k + 7] = w[i][k+1].imag; | |
1155 | |
1156 //we multiply more or less uninitalized numbers so we need to use exactly 0.0 | |
1157 if(k==0) | |
1158 { | |
1159 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; | |
1160 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; | |
1161 } | |
1162 | |
1163 if(2*k == j) | |
1164 { | |
1165 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; | |
1166 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); | |
1167 } | |
1168 } | |
1169 } | |
3552 | 1170 |
1171 for(i=0; i<128; i++) | |
1172 { | |
1173 sseWindow[2*i+0]= -imdct_window[2*i+0]; | |
3553 | 1174 sseWindow[2*i+1]= imdct_window[2*i+1]; |
3552 | 1175 } |
3553 | 1176 |
1177 for(i=0; i<64; i++) | |
1178 { | |
1179 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | |
1180 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | |
1181 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | |
1182 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | |
1183 } | |
3579 | 1184 #endif // arch_x86 |
1185 | |
3720
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1186 imdct_512 = imdct_do_512; |
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1187 #ifdef ARCH_X86 |
3884 | 1188 if(gCpuCaps.hasSSE) imdct_512 = imdct_do_512_sse; |
1189 else if(gCpuCaps.has3DNow) imdct_512 = imdct_do_512_3dnow; | |
3720
120ac80f13c2
Fixed #ifdef discrepancy that was breaking compilation on PPC platform
melanson
parents:
3623
diff
changeset
|
1190 #endif // arch_x86 |
3394 | 1191 imdct_256 = imdct_do_256; |
1192 } | |
1193 } | |
3884 | 1194 |
1195 // Stuff below this line is borrowed from libac3 | |
1196 #include "srfftp.h" | |
1197 | |
1198 #ifdef ARCH_X86 | |
1199 | |
1200 static void fft_4_3dnow(complex_t *x) | |
1201 { | |
1202 /* delta_p = 1 here */ | |
1203 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
1204 */ | |
1205 __asm__ __volatile__( | |
1206 "movq 24(%1), %%mm3\n\t" | |
1207 "movq 8(%1), %%mm1\n\t" | |
1208 "pxor %2, %%mm3\n\t" /* mm3.re | -mm3.im */ | |
1209 "pxor %3, %%mm1\n\t" /* -mm1.re | mm1.im */ | |
1210 "pfadd %%mm1, %%mm3\n\t" /* vi.im = x[3].re - x[1].re; */ | |
1211 "movq %%mm3, %%mm4\n\t" /* vi.re =-x[3].im + x[1].im; mm4 = vi */ | |
1212 #ifdef HAVE_3DNOWEX | |
1213 "pswapd %%mm4, %%mm4\n\t" | |
1214 #else | |
1215 "movq %%mm4, %%mm5\n\t" | |
1216 "psrlq $32, %%mm4\n\t" | |
1217 "punpckldq %%mm5, %%mm4\n\t" | |
1218 #endif | |
1219 "movq (%1), %%mm5\n\t" /* yb.re = x[0].re - x[2].re; */ | |
1220 "movq (%1), %%mm6\n\t" /* yt.re = x[0].re + x[2].re; */ | |
1221 "movq 24(%1), %%mm7\n\t" /* u.re = x[3].re + x[1].re; */ | |
1222 "pfsub 16(%1), %%mm5\n\t" /* yb.im = x[0].im - x[2].im; mm5 = yb */ | |
1223 "pfadd 16(%1), %%mm6\n\t" /* yt.im = x[0].im + x[2].im; mm6 = yt */ | |
1224 "pfadd 8(%1), %%mm7\n\t" /* u.im = x[3].im + x[1].im; mm7 = u */ | |
1225 | |
1226 "movq %%mm6, %%mm0\n\t" /* x[0].re = yt.re + u.re; */ | |
1227 "movq %%mm5, %%mm1\n\t" /* x[1].re = yb.re + vi.re; */ | |
1228 "pfadd %%mm7, %%mm0\n\t" /*x[0].im = yt.im + u.im; */ | |
1229 "pfadd %%mm4, %%mm1\n\t" /* x[1].im = yb.im + vi.im; */ | |
1230 "movq %%mm0, (%0)\n\t" | |
1231 "movq %%mm1, 8(%0)\n\t" | |
1232 | |
1233 "pfsub %%mm7, %%mm6\n\t" /* x[2].re = yt.re - u.re; */ | |
1234 "pfsub %%mm4, %%mm5\n\t" /* x[3].re = yb.re - vi.re; */ | |
1235 "movq %%mm6, 16(%0)\n\t" /* x[2].im = yt.im - u.im; */ | |
1236 "movq %%mm5, 24(%0)" /* x[3].im = yb.im - vi.im; */ | |
1237 :"=r"(x) | |
1238 :"0"(x), | |
1239 "m"(x_plus_minus_3dnow), | |
1240 "m"(x_minus_plus_3dnow) | |
1241 :"memory"); | |
1242 } | |
1243 | |
1244 static void fft_8_3dnow(complex_t *x) | |
1245 { | |
1246 /* delta_p = diag{1, sqrt(i)} here */ | |
1247 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
1248 */ | |
1249 complex_t wT1, wB1, wB2; | |
1250 | |
1251 __asm__ __volatile__( | |
1252 "movq 8(%2), %%mm0\n\t" | |
1253 "movq 24(%2), %%mm1\n\t" | |
1254 "movq %%mm0, %0\n\t" /* wT1 = x[1]; */ | |
1255 "movq %%mm1, %1\n\t" /* wB1 = x[3]; */ | |
1256 :"=m"(wT1), "=m"(wB1) | |
1257 :"r"(x) | |
1258 :"memory"); | |
1259 | |
1260 __asm__ __volatile__( | |
1261 "movq 16(%0), %%mm2\n\t" | |
1262 "movq 32(%0), %%mm3\n\t" | |
1263 "movq %%mm2, 8(%0)\n\t" /* x[1] = x[2]; */ | |
1264 "movq 48(%0), %%mm4\n\t" | |
1265 "movq %%mm3, 16(%0)\n\t" /* x[2] = x[4]; */ | |
1266 "movq %%mm4, 24(%0)\n\t" /* x[3] = x[6]; */ | |
1267 :"=r"(x) | |
1268 :"0"(x) | |
1269 :"memory"); | |
1270 | |
1271 fft_4_3dnow(&x[0]); | |
1272 | |
1273 /* x[0] x[4] x[2] x[6] */ | |
1274 | |
1275 __asm__ __volatile__( | |
1276 "movq 40(%1), %%mm0\n\t" | |
1277 "movq %%mm0, %%mm3\n\t" | |
1278 "movq 56(%1), %%mm1\n\t" | |
1279 "pfadd %%mm1, %%mm0\n\t" | |
1280 "pfsub %%mm1, %%mm3\n\t" | |
1281 "movq (%2), %%mm2\n\t" | |
1282 "pfadd %%mm2, %%mm0\n\t" | |
1283 "pfadd %%mm2, %%mm3\n\t" | |
1284 "movq (%3), %%mm1\n\t" | |
1285 "pfadd %%mm1, %%mm0\n\t" | |
1286 "pfsub %%mm1, %%mm3\n\t" | |
1287 "movq (%1), %%mm1\n\t" | |
1288 "movq 16(%1), %%mm4\n\t" | |
1289 "movq %%mm1, %%mm2\n\t" | |
1290 #ifdef HAVE_3DNOWEX | |
1291 "pswapd %%mm3, %%mm3\n\t" | |
1292 #else | |
1293 "movq %%mm3, %%mm6\n\t" | |
1294 "psrlq $32, %%mm3\n\t" | |
1295 "punpckldq %%mm6, %%mm3\n\t" | |
1296 #endif | |
1297 "pfadd %%mm0, %%mm1\n\t" | |
1298 "movq %%mm4, %%mm5\n\t" | |
1299 "pfsub %%mm0, %%mm2\n\t" | |
1300 "pfadd %%mm3, %%mm4\n\t" | |
1301 "movq %%mm1, (%0)\n\t" | |
1302 "pfsub %%mm3, %%mm5\n\t" | |
1303 "movq %%mm2, 32(%0)\n\t" | |
1304 "movd %%mm4, 16(%0)\n\t" | |
1305 "movd %%mm5, 48(%0)\n\t" | |
1306 "psrlq $32, %%mm4\n\t" | |
1307 "psrlq $32, %%mm5\n\t" | |
1308 "movd %%mm4, 52(%0)\n\t" | |
1309 "movd %%mm5, 20(%0)" | |
1310 :"=r"(x) | |
1311 :"0"(x), "r"(&wT1), "r"(&wB1) | |
1312 :"memory"); | |
1313 | |
1314 /* x[1] x[5] */ | |
1315 __asm__ __volatile__ ( | |
1316 "movq %6, %%mm6\n\t" | |
1317 "movq %5, %%mm7\n\t" | |
1318 "movq %1, %%mm0\n\t" | |
1319 "movq %2, %%mm1\n\t" | |
1320 "movq 56(%3), %%mm3\n\t" | |
1321 "pfsub 40(%3), %%mm0\n\t" | |
1322 #ifdef HAVE_3DNOWEX | |
1323 "pswapd %%mm1, %%mm1\n\t" | |
1324 #else | |
1325 "movq %%mm1, %%mm2\n\t" | |
1326 "psrlq $32, %%mm1\n\t" | |
1327 "punpckldq %%mm2,%%mm1\n\t" | |
1328 #endif | |
1329 "pxor %%mm7, %%mm1\n\t" | |
1330 "pfadd %%mm1, %%mm0\n\t" | |
1331 #ifdef HAVE_3DNOWEX | |
1332 "pswapd %%mm3, %%mm3\n\t" | |
1333 #else | |
1334 "movq %%mm3, %%mm2\n\t" | |
1335 "psrlq $32, %%mm3\n\t" | |
1336 "punpckldq %%mm2,%%mm3\n\t" | |
1337 #endif | |
1338 "pxor %%mm6, %%mm3\n\t" | |
1339 "pfadd %%mm3, %%mm0\n\t" | |
1340 "movq %%mm0, %%mm1\n\t" | |
1341 "pxor %%mm6, %%mm1\n\t" | |
1342 "pfacc %%mm1, %%mm0\n\t" | |
1343 "pfmul %4, %%mm0\n\t" | |
1344 | |
1345 "movq 40(%3), %%mm5\n\t" | |
1346 #ifdef HAVE_3DNOWEX | |
1347 "pswapd %%mm5, %%mm5\n\t" | |
1348 #else | |
1349 "movq %%mm5, %%mm1\n\t" | |
1350 "psrlq $32, %%mm5\n\t" | |
1351 "punpckldq %%mm1,%%mm5\n\t" | |
1352 #endif | |
1353 "movq %%mm5, %0\n\t" | |
1354 | |
1355 "movq 8(%3), %%mm1\n\t" | |
1356 "movq %%mm1, %%mm2\n\t" | |
1357 "pfsub %%mm0, %%mm1\n\t" | |
1358 "pfadd %%mm0, %%mm2\n\t" | |
1359 "movq %%mm1, 40(%3)\n\t" | |
1360 "movq %%mm2, 8(%3)\n\t" | |
1361 :"=m"(wB2) | |
1362 :"m"(wT1), "m"(wB1), "r"(x), "m"(HSQRT2_3DNOW), | |
1363 "m"(x_plus_minus_3dnow), "m"(x_minus_plus_3dnow) | |
1364 :"memory"); | |
1365 | |
1366 | |
1367 /* x[3] x[7] */ | |
1368 __asm__ __volatile__( | |
1369 "movq %1, %%mm0\n\t" | |
1370 #ifdef HAVE_3DNOWEX | |
1371 "pswapd %3, %%mm1\n\t" | |
1372 #else | |
1373 "movq %3, %%mm1\n\t" | |
1374 "psrlq $32, %%mm1\n\t" | |
1375 "punpckldq %3, %%mm1\n\t" | |
1376 #endif | |
1377 "pxor %%mm6, %%mm1\n\t" | |
1378 "pfadd %%mm1, %%mm0\n\t" | |
1379 "movq %2, %%mm2\n\t" | |
1380 "movq 56(%4), %%mm3\n\t" | |
1381 "pxor %%mm7, %%mm3\n\t" | |
1382 "pfadd %%mm3, %%mm2\n\t" | |
1383 #ifdef HAVE_3DNOWEX | |
1384 "pswapd %%mm2, %%mm2\n\t" | |
1385 #else | |
1386 "movq %%mm2, %%mm5\n\t" | |
1387 "psrlq $32, %%mm2\n\t" | |
1388 "punpckldq %%mm5,%%mm2\n\t" | |
1389 #endif | |
1390 "movq 24(%4), %%mm3\n\t" | |
1391 "pfsub %%mm2, %%mm0\n\t" | |
1392 "movq %%mm3, %%mm4\n\t" | |
1393 "movq %%mm0, %%mm1\n\t" | |
1394 "pxor %%mm6, %%mm0\n\t" | |
1395 "pfacc %%mm1, %%mm0\n\t" | |
1396 "pfmul %5, %%mm0\n\t" | |
1397 "movq %%mm0, %%mm1\n\t" | |
1398 "pxor %%mm6, %%mm1\n\t" | |
1399 "pxor %%mm7, %%mm0\n\t" | |
1400 "pfadd %%mm1, %%mm3\n\t" | |
1401 "pfadd %%mm0, %%mm4\n\t" | |
1402 "movq %%mm4, 24(%0)\n\t" | |
1403 "movq %%mm3, 56(%0)\n\t" | |
1404 :"=r"(x) | |
1405 :"m"(wT1), "m"(wB2), "m"(wB1), "0"(x), "m"(HSQRT2_3DNOW) | |
1406 :"memory"); | |
1407 } | |
1408 | |
1409 static void fft_asmb_3dnow(int k, complex_t *x, complex_t *wTB, | |
1410 const complex_t *d, const complex_t *d_3) | |
1411 { | |
1412 register complex_t *x2k, *x3k, *x4k, *wB; | |
1413 | |
1414 TRANS_FILL_MM6_MM7_3DNOW(); | |
1415 x2k = x + 2 * k; | |
1416 x3k = x2k + 2 * k; | |
1417 x4k = x3k + 2 * k; | |
1418 wB = wTB + 2 * k; | |
1419 | |
1420 TRANSZERO_3DNOW(x[0],x2k[0],x3k[0],x4k[0]); | |
1421 TRANS_3DNOW(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
1422 | |
1423 --k; | |
1424 for(;;) { | |
1425 TRANS_3DNOW(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
1426 TRANS_3DNOW(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
1427 if (!--k) break; | |
1428 x += 2; | |
1429 x2k += 2; | |
1430 x3k += 2; | |
1431 x4k += 2; | |
1432 d += 2; | |
1433 d_3 += 2; | |
1434 wTB += 2; | |
1435 wB += 2; | |
1436 } | |
1437 | |
1438 } | |
1439 | |
1440 void fft_asmb16_3dnow(complex_t *x, complex_t *wTB) | |
1441 { | |
1442 int k = 2; | |
1443 | |
1444 TRANS_FILL_MM6_MM7_3DNOW(); | |
1445 /* transform x[0], x[8], x[4], x[12] */ | |
1446 TRANSZERO_3DNOW(x[0],x[4],x[8],x[12]); | |
1447 | |
1448 /* transform x[1], x[9], x[5], x[13] */ | |
1449 TRANS_3DNOW(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
1450 | |
1451 /* transform x[2], x[10], x[6], x[14] */ | |
1452 TRANSHALF_16_3DNOW(x[2],x[6],x[10],x[14]); | |
1453 | |
1454 /* transform x[3], x[11], x[7], x[15] */ | |
1455 TRANS_3DNOW(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
1456 | |
1457 } | |
1458 | |
1459 static void fft_128p_3dnow(complex_t *a) | |
1460 { | |
1461 fft_8_3dnow(&a[0]); fft_4_3dnow(&a[8]); fft_4_3dnow(&a[12]); | |
1462 fft_asmb16_3dnow(&a[0], &a[8]); | |
1463 | |
1464 fft_8_3dnow(&a[16]), fft_8_3dnow(&a[24]); | |
1465 fft_asmb_3dnow(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
1466 | |
1467 fft_8_3dnow(&a[32]); fft_4_3dnow(&a[40]); fft_4_3dnow(&a[44]); | |
1468 fft_asmb16_3dnow(&a[32], &a[40]); | |
1469 | |
1470 fft_8_3dnow(&a[48]); fft_4_3dnow(&a[56]); fft_4_3dnow(&a[60]); | |
1471 fft_asmb16_3dnow(&a[48], &a[56]); | |
1472 | |
1473 fft_asmb_3dnow(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
1474 | |
1475 fft_8_3dnow(&a[64]); fft_4_3dnow(&a[72]); fft_4_3dnow(&a[76]); | |
1476 /* fft_16(&a[64]); */ | |
1477 fft_asmb16_3dnow(&a[64], &a[72]); | |
1478 | |
1479 fft_8_3dnow(&a[80]); fft_8_3dnow(&a[88]); | |
1480 | |
1481 /* fft_32(&a[64]); */ | |
1482 fft_asmb_3dnow(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
1483 | |
1484 fft_8_3dnow(&a[96]); fft_4_3dnow(&a[104]), fft_4_3dnow(&a[108]); | |
1485 /* fft_16(&a[96]); */ | |
1486 fft_asmb16_3dnow(&a[96], &a[104]); | |
1487 | |
1488 fft_8_3dnow(&a[112]), fft_8_3dnow(&a[120]); | |
1489 /* fft_32(&a[96]); */ | |
1490 fft_asmb_3dnow(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
1491 | |
1492 /* fft_128(&a[0]); */ | |
1493 fft_asmb_3dnow(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
1494 } | |
1495 #endif //ARCH_X86 | |
1496 | |
1497 static void fft_asmb(int k, complex_t *x, complex_t *wTB, | |
1498 const complex_t *d, const complex_t *d_3) | |
1499 { | |
1500 register complex_t *x2k, *x3k, *x4k, *wB; | |
1501 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
1502 | |
1503 x2k = x + 2 * k; | |
1504 x3k = x2k + 2 * k; | |
1505 x4k = x3k + 2 * k; | |
1506 wB = wTB + 2 * k; | |
1507 | |
1508 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]); | |
1509 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
1510 | |
1511 --k; | |
1512 for(;;) { | |
1513 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
1514 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
1515 if (!--k) break; | |
1516 x += 2; | |
1517 x2k += 2; | |
1518 x3k += 2; | |
1519 x4k += 2; | |
1520 d += 2; | |
1521 d_3 += 2; | |
1522 wTB += 2; | |
1523 wB += 2; | |
1524 } | |
1525 | |
1526 } | |
1527 | |
1528 static void fft_asmb16(complex_t *x, complex_t *wTB) | |
1529 { | |
1530 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
1531 int k = 2; | |
1532 | |
1533 /* transform x[0], x[8], x[4], x[12] */ | |
1534 TRANSZERO(x[0],x[4],x[8],x[12]); | |
1535 | |
1536 /* transform x[1], x[9], x[5], x[13] */ | |
1537 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
1538 | |
1539 /* transform x[2], x[10], x[6], x[14] */ | |
1540 TRANSHALF_16(x[2],x[6],x[10],x[14]); | |
1541 | |
1542 /* transform x[3], x[11], x[7], x[15] */ | |
1543 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
1544 | |
1545 } | |
1546 | |
1547 static void fft_4(complex_t *x) | |
1548 { | |
1549 /* delta_p = 1 here */ | |
1550 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
1551 */ | |
1552 | |
1553 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i; | |
1554 | |
1555 yt_r = x[0].real; | |
1556 yb_r = yt_r - x[2].real; | |
1557 yt_r += x[2].real; | |
1558 | |
1559 u_r = x[1].real; | |
1560 vi_i = x[3].real - u_r; | |
1561 u_r += x[3].real; | |
1562 | |
1563 u_i = x[1].imag; | |
1564 vi_r = u_i - x[3].imag; | |
1565 u_i += x[3].imag; | |
1566 | |
1567 yt_i = yt_r; | |
1568 yt_i += u_r; | |
1569 x[0].real = yt_i; | |
1570 yt_r -= u_r; | |
1571 x[2].real = yt_r; | |
1572 yt_i = yb_r; | |
1573 yt_i += vi_r; | |
1574 x[1].real = yt_i; | |
1575 yb_r -= vi_r; | |
1576 x[3].real = yb_r; | |
1577 | |
1578 yt_i = x[0].imag; | |
1579 yb_i = yt_i - x[2].imag; | |
1580 yt_i += x[2].imag; | |
1581 | |
1582 yt_r = yt_i; | |
1583 yt_r += u_i; | |
1584 x[0].imag = yt_r; | |
1585 yt_i -= u_i; | |
1586 x[2].imag = yt_i; | |
1587 yt_r = yb_i; | |
1588 yt_r += vi_i; | |
1589 x[1].imag = yt_r; | |
1590 yb_i -= vi_i; | |
1591 x[3].imag = yb_i; | |
1592 } | |
1593 | |
1594 | |
1595 static void fft_8(complex_t *x) | |
1596 { | |
1597 /* delta_p = diag{1, sqrt(i)} here */ | |
1598 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
1599 */ | |
1600 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i; | |
1601 | |
1602 wT1_r = x[1].real; | |
1603 wT1_i = x[1].imag; | |
1604 wB1_r = x[3].real; | |
1605 wB1_i = x[3].imag; | |
1606 | |
1607 x[1] = x[2]; | |
1608 x[2] = x[4]; | |
1609 x[3] = x[6]; | |
1610 fft_4(&x[0]); | |
1611 | |
1612 | |
1613 /* x[0] x[4] */ | |
1614 wT2_r = x[5].real; | |
1615 wT2_r += x[7].real; | |
1616 wT2_r += wT1_r; | |
1617 wT2_r += wB1_r; | |
1618 wT2_i = wT2_r; | |
1619 wT2_r += x[0].real; | |
1620 wT2_i = x[0].real - wT2_i; | |
1621 x[0].real = wT2_r; | |
1622 x[4].real = wT2_i; | |
1623 | |
1624 wT2_i = x[5].imag; | |
1625 wT2_i += x[7].imag; | |
1626 wT2_i += wT1_i; | |
1627 wT2_i += wB1_i; | |
1628 wT2_r = wT2_i; | |
1629 wT2_r += x[0].imag; | |
1630 wT2_i = x[0].imag - wT2_i; | |
1631 x[0].imag = wT2_r; | |
1632 x[4].imag = wT2_i; | |
1633 | |
1634 /* x[2] x[6] */ | |
1635 wT2_r = x[5].imag; | |
1636 wT2_r -= x[7].imag; | |
1637 wT2_r += wT1_i; | |
1638 wT2_r -= wB1_i; | |
1639 wT2_i = wT2_r; | |
1640 wT2_r += x[2].real; | |
1641 wT2_i = x[2].real - wT2_i; | |
1642 x[2].real = wT2_r; | |
1643 x[6].real = wT2_i; | |
1644 | |
1645 wT2_i = x[5].real; | |
1646 wT2_i -= x[7].real; | |
1647 wT2_i += wT1_r; | |
1648 wT2_i -= wB1_r; | |
1649 wT2_r = wT2_i; | |
1650 wT2_r += x[2].imag; | |
1651 wT2_i = x[2].imag - wT2_i; | |
1652 x[2].imag = wT2_i; | |
1653 x[6].imag = wT2_r; | |
1654 | |
1655 | |
1656 /* x[1] x[5] */ | |
1657 wT2_r = wT1_r; | |
1658 wT2_r += wB1_i; | |
1659 wT2_r -= x[5].real; | |
1660 wT2_r -= x[7].imag; | |
1661 wT2_i = wT1_i; | |
1662 wT2_i -= wB1_r; | |
1663 wT2_i -= x[5].imag; | |
1664 wT2_i += x[7].real; | |
1665 | |
1666 wB2_r = wT2_r; | |
1667 wB2_r += wT2_i; | |
1668 wT2_i -= wT2_r; | |
1669 wB2_r *= HSQRT2; | |
1670 wT2_i *= HSQRT2; | |
1671 wT2_r = wB2_r; | |
1672 wB2_r += x[1].real; | |
1673 wT2_r = x[1].real - wT2_r; | |
1674 | |
1675 wB2_i = x[5].real; | |
1676 x[1].real = wB2_r; | |
1677 x[5].real = wT2_r; | |
1678 | |
1679 wT2_r = wT2_i; | |
1680 wT2_r += x[1].imag; | |
1681 wT2_i = x[1].imag - wT2_i; | |
1682 wB2_r = x[5].imag; | |
1683 x[1].imag = wT2_r; | |
1684 x[5].imag = wT2_i; | |
1685 | |
1686 /* x[3] x[7] */ | |
1687 wT1_r -= wB1_i; | |
1688 wT1_i += wB1_r; | |
1689 wB1_r = wB2_i - x[7].imag; | |
1690 wB1_i = wB2_r + x[7].real; | |
1691 wT1_r -= wB1_r; | |
1692 wT1_i -= wB1_i; | |
1693 wB1_r = wT1_r + wT1_i; | |
1694 wB1_r *= HSQRT2; | |
1695 wT1_i -= wT1_r; | |
1696 wT1_i *= HSQRT2; | |
1697 wB2_r = x[3].real; | |
1698 wB2_i = wB2_r + wT1_i; | |
1699 wB2_r -= wT1_i; | |
1700 x[3].real = wB2_i; | |
1701 x[7].real = wB2_r; | |
1702 wB2_i = x[3].imag; | |
1703 wB2_r = wB2_i + wB1_r; | |
1704 wB2_i -= wB1_r; | |
1705 x[3].imag = wB2_i; | |
1706 x[7].imag = wB2_r; | |
1707 } | |
1708 | |
1709 | |
1710 static void fft_128p(complex_t *a) | |
1711 { | |
1712 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]); | |
1713 fft_asmb16(&a[0], &a[8]); | |
1714 | |
1715 fft_8(&a[16]), fft_8(&a[24]); | |
1716 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
1717 | |
1718 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]); | |
1719 fft_asmb16(&a[32], &a[40]); | |
1720 | |
1721 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]); | |
1722 fft_asmb16(&a[48], &a[56]); | |
1723 | |
1724 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
1725 | |
1726 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]); | |
1727 /* fft_16(&a[64]); */ | |
1728 fft_asmb16(&a[64], &a[72]); | |
1729 | |
1730 fft_8(&a[80]); fft_8(&a[88]); | |
1731 | |
1732 /* fft_32(&a[64]); */ | |
1733 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
1734 | |
1735 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]); | |
1736 /* fft_16(&a[96]); */ | |
1737 fft_asmb16(&a[96], &a[104]); | |
1738 | |
1739 fft_8(&a[112]), fft_8(&a[120]); | |
1740 /* fft_32(&a[96]); */ | |
1741 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
1742 | |
1743 /* fft_128(&a[0]); */ | |
1744 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
1745 } | |
1746 | |
1747 | |
1748 |